scala.collection.Map Scala Examples
The following examples show how to use scala.collection.Map.
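Before the project examples, a quick orientation: scala.collection.Map is the read-only supertype shared by both the mutable and the immutable map hierarchies, so importing it lets a method accept either kind of map when it only needs to read entries. The short sketch below is illustrative only; the method and value names are not taken from any of the projects that follow.

import scala.collection.Map

// Accepts any map, mutable or immutable, because scala.collection.Map
// sits above both hierarchies and exposes only read operations.
def describe(counts: Map[String, Int]): String =
  counts.map { case (k, v) => s"$k=$v" }.mkString(", ")

val immutableCounts = scala.collection.immutable.Map("a" -> 1, "b" -> 2)
val mutableCounts   = scala.collection.mutable.Map("c" -> 3)

describe(immutableCounts) // "a=1, b=2"
describe(mutableCounts)   // "c=3"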
Example 1
Source File: CommandUtils.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 2
Source File: GroupedCountEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 3
Source File: WordpieceTokenized.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}

import scala.collection.Map

object WordpieceTokenized extends Annotated[WordpieceTokenizedSentence] {

  override def annotatorType: String = AnnotatorType.WORDPIECE

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .toArray

    SentenceSplit.unpack(annotations).map(sentence => {
      tokens.filter(token =>
        token.begin >= sentence.start & token.end <= sentence.end
      ).map(token =>
        TokenPiece(wordpiece = token.result,
          token = token.metadata("token"),
          pieceId = token.metadata("pieceId").toInt,
          isWordStart = token.metadata("isWordStart").toBoolean,
          begin = token.begin,
          end = token.end
        )
      )
    }).filter(_.nonEmpty).map(tokens => WordpieceTokenizedSentence(tokens))
  }

  override def pack(sentences: Seq[WordpieceTokenizedSentence]): Seq[Annotation] = {
    var sentenceIndex = 0
    sentences.flatMap{sentence =>
      sentenceIndex += 1
      sentence.tokens.map{token =>
        Annotation(annotatorType, token.begin, token.end, token.wordpiece,
          Map("sentence" -> sentenceIndex.toString,
            "isWordStart" -> token.isWordStart.toString,
            "pieceId" -> token.pieceId.toString,
            "token" -> token.token)
        )
      }}
  }
}

case class WordpieceTokenizedSentence(tokens: Array[TokenPiece])

case class TokenPiece(wordpiece: String, token: String, pieceId: Int, isWordStart: Boolean, begin: Int, end: Int)
Example 4
Source File: NerConverter.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable}
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.Map

  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> true
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)
    val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT)
    val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) =>
      NerTagsEncoding.fromIOB(sentence, doc._1, sentenceIndex=doc._2, $(preservePosition))
    }

    entities.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity))).
      zipWithIndex.map{case (entity, idx) =>
        Annotation(
          outputAnnotatorType,
          entity.start,
          entity.end,
          entity.text,
          Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
        )
      }
  }

}

object NerConverter extends ParamsAndFeaturesReadable[NerConverter]
Example 5
Source File: DeltaTableOperations.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._

trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
}
Example 6
Source File: ElasticsearchConnector.scala From hail with MIT License | 5 votes |
package is.hail.io

import org.apache.spark
import org.elasticsearch.spark.sql._

import scala.collection.JavaConverters._
import scala.collection.Map

object ElasticsearchConnector {

  def export(
    df: spark.sql.DataFrame, host: String, port: Int,
    index: String, indexType: String, blockSize: Int,
    config: java.util.HashMap[String, String], verbose: Boolean) {
    export(df, host, port, index, indexType, blockSize,
      Option(config).map(_.asScala.toMap).getOrElse(Map.empty[String, String]), verbose)
  }

  def export(df: spark.sql.DataFrame,
    host: String = "localhost", port: Int = 9200,
    index: String, indexType: String,
    blockSize: Int = 1000,
    config: Map[String, String], verbose: Boolean = true) {

    // config docs: https://www.elastic.co/guide/en/elasticsearch/hadoop/master/configuration.html
    val defaultConfig = Map(
      "es.nodes" -> host,
      "es.port" -> port.toString,
      "es.batch.size.entries" -> blockSize.toString,
      "es.index.auto.create" -> "true")

    val mergedConfig = if (config == null)
      defaultConfig
    else
      defaultConfig ++ config

    if (verbose)
      println(s"Config ${ mergedConfig }")

    df.saveToEs(s"${ index }/${ indexType }", mergedConfig)
  }
}
Example 7
Source File: CommandUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 8
Source File: GroupedCountEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 9
Source File: JacksonGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.json

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {

  def apply(rowSchema: StructType, gen: JsonGenerator)(row: Row): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v: String) => gen.writeString(v)
      case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v) => gen.writeString(v.toString)
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, udt.serialize(v))

      case (ArrayType(ty, _), v: Seq[_]) =>
        gen.writeStartArray()
        v.foreach(valWriter(ty, _))
        gen.writeEndArray()

      case (MapType(kv, vv, _), v: Map[_, _]) =>
        gen.writeStartObject()
        v.foreach { p =>
          gen.writeFieldName(p._1.toString)
          valWriter(vv, p._2)
        }
        gen.writeEndObject()

      case (StructType(ty), v: Row) =>
        gen.writeStartObject()
        ty.zip(v.toSeq).foreach {
          case (_, null) =>
          case (field, v) =>
            gen.writeFieldName(field.name)
            valWriter(field.dataType, v)
        }
        gen.writeEndObject()
    }

    valWriter(rowSchema, row)
  }
}
Example 10
Source File: CommandUtils.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}
import java.lang.System._

import scala.collection.JavaConversions._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 11
Source File: GroupedSumEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
}
Example 12
Source File: GroupedCountEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 13
Source File: GroupedMeanEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 14
Source File: JsonUtils.scala From OUTDATED_ledger-wallet-android with MIT License | 5 votes |
package co.ledger.wallet.core.utils

import org.json.{JSONArray, JSONObject}

import scala.collection.Map

trait JsonUtils {

  implicit def Map2JsonObject[T](map: Map[String, T]): JSONObject = {
    val json = new JSONObject()
    map foreach {case (key, value) =>
      value match {
        case string: String => json.put(key, string)
        case double: Double => json.put(key, double)
        case float: Float => json.put(key, float)
        case boolean: Boolean => json.put(key, boolean)
        case jsonObject: JSONObject => json.put(key, jsonObject)
        case jsonArray: JSONArray => json.put(key, jsonArray)
        case map: Map[_, _] => json.put(key, Map2JsonObject(map.asInstanceOf[Map[String, _]]))
        case array: Array[_] => json.put(key, Array2JsonArray(array))
        case _ => json.put(key, value.toString)
      }
    }
    json
  }

  implicit def Array2JsonArray[T](array: Array[T]): JSONArray = {
    val json = new JSONArray()
    array foreach {
      case string: String => json.put(string)
      case double: Double => json.put(double)
      case float: Float => json.put(float)
      case boolean: Boolean => json.put(boolean)
      case jsonObject: JSONObject => json.put(jsonObject)
      case jsonArray: JSONArray => json.put(jsonArray)
      case map: Map[_, _] => json.put(Map2JsonObject(map.asInstanceOf[Map[String, _]]))
      case array: Array[AnyRef] => json.put(Array2JsonArray(array))
      case value => json.put(value.toString)
    }
    json
  }

  implicit class JsonStringContext(val c: StringContext) {
    def json(args: Any*): JSONObject = {
      val strings = c.parts.iterator
      val arguments = args.iterator
      val string = new StringBuffer(strings.next())
      while (strings.hasNext) {
        arguments.next() match {
          case charSequence: CharSequence => string.append("\"" + charSequence.toString + "\"")
          case arg => string.append(arg.toString)
        }
        string.append(strings.next())
      }
      new JSONObject(string.toString)
    }
  }
}

object JsonUtils extends JsonUtils
Example 15
Source File: TaskResult.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 16
Source File: GroupedSumEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
}
Example 17
Source File: SentenceWithEmbeddings.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}

import scala.collection.Map

case class WordpieceEmbeddingsSentence
(
  tokens: Array[TokenPieceEmbeddings],
  sentenceId: Int
)

case class TokenPieceEmbeddings(wordpiece: String, token: String, pieceId: Int,
                                isWordStart: Boolean,
                                isOOV: Boolean,
                                embeddings: Array[Float],
                                begin: Int, end: Int)

object TokenPieceEmbeddings {
  def apply(piece: TokenPiece, embeddings: Array[Float]): TokenPieceEmbeddings = {
    TokenPieceEmbeddings(
      wordpiece = piece.wordpiece,
      token = piece.token,
      pieceId = piece.pieceId,
      isWordStart = piece.isWordStart,
      isOOV = false, // FIXME: I think BERT wont have OOV, this "constructor" is called from TensorFlowBert
      embeddings = embeddings,
      begin = piece.begin,
      end = piece.end)
  }

  def apply(wordpiece: String, token: String, pieceId: Int, isWordStart: Boolean,
            embeddings: Option[Array[Float]], zeroArray: Array[Float], begin: Int, end: Int): TokenPieceEmbeddings = {
    val vector = embeddings.getOrElse(zeroArray)
    val oov = embeddings match { case Some(_) => false; case default => true; }
    TokenPieceEmbeddings(
      wordpiece = wordpiece,
      token = token,
      pieceId = pieceId,
      isWordStart = isWordStart,
      isOOV = oov,
      embeddings = vector,
      begin = begin,
      end = end)
  }
}

object WordpieceEmbeddingsSentence extends Annotated[WordpieceEmbeddingsSentence] {
  override def annotatorType: String = AnnotatorType.WORD_EMBEDDINGS

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceEmbeddingsSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .groupBy(_.metadata("sentence").toInt)

    tokens.map{case (idx: Int, sentenceTokens: Seq[Annotation]) =>
      val tokensWithSentence = sentenceTokens.map { token =>
        new TokenPieceEmbeddings(
          wordpiece = token.result,
          token = token.metadata("token"),
          pieceId = token.metadata("pieceId").toInt,
          isWordStart = token.metadata("isWordStart").toBoolean,
          isOOV = token.metadata.getOrElse("isOOV", "false").toBoolean,
          embeddings = token.embeddings,
          begin = token.begin,
          end = token.end
        )
      }.toArray

      WordpieceEmbeddingsSentence(tokensWithSentence, idx)
    }.toSeq.sortBy(_.sentenceId)
  }

  override def pack(sentences: Seq[WordpieceEmbeddingsSentence]): Seq[Annotation] = {
    sentences.flatMap{sentence =>
      var isFirstToken = true
      sentence.tokens.map{ token =>
        // Store embeddings for token
        val embeddings = token.embeddings
        isFirstToken = false
        Annotation(annotatorType, token.begin, token.end, token.token,
          Map("sentence" -> sentence.sentenceId.toString,
            "token" -> token.token,
            "pieceId" -> token.pieceId.toString,
            "isWordStart" -> token.isWordStart.toString,
            "isOOV" -> token.isOOV.toString
          ),
          embeddings
        )
      }
    }
  }
}
Example 18
Source File: GroupedMeanEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 19
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

}
Example 20
Source File: CommandUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 21
Source File: GroupedCountEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 22
Source File: JacksonGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils}

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {

  def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v) => gen.writeString(v.toString)
      case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString)
      // For UDT values, they should be in the SQL type's corresponding value type.
      // We should not see values in the user-defined class at here.
      // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is
      // an ArrayData at here, instead of a Vector.
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v)

      case (ArrayType(ty, _), v: ArrayData) =>
        gen.writeStartArray()
        v.foreach(ty, (_, value) => valWriter(ty, value))
        gen.writeEndArray()

      case (MapType(kt, vt, _), v: MapData) =>
        gen.writeStartObject()
        v.foreach(kt, vt, { (k, v) =>
          gen.writeFieldName(k.toString)
          valWriter(vt, v)
        })
        gen.writeEndObject()

      case (StructType(ty), v: InternalRow) =>
        gen.writeStartObject()
        var i = 0
        while (i < ty.length) {
          val field = ty(i)
          val value = v.get(i, field.dataType)
          if (value != null) {
            gen.writeFieldName(field.name)
            valWriter(field.dataType, value)
          }
          i += 1
        }
        gen.writeEndObject()

      case (dt, v) =>
        sys.error(
          s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.")
    }

    valWriter(rowSchema, row)
  }
}
Example 23
Source File: CommandUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 24
Source File: TaskResult.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 25
Source File: GroupedSumEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 26
Source File: GroupedCountEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 27
Source File: GroupedMeanEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 28
Source File: Settings.scala From neuroflow with Apache License 2.0 | 5 votes |
package neuroflow.core

import neuroflow.core.Network.LearningRate

import scala.collection.{Map, Set}

case class Settings[V]
  (verbose           :  Boolean                      =  true,
   learningRate      :  LearningRate[V]              =  { case (i, α) => α }: LearningRate[V],
   updateRule        :  Update[V]                    =  Vanilla[V](),
   precision         :  Double                       =  1E-3,
   iterations        :  Int                          =  Int.MaxValue,
   prettyPrint       :  Boolean                      =  false,
   batchSize         :  Option[Int]                  =  None,
   gcThreshold       :  Option[Long]                 =  None,
   lossFuncOutput    :  Option[LossFuncOutput]       =  None,
   waypoint          :  Option[Waypoint[V]]          =  None,
   approximation     :  Option[Approximation[V]]     =  None,
   regularization    :  Option[Regularization]       =  None,
   partitions        :  Option[Set[Int]]             =  None,
   specifics         :  Option[Map[String, V]]       =  None) extends Serializable
Example 29
Source File: PSVector.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.models

import java.util.concurrent.Future

import scala.collection.Map

import org.apache.spark.SparkException

import com.tencent.angel.ml.math2.vector.Vector
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.psf.get.base.{GetFunc, GetResult}
import com.tencent.angel.ml.matrix.psf.update.base.{UpdateFunc, VoidResult}
import com.tencent.angel.sona.context.PSContext

  def longKeySparse(dim: Long,
                    maxRange: Long,
                    capacity: Int = 20,
                    rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
                    additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dim, capacity, maxRange, rowType, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int, range: Long, rowType: RowType,
             additionalConfiguration: Map[String, String]): PSVector = {
    PSContext.instance().createVector(dimension, rowType, capacity, range, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int = 20,
             rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
             additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dimension, capacity, dimension, rowType, additionalConfiguration)
  }

}
Example 30
Source File: ModelWithDescriptor.scala From kafka-with-akka-streams-kafka-streams-tutorial with Apache License 2.0 | 5 votes |
package com.lightbend.scala.modelServer.model

import java.io.{DataInputStream, DataOutputStream}

import com.lightbend.model.modeldescriptor.ModelDescriptor

import scala.collection.Map

import com.lightbend.scala.modelServer.model.PMML.PMMLModel
import com.lightbend.scala.modelServer.model.tensorflow.TensorFlowModel

import scala.util.Try

case class ModelWithDescriptor(model: Model, descriptor: ModelToServe){}

object ModelWithDescriptor {

  private val factories = Map(
    ModelDescriptor.ModelType.PMML.name -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.name -> TensorFlowModel
  )

  private val factoriesInt = Map(
    ModelDescriptor.ModelType.PMML.index -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.index -> TensorFlowModel
  )

  def fromModelToServe(descriptor : ModelToServe): Try[ModelWithDescriptor] = Try{
    println(s"New model - $descriptor")
    factories.get(descriptor.modelType.name) match {
      case Some(factory) => ModelWithDescriptor(factory.create(descriptor),descriptor)
      case _ => throw new Throwable("Undefined model type")
    }
  }

  def readModel(input : DataInputStream) : Option[Model] = {
    input.readLong.toInt match{
      case length if length > 0 =>
        val `type` = input.readLong.toInt
        val bytes = new Array[Byte](length)
        input.read(bytes)
        factoriesInt.get(`type`) match {
          case Some(factory) => try {
            Some(factory.restore(bytes))
          } catch {
            case t: Throwable =>
              System.out.println("Error Deserializing model")
              t.printStackTrace()
              None
          }
          case _ => None
        }
      case _ => None
    }
  }

  def writeModel(output : DataOutputStream, model: Model) : Unit = {
    if(model == null)
      output.writeLong(0l)
    else {
      try {
        val bytes = model.toBytes
        output.writeLong(bytes.length)
        output.writeLong(model.getType)
        output.write(bytes)
      } catch {
        case t: Throwable =>
          System.out.println("Error Serializing model")
          t.printStackTrace()
      }
    }
  }
}
Example 31
Source File: SpecHelper.scala From peregrine with Apache License 2.0 | 5 votes |
package io.peregrine

import com.twitter.finagle.http.{Request => FinagleRequest, Response => FinagleResponse}
import com.twitter.util.{Await, Future}
import org.jboss.netty.handler.codec.http.HttpMethod
import org.jboss.netty.util.CharsetUtil.UTF_8

import scala.collection.Map

class MockResponse(val originalResponse: FinagleResponse) {

  def status                  = originalResponse.getStatus()
  def code                    = originalResponse.getStatus().getCode
  def body                    = originalResponse.getContent().toString(UTF_8)
  def getHeader(name: String) = originalResponse.headers().get(name)
  def getHeaders              = originalResponse.headerMap
}

trait SpecHelper {

  def response = new MockResponse(Await.result(lastResponse))

  var lastResponse: Future[FinagleResponse] = null

  def server: PeregrineServer

  def get(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.GET,path,params,headers)
  }

  def post(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.POST,path,params,headers,body)
  }

  def put(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.PUT,path,params,headers,body)
  }

  def delete(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.DELETE,path,params,headers)
  }

  def head(path:String,params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.HEAD,path,params,headers)
  }

  def patch(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.PATCH,path,params,headers)
  }

  def options(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.OPTIONS,path,params,headers,body)
  }

  def send(request: FinagleRequest) {
    executeRequest(request)
  }

  private def executeRequest(
    method: HttpMethod,
    path: String,
    params: Map[String, String] = Map(),
    headers: Map[String,String] = Map(),
    body: AnyRef = null
  ) {
    val app = MockApp(server)
    val result: MockResult = app.execute(method = method, path = path, params = params, headers = headers, body = body)
    lastResponse = result.response
  }

  private def executeRequest(request: FinagleRequest) {
    val app = MockApp(server)
    val result: MockResult = app.execute(request)
    lastResponse = result.response
  }
}
Example 32
Source File: GenerateDataFeaturesFile.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples

import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object GenerateDataFeaturesFile{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val sc = Util.sc
    // we take the raw data in CSV format and convert it into a set of records
    // of the form (user, product, price)
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))
    print("Mapping of second categorical feature column: " + get_mapping(records, 3))

    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => Util.extractLabel(r) + "," + Util.extractSumFeature(r, catLen, mappings))
    }

    val data_collection = data.collect()
    val d_iterator = data_collection.iterator
    while(d_iterator.hasNext) {
      val x = d_iterator.next
      println(x)
    }
    val first_point = data.first()

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    data.saveAsTextFile("./output/x_features" + date + ".csv")

    sc.stop()
  }
}
Example 33
Source File: GroupedCountEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 34
Source File: TopElementsAggregator.scala From salt-core with Apache License 2.0 | 5 votes |
package software.uncharted.salt.core.analytic.collection

import software.uncharted.salt.core.analytic.Aggregator

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.{Map => MutableMap}
import scala.collection.mutable.PriorityQueue
import scala.reflect.ClassTag

class TopElementsAggregator[ET: ClassTag](elementLimit: Int)
  extends Aggregator[Seq[ET], Map[ET, Int], List[(ET, Int)]] {

  def default(): Map[ET, Int] = {
    Map[ET, Int]()
  }

  override def add(current: Map[ET, Int], next: Option[Seq[ET]]): Map[ET, Int] = {
    if (next.isDefined) {
      // If our current map is mutable, add new data in directly.
      // If not, convert to a mutable map, and then add data in
      val sum = current match {
        case hm: MutableMap[ET, Int] => hm
        case _ => {
          // The current value isn't itself a mutable hashmap yet; convert to one.
          val hm = new HashMap[ET, Int]()
          hm ++= current
          hm
        }
      }
      next.get.foreach(t => sum.put(t, sum.getOrElse(t, 0) + 1))
      sum
    } else {
      current
    }
  }

  override def merge(left: Map[ET, Int], right: Map[ET, Int]): Map[ET, Int] = {
    // If either input map is mutable, merge the other into it.
    // If neither is, convert one to mutable, and add the other into it.
    val (to, from) = left match {
      case hm: MutableMap[ET, Int] => (hm, right)
      case _ =>
        right match {
          case hm: MutableMap[ET, Int] => (hm, left)
          case _ =>
            val hm = new HashMap[ET, Int]()
            hm ++= left
            (hm, right)
        }
    }
    from.foreach(t => {
      to.put(t._1, to.getOrElse(t._1, 0) + t._2)
    })
    to
  }

  override def finish(intermediate: Map[ET, Int]): List[(ET, Int)] = {
    val x = new PriorityQueue[(ET, Int)]()(Ordering.by( a => a._2 ))
    intermediate.foreach(t => {
      x.enqueue(t)
    })
    var result = new ListBuffer[(ET, Int)]
    for (i <- 0 until Math.min(elementLimit, x.size)) {
      result.append(x.dequeue)
    }
    result.toList
  }
}
Example 35
Source File: TypeRewriter.scala From Converter with GNU General Public License v3.0 | 5 votes |
package org.scalablytyped.converter.internal
package ts
package transforms

import scala.collection.Map

class TypeRewriter(base: TsTree) extends TreeTransformation[Map[TsType, TsType]] {
  override def leaveTsType(replacements: Map[TsType, TsType])(x: TsType): TsType =
    replacements.getOrElse(x, x)

  override def withTree(t: Map[TsType, TsType], tree: TsTree): Map[TsType, TsType] =
    if (tree === base) t
    else
      tree match {
        case HasTParams(tparams) =>
          t.filterKeys {
            case TsTypeRef(_, TsQIdent(IArray.exactlyOne(one: TsIdentSimple)), _)
                if tparams.exists(_.name === one) =>
              false
            case _ => true
          }
        case _ => t
      }
}
Example 36
Source File: PairSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax

import com.danielwestheide.kontextfrei.DCollectionOps
import org.apache.spark.Partitioner

import scala.collection.Map
import scala.reflect.ClassTag

class PairSyntax[DCollection[_], A: ClassTag, B: ClassTag](
    val self: DCollectionOps[DCollection],
    val coll: DCollection[(A, B)]) {

  final def keys: DCollection[A] = self.keys(coll)

  final def values: DCollection[B] = self.values(coll)

  final def cogroup[C: ClassTag](other: DCollection[(A, C)])
    : DCollection[(A, (Iterable[B], Iterable[C]))] =
    self.cogroup(coll)(other)

  final def leftOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (B, Option[C]))] =
    self.leftOuterJoin(coll)(other)

  final def rightOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], C))] =
    self.rightOuterJoin(coll)(other)

  final def fullOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], Option[C]))] =
    self.fullOuterJoin(coll)(other)

  final def mapValues[C: ClassTag](f: B => C): DCollection[(A, C)] =
    self.mapValues(coll)(f)

  final def flatMapValues[C: ClassTag](
      f: B => TraversableOnce[C]): DCollection[(A, C)] =
    self.flatMapValues(coll)(f)

  final def reduceByKey(f: (B, B) => B): DCollection[(A, B)] =
    self.reduceByKey(coll)(f)

  final def foldByKey(zeroValue: B)(f: (B, B) => B): DCollection[(A, B)] =
    self.foldByKey(coll)(zeroValue, f)

  final def aggregateByKey[C: ClassTag](zeroValue: C)(
      seqOp: (C, B) => C,
      combOp: (C, C) => C): DCollection[(A, C)] =
    self.aggregateByKey(coll)(zeroValue)(seqOp, combOp)

  final def combineByKey[C: ClassTag](
      createCombiner: B => C,
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): DCollection[(A, C)] =
    self.combineByKey(coll)(createCombiner)(mergeValue, mergeCombiners)

  final def countByKey(): Map[A, Long] = self.countByKey(coll)

  final def collectAsMap(): Map[A, B] = self.collectAsMap(coll)

  final def partitionBy(partitioner: Partitioner): DCollection[(A, B)] =
    self.partitionBy(coll)(partitioner)
}
Example 37
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
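On real RDDs the same two calls are the ones that leave the distributed world: countByKey and collectAsMap both hand back a scala.collection.Map on the driver. A minimal local-mode sketch, assuming Spark is on the classpath:

import org.apache.spark.sql.SparkSession
import scala.collection.Map

object RddPairSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pair-sketch").getOrCreate()
    val rdd = spark.sparkContext.parallelize(Seq("a" -> 1, "b" -> 2, "a" -> 3))

    val counts: Map[String, Long] = rdd.countByKey()   // driver-side Map, so keep keys low-cardinality
    val asMap: Map[String, Int]   = rdd.collectAsMap() // one arbitrary value kept per duplicate key

    println(counts)
    println(asMap)
    spark.stop()
  }
}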
Example 38
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
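The per-partition iterator above wraps one XMLStreamWriter around many rows so the root element is opened and closed only once. A stripped-down sketch of that writer usage with the plain javax.xml.stream API follows; there is no Spark, no indenting wrapper, and the element names are hypothetical.

import java.io.StringWriter
import javax.xml.stream.XMLOutputFactory

object StaxSketch {
  def main(args: Array[String]): Unit = {
    val writer = new StringWriter()
    val xml = XMLOutputFactory.newInstance().createXMLStreamWriter(writer)

    xml.writeStartElement("ROWS")          // root element, written once
    Seq("a", "b").foreach { value =>
      xml.writeStartElement("ROW")         // one element per row
      xml.writeCharacters(value)
      xml.writeEndElement()
    }
    xml.writeEndElement()                  // close the root
    xml.flush()

    println(writer.toString) // <ROWS><ROW>a</ROW><ROW>b</ROW></ROWS>
  }
}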
Example 39
Source File: MapSerializerModule.scala From mango with Apache License 2.0 | 5 votes |
package com.kakao.shaded.jackson.module.scala.ser import com.kakao.shaded.jackson.databind.`type`.{TypeFactory, MapType, MapLikeType} import com.kakao.shaded.jackson.databind.jsontype.TypeSerializer import com.kakao.shaded.jackson.databind.ser.Serializers import com.kakao.shaded.jackson.databind.ser.std.StdDelegatingSerializer import com.kakao.shaded.jackson.databind.util.StdConverter import com.kakao.shaded.jackson.databind._ import com.kakao.shaded.jackson.module.scala.modifiers.MapTypeModifierModule import scala.collection.JavaConverters._ import scala.collection.Map private class MapConverter(inputType: JavaType, config: SerializationConfig) extends StdConverter[Map[_,_],java.util.Map[_,_]] { def convert(value: Map[_,_]): java.util.Map[_,_] = { val m = if (config.isEnabled(SerializationFeature.WRITE_NULL_MAP_VALUES)) { value } else { value.filter(_._2 != None) } m.asJava } override def getInputType(factory: TypeFactory) = inputType override def getOutputType(factory: TypeFactory) = factory.constructMapType(classOf[java.util.Map[_,_]], inputType.getKeyType, inputType.getContentType) .withTypeHandler(inputType.getTypeHandler) .withValueHandler(inputType.getValueHandler) } private object MapSerializerResolver extends Serializers.Base { val BASE = classOf[collection.Map[_,_]] override def findMapLikeSerializer(config: SerializationConfig, mapLikeType : MapLikeType, beanDesc: BeanDescription, keySerializer: JsonSerializer[AnyRef], elementTypeSerializer: TypeSerializer, elementValueSerializer: JsonSerializer[AnyRef]): JsonSerializer[_] = { val rawClass = mapLikeType.getRawClass if (!BASE.isAssignableFrom(rawClass)) null else new StdDelegatingSerializer(new MapConverter(mapLikeType, config)) } } trait MapSerializerModule extends MapTypeModifierModule { this += MapSerializerResolver }
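The converter's job is small: optionally drop None values and expose the result as a java.util.Map so Jackson's standard map serializer can take over. A self-contained sketch of just that conversion step, using plain JavaConverters and no shaded Jackson types:

import scala.collection.JavaConverters._
import scala.collection.Map

object MapConvertSketch {
  // Mirrors the convert method above: keep None entries only when "write null map values" is on.
  def toJava(value: Map[String, Any], writeNulls: Boolean): java.util.Map[String, Any] = {
    val filtered = if (writeNulls) value else value.filter(_._2 != None)
    filtered.asJava
  }

  def main(args: Array[String]): Unit =
    println(toJava(Map("a" -> 1, "b" -> None), writeNulls = false)) // {a=1}
}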
Example 40
Source File: TruckAndTrafficJoinBolt.scala From trucking-iot with Apache License 2.0 | 5 votes |
package com.orendainx.trucking.storm.bolts import java.util import com.orendainx.trucking.commons.models.{EnrichedTruckAndTrafficData, EnrichedTruckData, TrafficData} import com.typesafe.scalalogging.Logger import org.apache.storm.task.{OutputCollector, TopologyContext} import org.apache.storm.topology.OutputFieldsDeclarer import org.apache.storm.topology.base.BaseWindowedBolt import org.apache.storm.tuple.{Fields, Values} import org.apache.storm.windowing.TupleWindow import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.collection.{Map, mutable} import scala.language.implicitConversions private def processAndEmitData(truckDataPerRoute: Map[Int, ListBuffer[EnrichedTruckData]], trafficDataPerRoute: Map[Int, ListBuffer[TrafficData]]) { // For each EnrichedTruckData object, find the TrafficData object with the closest timestamp truckDataPerRoute.foreach { case (routeId, truckDataList) => trafficDataPerRoute.get(routeId) match { case None => // No traffic data for this routeId, so drop/ignore truck data case Some(trafficDataList) => truckDataList foreach { truckData => trafficDataList.sortBy(data => math.abs(data.eventTime - truckData.eventTime)).headOption match { case None => // Window didn't capture any traffic data for this truck's route case Some(trafficData) => val joinedData = EnrichedTruckAndTrafficData(truckData.eventTime, truckData.truckId, truckData.driverId, truckData.driverName, truckData.routeId, truckData.routeName, truckData.latitude, truckData.longitude, truckData.speed, truckData.eventType, truckData.foggy, truckData.rainy, truckData.windy, trafficData.congestionLevel) outputCollector.emit(new Values("EnrichedTruckAndTrafficData", joinedData)) } } } } } override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("dataType", "data")) }
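processAndEmitData relies on members of the enclosing bolt (notably outputCollector) that this excerpt does not show; the core of the method is a per-route, closest-timestamp pairing of two Maps. A self-contained sketch of that pairing with hypothetical minimal case classes:

import scala.collection.Map

object ClosestJoinSketch {
  final case class Truck(routeId: Int, eventTime: Long)
  final case class Traffic(routeId: Int, eventTime: Long, congestionLevel: Int)

  // For every truck event, pick the traffic event on the same route with the nearest timestamp.
  def join(trucks: Map[Int, Seq[Truck]], traffic: Map[Int, Seq[Traffic]]): Seq[(Truck, Traffic)] =
    trucks.toSeq.flatMap { case (routeId, truckList) =>
      traffic.get(routeId).toSeq.flatMap { trafficList =>
        truckList.flatMap { t =>
          trafficList.sortBy(d => math.abs(d.eventTime - t.eventTime)).headOption.map(best => (t, best))
        }
      }
    }

  def main(args: Array[String]): Unit = {
    val trucks  = Map(1 -> Seq(Truck(1, 100L)))
    val traffic = Map(1 -> Seq(Traffic(1, 90L, 3), Traffic(1, 300L, 5)))
    join(trucks, traffic).foreach(println) // (Truck(1,100),Traffic(1,90,3))
  }
}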
Example 41
Source File: LinearRegression.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegression{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD") } }
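The Util.get_mapping and Util.extractFeatures helpers used above are not shown on this page; the pattern is to build a Map[String, Long] from each categorical column's distinct values to indices, then one-hot encode with it. The following is an assumed, self-contained reconstruction of that idea on plain collections, not the book's actual Util code.

import scala.collection.Map

object CategoricalMappingSketch {
  // Distinct values of one column mapped to stable indices (the shape get_mapping returns).
  def getMapping(rows: Seq[Array[String]], idx: Int): Map[String, Long] =
    rows.map(_(idx)).distinct.zipWithIndex.map { case (v, i) => v -> i.toLong }.toMap

  // One-hot encode a single column value using its mapping.
  def oneHot(value: String, mapping: Map[String, Long]): Array[Double] = {
    val vec = Array.fill(mapping.size)(0.0)
    vec(mapping(value).toInt) = 1.0
    vec
  }

  def main(args: Array[String]): Unit = {
    val rows = Seq(Array("1", "spring"), Array("2", "summer"), Array("3", "spring"))
    val m = getMapping(rows, 1)
    println(m)                          // Map(spring -> 0, summer -> 1)
    println(oneHot("summer", m).toList) // List(0.0, 1.0)
  }
}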
Example 42
Source File: LinearRegressionWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithLog{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 //val step = 0.2 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features)))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = false if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD Log") } }
Example 43
Source File: DecisionTreeUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], categoricalFeaturesInfo: scala.Predef.Map[Int, Int], maxDepth :Int, maxBins: Int): Double = { val impurity = "variance" val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo, impurity,maxDepth, maxBins ) val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
Example 44
Source File: DecisionTreeCategoricalFeaturesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeCategoricalFeaturesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val save = true //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) val first = records.first() println(numData.toInt) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen println("Feature vector length for categorical features:"+ catLen) println("Feature vector length for numerical features:" + numLen) println("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val first_point = data_dt.first() println("Decision Tree feature vector:" + first_point.features.toString) println("Decision Tree feature vector length: " + first_point.features.size) def getCatFeatures(): scala.Predef.Map[Int, Int] = { var d = scala.Predef.Map[Int, Int]() for(a <- 2 until 10){ d += (a-2 -> (get_mapping(records, a).size + 1)) //d.put(a-2,get_mapping(records, a).size + 1) } return d } val cat_features = getCatFeatures() //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)]) //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val decisionTreeModel= DecisionTree.trainRegressor(data_dt, cat_features, impurity, maxDepth, maxBins) //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo, // impurity, maxDepth, maxBins ) val preds = decisionTreeModel.predict(data_dt.map( p=> p.features)) val actual = data.map( p=> p.label) val true_vs_predicted_dt = actual.zip(preds) val true_vs_predicted_csv = data.map(p => p.label + " ," + decisionTreeModel.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) if (save){ true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv") } print("Decision Tree depth: " + decisionTreeModel.depth) print("Decision Tree number of nodes: " + decisionTreeModel.numNodes) Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features") } }
Example 45
Source File: DecisionTreeWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeWithLog{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val save = false val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) val first = records.first() println(numData.toInt) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen println("Feature vector length for categorical features:"+ catLen) println("Feature vector length for numerical features:" + numLen) println("Total feature vector length: " + totalLen) val data_dt = { records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r))) } val first_point = data_dt.first() println("Decision Tree feature vector:" + first_point.features.toString) println("Decision Tree feature vector length: " + first_point.features.size) val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo, impurity, maxDepth, maxBins ) val preds = decisionTreeModel.predict(data_dt.map( p=> p.features)) val preds_2 = preds.map(p=> Math.exp(p)) val actual = data_dt.map( p=> Math.exp(p.label)) val true_vs_predicted_dt = actual.zip(preds) if(save){ val true_vs_predicted_csv = data_dt.map(p => p.label + " ," + decisionTreeModel.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv") } print("Decision Tree depth: " + decisionTreeModel.depth) print("Decision Tree number of nodes: " + decisionTreeModel.numNodes) Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log") Util.sc.stop() } }
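One detail worth flagging in the log-target variant above: preds_2 (the exponentiated predictions) is computed but actual.zip(preds) then pairs the exponentiated labels with the raw log-space predictions, so the metrics mix scales; this looks like a slip rather than intent, though only the original authors can confirm. A tiny Spark-free sketch of the intended round trip, with a hypothetical stand-in for the model's predict:

object LogTargetSketch {
  def main(args: Array[String]): Unit = {
    val labels    = Seq(12.0, 45.0)
    val logLabels = labels.map(math.log)   // the model is trained on log(label)
    val logPreds  = logLabels.map(_ + 0.1) // hypothetical stand-in for decisionTreeModel.predict
    // Compare in the original scale: exponentiate the predictions before zipping with the raw labels.
    val trueVsPredicted = labels.zip(logPreds.map(math.exp))
    trueVsPredicted.foreach(println)
  }
}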
Example 46
Source File: RidgeRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object RidgeRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() //print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.1 val intercept =false val rr = new RidgeRegressionWithSGD() rr.optimizer.setNumIterations(iterations) rr.optimizer.setStepSize(0.1) val rrModel = rr.run(data) val true_vs_predicted = data.map(p => (p.label, rrModel.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Ridge Regression - Mean Squared Error: " + mse) println("Ridge Regression - Mean Absolute Error: " + mae) println("Ridge Regression - Root Mean Squared Log Error:" + rmsle) } }
Example 47
Source File: JsonUtils.scala From InteractiveGraph-neo4j with BSD 2-Clause "Simplified" License | 5 votes |
package org.grapheco.server.util import com.google.gson._ import scala.collection.Map object JsonUtils { def getPrimitiveValue(value: JsonPrimitive): Any = { (value.isBoolean, value.isNumber, value.isString) match { case (true, false, false) => value.getAsBoolean; case (false, true, false) => Some(value.getAsNumber).map(num => if (num.toString.contains(".")) { num.doubleValue() } else { num.intValue() } ).get; case (false, false, true) => value.getAsString; } } val gson = new GsonBuilder() .setPrettyPrinting() .create(); def parse(json: String): JsonElement = { new JsonParser().parse(json); } def stringify(e: JsonElement): String = { gson.toJson(e); } def stringify(e: Map[String, _]): String = { gson.toJson(asJsonObject(e)); } def asJsonArray(arr: Array[_]) = { val ja = new JsonArray(); arr.foreach(x => ja.add(asJsonElement(x))); ja; } def asJsonElement(v: Any): JsonElement = { if (v.isInstanceOf[Map[_, _]]) { asJsonObject(v.asInstanceOf[Map[String, _]]); } else if (v.isInstanceOf[Array[_]]) { asJsonArray(v.asInstanceOf[Array[_]]); } else { v match { case x: String => new JsonPrimitive(x); case x: Number => new JsonPrimitive(x); case x: Boolean => new JsonPrimitive(x); } } } def asJsonObject(map: Map[String, _]) = { val jo = new JsonObject(); map.foreach(en => { jo.add(en._1, asJsonElement(en._2)); }) jo; } }
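Assuming the JsonUtils object above is on the classpath, nested scala.collection.Maps, arrays and primitives serialize with a single stringify call; the payload below is hypothetical.

import org.grapheco.server.util.JsonUtils
import scala.collection.Map

object JsonUtilsUsage {
  def main(args: Array[String]): Unit = {
    val payload: Map[String, Any] = Map(
      "name"   -> "n1",
      "weight" -> 3.5,
      "tags"   -> Array("graph", "demo"),
      "nested" -> Map[String, Any]("ok" -> true)
    )
    println(JsonUtils.stringify(payload)) // pretty-printed JSON object
  }
}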
Example 48
Source File: GradientBoostedTreesUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], iterations:Int, maxDepth:Int, maxBins: Int): Double ={ var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(iterations) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) val model = GradientBoostedTrees.train(train, boostingStrategy) // // @classmethod // @since("1.3.0") // def trainRegressor(cls, data, categoricalFeaturesInfo, // loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, // maxBins=32): val true_vs_predicted = test.map(p => (p.label, model.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
Example 49
Source File: GradientBoostedTreesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val conf = new SparkConf().setMaster("local").setAppName("GradientBoostedTreesRegressionApp") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Gradient Boosted Trees Model feature vector:" + first_point.features.toString) println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size) var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(3)// Note: Use more iterations in practice. boostingStrategy.treeStrategy.setMaxDepth(5) val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = data.map(p => (p.label, model.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val save = true if(save){ val true_vs_predicted_csv = data.map(p => p.label + " ," + model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv") } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Gradient Boosted Trees - Mean Squared Error: " + mse) println("Gradient Boosted Trees - Mean Absolute Error: " + mae) println("Gradient Boosted Trees - Root Mean Squared Log Error:" + rmsle) } }
Example 50
Source File: CalculateStdDeviation.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object CalculateStdDeviation{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => Util.extractFeatures(r, catLen, mappings)) } //data.saveAsTextFile("./output/temp.txt") val count_columns = data.first().size var a = 0; var x = new Array[Double](count_columns) // for loop execution with a range for( a <- 0 to (count_columns -1) ){ val stddev = data.map(r => r(a)).stdev() //println(a + ": " + ); x.update(a,stddev) } for( a <- 0 to (count_columns -1) ){ println(a + " : " + x(a)) } //val data_1_std_dev = data.map(r => r(1)).stdev() //println(data_1_std_dev) } }
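What data.map(r => r(a)).stdev() computes is a population standard deviation per column; a Spark-free sketch of the same column loop, for clarity:

object StdDevSketch {
  // Population standard deviation, matching RDD.stdev (divide by n, not n - 1).
  def stdev(xs: Seq[Double]): Double = {
    val mean = xs.sum / xs.size
    math.sqrt(xs.map(x => (x - mean) * (x - mean)).sum / xs.size)
  }

  def main(args: Array[String]): Unit = {
    val rows = Seq(Array(1.0, 10.0), Array(2.0, 20.0), Array(3.0, 30.0))
    val byColumn = rows.head.indices.map(i => stdev(rows.map(_(i))))
    byColumn.zipWithIndex.foreach { case (s, i) => println(s"$i : $s") }
  }
}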
Example 51
Source File: LinearRegressionWithIntercept.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithIntercept{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data1 = { records.map(r => Util.extractFeatures(r, catLen, mappings)) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true val linReg = new LinearRegressionWithSGD().setIntercept(intercept) linReg.optimizer.setNumIterations(iterations).setStepSize(step) val linear_model = linReg.run(data) print(data.first()); val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Linear Model - Mean Squared Error: " + mse) println("Linear Model - Mean Absolute Error: " + mae) println("Linear Model - Root Mean Squared Log Error:" + rmsle) } }
Example 54
Source File: IsotonicRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{IsotonicRegression, LabeledPoint} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object IsotonicRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader_1000.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val parsedData = records.map { r => (Util.extractLabel(r), Util.extractSumFeature(r, catLen, mappings), 1.0) } val iterations = 10 val step = 0.1 val intercept =false val x = new IsotonicRegression().setIsotonic(false) val model = x.run(parsedData) val parsedData1: RDD[Double] = parsedData.map(r => r._2) //val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = parsedData.map(p => (p._1, model.predict(p._2))) val save = true if(save){ val true_vs_predicted_csv = parsedData.map(p => ( p._1+ "," + model.predict(p._2))) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/isotonic_regression_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) Util.calculatePrintMetrics(true_vs_predicted, "Isotonic Regression") } }
Example 55
Source File: CommandUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import java.io.{File, FileOutputStream, InputStream, IOException} import scala.collection.JavaConverters._ import scala.collection.Map import org.apache.spark.SecurityManager import org.apache.spark.deploy.Command import org.apache.spark.internal.Logging import org.apache.spark.launcher.WorkerCommandBuilder import org.apache.spark.util.Utils def redirectStream(in: InputStream, file: File) { val out = new FileOutputStream(file, true) // TODO: It would be nice to add a shutdown hook here that explains why the output is // terminating. Otherwise if the worker dies the executor logs will silently stop. new Thread("redirect output to " + file) { override def run() { try { Utils.copyStream(in, out, true) } catch { case e: IOException => logInfo("Redirection to " + file + " closed: " + e.getMessage) } } }.start() } }
Example 56
Source File: GroupedCountEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
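The interesting branch is the partial one: with only a fraction p of task outputs merged, each key's running sum is scaled up to estimate the final count, and CountEvaluator.bound (not shown here) attaches the confidence interval. A rough, self-contained sketch of just the scaling step:

import scala.collection.Map

object PartialCountSketch {
  // With outputsMerged of totalOutputs seen, sum / p is the natural point estimate of the final count.
  def extrapolate(sums: Map[String, Long], outputsMerged: Int, totalOutputs: Int): Map[String, Double] = {
    val p = outputsMerged.toDouble / totalOutputs
    sums.map { case (key, sum) => key -> sum / p }
  }

  def main(args: Array[String]): Unit =
    println(extrapolate(Map("a" -> 50L, "b" -> 10L), outputsMerged = 2, totalOutputs = 4)) // Map(a -> 100.0, b -> 20.0)
}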
Example 57
Source File: GroupedSumEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 58
Source File: GroupedCountEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T,Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result(key) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(key) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 59
Source File: GroupedMeanEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 60
Source File: Cache.scala From sangria with Apache License 2.0 | 5 votes |
package sangria.util

import scala.collection.{Map, Set}

trait Cache[Key, Value] {
  def size: Int

  def contains(key: Key): Boolean
  def apply(key: Key): Value
  def get(key: Key): Option[Value]
  def getOrElse(key: Key, default: => Value): Value
  def update(key: Key, value: Value): Unit
  def remove(key: Key): Unit
  def clear(): Unit

  // NOTE: that `getOrElseUpdate` allows a race condition between value retrieval and cache update.
  // It is an explicit decision to avoid any kind of synchronization (it is preferred to recompute value multiple times than to synchronize)
  def getOrElseUpdate(key: Key, fn: => Value): Value

  def find(fn: (Key, Value) => Boolean): Option[(Key, Value)]
  def mapToSet[R](fn: (Key, Value) => R): Set[R]
  def mapValues[R](fn: Value => R): Map[Key, R]
  def keyExists(fn: Key => Boolean): Boolean
  def forEachValue(fn: Value => Unit): Unit
  def removeKeys(fn: Key => Boolean): Unit
}

object Cache {
  def empty[Key, Value]: Cache[Key, Value] = emptyConcurrentHashMap[Key, Value]

  def emptyTrieMap[Key, Value] = new TrieMapCache[Key, Value]
  def emptyConcurrentHashMap[Key, Value] = new ConcurrentHashMapCache[Key, Value]

  def apply[Key, Value](elems: (Key, Value)*) = {
    val c = empty[Key, Value]
    elems.foreach { case (key, value) => c(key) = value }
    c
  }
}
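The NOTE on getOrElseUpdate is the design-relevant part: lookup and insert are deliberately not synchronized, so a value may be computed more than once under contention. A sketch of what a TrieMap-backed implementation of that one method can look like; sangria's actual TrieMapCache is not shown on this page.

import scala.collection.concurrent.TrieMap

object CacheSketch {
  final class SimpleCache[K, V] {
    private val underlying = TrieMap.empty[K, V]

    // No lock: concurrent callers may both evaluate fn; putIfAbsent keeps whichever value landed first.
    def getOrElseUpdate(key: K, fn: => V): V =
      underlying.getOrElse(key, {
        val computed = fn
        underlying.putIfAbsent(key, computed).getOrElse(computed)
      })
  }

  def main(args: Array[String]): Unit = {
    val cache = new SimpleCache[String, Int]
    println(cache.getOrElseUpdate("answer", { println("computing"); 42 })) // prints "computing", then 42
    println(cache.getOrElseUpdate("answer", { println("computing"); 42 })) // 42, served from the cache
  }
}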