scala.collection.Map Scala Examples

The following examples show how to use scala.collection.Map. Each one is taken from an open-source project; the header above each example names the original source file, project, and license.
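
All of these files import scala.collection.Map, the read-only interface that both the immutable and the mutable map implementations extend; depending on it lets a method accept either kind without committing to one. A minimal sketch of that idea (names and values here are made up, not taken from any example below):

import scala.collection.Map
import scala.collection.mutable

// Accepts any map implementation, because scala.collection.Map is their common read-only supertype.
def describe(counts: Map[String, Int]): String =
  counts.map { case (k, v) => s"$k=$v" }.mkString(", ")

describe(scala.collection.immutable.Map("a" -> 1, "b" -> 2))   // works with an immutable map
describe(mutable.HashMap("c" -> 3))                            // and with a mutable one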
Example 1
Source File: CommandUtils.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy]
object CommandUtils extends Logging {
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 2
Source File: GroupedCountEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
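
The interesting branch above is the partial result: with a fraction p of the partitions merged, each observed per-key count sum is scaled up to an estimate of (sum + 1 - p) / p and wrapped in a normal-approximation confidence interval. A small standalone sketch of that arithmetic, using the same formulas outside Spark (only commons-math3 is needed; the inputs are made up):

import org.apache.commons.math3.distribution.NormalDistribution

object CountBoundSketch extends App {
  val sum = 40L              // hypothetical: 40 occurrences of a key observed so far
  val p = 4.0 / 10.0         // 4 of 10 partitions merged
  val confidence = 0.95

  // Same formulas as the partial-result branch of GroupedCountEvaluator above.
  val mean = (sum + 1 - p) / p
  val variance = (sum + 1) * (1 - p) / (p * p)
  val stdev = math.sqrt(variance)
  val confFactor = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)

  println(f"estimate=$mean%.1f low=${mean - confFactor * stdev}%.1f high=${mean + confFactor * stdev}%.1f")
}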
Example 3
Source File: WordpieceTokenized.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
import scala.collection.Map


object WordpieceTokenized extends Annotated[WordpieceTokenizedSentence] {

  override def annotatorType: String = AnnotatorType.WORDPIECE

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .toArray

    SentenceSplit.unpack(annotations).map(sentence => {
      tokens.filter(token =>
        token.begin >= sentence.start & token.end <= sentence.end
      ).map(token =>
        TokenPiece(wordpiece = token.result,
           token = token.metadata("token"),
           pieceId = token.metadata("pieceId").toInt,
           isWordStart = token.metadata("isWordStart").toBoolean,
           begin = token.begin,
           end = token.end
        )
      )
    }).filter(_.nonEmpty).map(tokens => WordpieceTokenizedSentence(tokens))

  }

  override def pack(sentences: Seq[WordpieceTokenizedSentence]): Seq[Annotation] = {
    var sentenceIndex = 0

    sentences.flatMap{sentence =>
      sentenceIndex += 1
      sentence.tokens.map{token =>
        Annotation(annotatorType, token.begin, token.end, token.wordpiece,
          Map("sentence" -> sentenceIndex.toString,
            "isWordStart" -> token.isWordStart.toString,
            "pieceId" -> token.pieceId.toString,
            "token" -> token.token)
        )
      }}
  }
}

case class WordpieceTokenizedSentence(tokens: Array[TokenPiece])
case class TokenPiece(wordpiece: String, token: String, pieceId: Int, isWordStart: Boolean, begin: Int, end: Int) 
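Since pack only needs the two case classes above, its input shape can be illustrated by hand-building a single sentence and packing it back into annotations (all field values here are hypothetical):

import com.johnsnowlabs.nlp.annotators.common._

val sentence = WordpieceTokenizedSentence(Array(
  TokenPiece(wordpiece = "##ing", token = "running", pieceId = 2340,
             isWordStart = false, begin = 4, end = 6)))

val annotations = WordpieceTokenized.pack(Seq(sentence))
// Each resulting Annotation carries the sentence index, token, pieceId and isWordStart in its metadata.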
Example 4
Source File: NerConverter.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable}
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.Map


  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> true
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)
    val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT)
    val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) =>
      NerTagsEncoding.fromIOB(sentence, doc._1, sentenceIndex=doc._2, $(preservePosition))
    }

    entities.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity))).
      zipWithIndex.map{case (entity, idx) =>
      Annotation(
        outputAnnotatorType,
        entity.start,
        entity.end,
        entity.text,
        Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
      )
    }
  }

}

object NerConverter extends ParamsAndFeaturesReadable[NerConverter] 
Example 5
Source File: DeltaTableOperations.scala    From delta   with Apache License 2.0
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._


trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
} 
Example 6
Source File: ElasticsearchConnector.scala    From hail   with MIT License
package is.hail.io

import org.apache.spark
import org.elasticsearch.spark.sql._

import scala.collection.JavaConverters._
import scala.collection.Map

object ElasticsearchConnector {

  def export(
    df: spark.sql.DataFrame,
    host: String,
    port: Int,
    index: String,
    indexType: String,
    blockSize: Int,
    config: java.util.HashMap[String, String],
    verbose: Boolean) {
    export(df, host, port, index, indexType, blockSize,
      Option(config).map(_.asScala.toMap).getOrElse(Map.empty[String, String]), verbose)
  }

  def export(df: spark.sql.DataFrame, host: String = "localhost", port: Int = 9200,
    index: String, indexType: String, blockSize: Int = 1000,
    config: Map[String, String], verbose: Boolean = true) {

    // config docs: https://www.elastic.co/guide/en/elasticsearch/hadoop/master/configuration.html

    val defaultConfig = Map(
      "es.nodes" -> host,
      "es.port" -> port.toString,
      "es.batch.size.entries" -> blockSize.toString,
      "es.index.auto.create" -> "true")

    val mergedConfig = if (config == null)
      defaultConfig
    else
      defaultConfig ++ config

    if (verbose)
      println(s"Config ${ mergedConfig }")

    df.saveToEs(s"${ index }/${ indexType }", mergedConfig)
  }
} 
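
A hedged usage sketch of the Scala overload follows; myDataFrame stands in for any spark.sql.DataFrame, and the extra key is a standard elasticsearch-hadoop setting that gets merged over the defaults built above:

import is.hail.io.ElasticsearchConnector

ElasticsearchConnector.export(
  df        = myDataFrame,                          // placeholder DataFrame
  host      = "localhost",
  port      = 9200,
  index     = "variants",
  indexType = "doc",
  blockSize = 1000,
  config    = Map("es.nodes.wan.only" -> "true"),   // overrides/extends the defaults
  verbose   = true)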
Example 7
Source File: CommandUtils.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy]
object CommandUtils extends Logging {
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 8
Source File: GroupedCountEvaluator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 9
Source File: JacksonGenerator.scala    From iolap   with Apache License 2.0
package org.apache.spark.sql.json

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {
  
  def apply(rowSchema: StructType, gen: JsonGenerator)(row: Row): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v: String) => gen.writeString(v)
      case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v) => gen.writeString(v.toString)
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, udt.serialize(v))

      case (ArrayType(ty, _), v: Seq[_]) =>
        gen.writeStartArray()
        v.foreach(valWriter(ty, _))
        gen.writeEndArray()

      case (MapType(kv, vv, _), v: Map[_, _]) =>
        gen.writeStartObject()
        v.foreach { p =>
          gen.writeFieldName(p._1.toString)
          valWriter(vv, p._2)
        }
        gen.writeEndObject()

      case (StructType(ty), v: Row) =>
        gen.writeStartObject()
        ty.zip(v.toSeq).foreach {
          case (_, null) =>
          case (field, v) =>
            gen.writeFieldName(field.name)
            valWriter(field.dataType, v)
        }
        gen.writeEndObject()
    }

    valWriter(rowSchema, row)
  }
} 
Example 10
Source File: CommandUtils.scala    From iolap   with Apache License 2.0
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}
import java.lang.System._

import scala.collection.JavaConversions._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy]
object CommandUtils extends Logging {
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 11
Source File: GroupedSumEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
Example 12
Source File: GroupedCountEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 13
Source File: GroupedMeanEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 14
Source File: JsonUtils.scala    From OUTDATED_ledger-wallet-android   with MIT License
package co.ledger.wallet.core.utils

import org.json.{JSONArray, JSONObject}
import scala.collection.Map

trait JsonUtils {

  implicit def Map2JsonObject[T](map: Map[String, T]): JSONObject = {
    val json = new JSONObject()
    map foreach {case (key, value) =>
      value match {
        case string: String => json.put(key, string)
        case double: Double => json.put(key, double)
        case float: Float => json.put(key, float)
        case boolean: Boolean => json.put(key, boolean)
        case jsonObject: JSONObject => json.put(key, jsonObject)
        case jsonArray: JSONArray => json.put(key, jsonArray)
        case map: Map[_, _] => json.put(key, Map2JsonObject(map.asInstanceOf[Map[String, _]]))
        case array: Array[_] => json.put(key, Array2JsonArray(array))
        case _ => json.put(key, value.toString)
      }
    }
    json
  }

  implicit def Array2JsonArray[T](array: Array[T]): JSONArray = {
    val json = new JSONArray()
    array foreach {
      case string: String => json.put(string)
      case double: Double => json.put(double)
      case float: Float => json.put(float)
      case boolean: Boolean => json.put(boolean)
      case jsonObject: JSONObject => json.put(jsonObject)
      case jsonArray: JSONArray => json.put(jsonArray)
      case map: Map[_, _] => json.put(Map2JsonObject(map.asInstanceOf[Map[String, _]]))
      case array: Array[AnyRef] => json.put(Array2JsonArray(array))
      case value => json.put(value.toString)
    }
    json
  }

  implicit class JsonStringContext(val c: StringContext) {
    def json(args: Any*): JSONObject = {
      val strings = c.parts.iterator
      val arguments = args.iterator
      val string = new StringBuffer(strings.next())
      while (strings.hasNext) {
        arguments.next() match {
          case charSequence: CharSequence => string.append("\"" + charSequence.toString + "\"")
          case arg => string.append(arg.toString)
        }
        string.append(strings.next())
      }
      new JSONObject(string.toString)
    }
  }

}

object JsonUtils extends JsonUtils 
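
Because the object at the bottom mixes the trait back in, importing its members brings both the implicit conversions and the json interpolator into scope. A small hedged sketch (it assumes org.json on the classpath; the values are made up):

import co.ledger.wallet.core.utils.JsonUtils._
import org.json.{JSONArray, JSONObject}

// The implicit conversions kick in whenever a JSONObject or JSONArray is expected.
val obj: JSONObject = Map("name" -> "satoshi", "height" -> 42.0, "confirmed" -> true)
val arr: JSONArray  = Array("a", "b", "c")

// CharSequence arguments are quoted by the interpolator; everything else is inlined via toString.
val payload = json"""{"label": ${"wallet #1"}, "count": ${3}}"""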
Example 15
Source File: TaskResult.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.

private[spark] sealed trait TaskResult[T]


  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 16
Source File: GroupedSumEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
Example 17
Source File: SentenceWithEmbeddings.scala    From spark-nlp   with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}
import scala.collection.Map


case class WordpieceEmbeddingsSentence
(
  tokens: Array[TokenPieceEmbeddings],
  sentenceId: Int
)

case class TokenPieceEmbeddings(wordpiece: String, token: String, pieceId: Int,
                                isWordStart: Boolean, isOOV: Boolean,
                                embeddings: Array[Float], begin: Int, end: Int)

object TokenPieceEmbeddings {
  def apply(piece: TokenPiece, embeddings: Array[Float]): TokenPieceEmbeddings = {
    TokenPieceEmbeddings(
      wordpiece = piece.wordpiece,
      token = piece.token,
      pieceId = piece.pieceId,
      isWordStart = piece.isWordStart,
      isOOV = false, // FIXME: I think BERT wont have OOV, this "constructor" is called from TensorFlowBert
      embeddings = embeddings,
      begin = piece.begin,
      end = piece.end)
  }
  def apply(wordpiece: String, token: String, pieceId: Int,
            isWordStart: Boolean,
            embeddings: Option[Array[Float]], zeroArray: Array[Float], begin: Int, end: Int): TokenPieceEmbeddings = {

    val vector = embeddings.getOrElse(zeroArray)
    val oov = embeddings match { case Some(_) => false; case default => true; }
    TokenPieceEmbeddings(
      wordpiece = wordpiece,
      token = token,
      pieceId = pieceId,
      isWordStart = isWordStart,
      isOOV = oov,
      embeddings = vector,
      begin = begin,
      end = end)
  }
}

object WordpieceEmbeddingsSentence extends Annotated[WordpieceEmbeddingsSentence] {
  override def annotatorType: String = AnnotatorType.WORD_EMBEDDINGS

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceEmbeddingsSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .groupBy(_.metadata("sentence").toInt)

    tokens.map{case (idx: Int, sentenceTokens: Seq[Annotation]) =>
      val tokensWithSentence = sentenceTokens.map { token =>
        new TokenPieceEmbeddings(
          wordpiece = token.result,
          token = token.metadata("token"),
          pieceId = token.metadata("pieceId").toInt,
          isWordStart = token.metadata("isWordStart").toBoolean,
          isOOV = token.metadata.getOrElse("isOOV", "false").toBoolean,
          embeddings = token.embeddings,
          begin = token.begin,
          end = token.end
        )
      }.toArray

      WordpieceEmbeddingsSentence(tokensWithSentence, idx)
    }.toSeq.sortBy(_.sentenceId)
  }

  override def pack(sentences: Seq[WordpieceEmbeddingsSentence]): Seq[Annotation] = {
    sentences.flatMap{sentence =>
      var isFirstToken = true
      sentence.tokens.map{ token =>
        // Store embeddings for token
        val embeddings = token.embeddings

        isFirstToken = false
        Annotation(annotatorType, token.begin, token.end, token.token,
          Map("sentence" -> sentence.sentenceId.toString,
            "token" -> token.token,
            "pieceId" -> token.pieceId.toString,
            "isWordStart" -> token.isWordStart.toString,
            "isOOV" -> token.isOOV.toString
          ),
          embeddings
        )
      }
    }
  }
} 
Example 18
Source File: GroupedMeanEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 19
Source File: LanguageDetectorTransformer.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map


  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

} 
Example 20
Source File: CommandUtils.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy]
object CommandUtils extends Logging {
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 21
Source File: GroupedCountEvaluator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 22
Source File: JacksonGenerator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils}

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {
  
  def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v) => gen.writeString(v.toString)
      case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString)
      // For UDT values, they should be in the SQL type's corresponding value type.
      // We should not see values in the user-defined class at here.
      // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is
      // an ArrayData at here, instead of a Vector.
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v)

      case (ArrayType(ty, _), v: ArrayData) =>
        gen.writeStartArray()
        v.foreach(ty, (_, value) => valWriter(ty, value))
        gen.writeEndArray()

      case (MapType(kt, vt, _), v: MapData) =>
        gen.writeStartObject()
        v.foreach(kt, vt, { (k, v) =>
          gen.writeFieldName(k.toString)
          valWriter(vt, v)
        })
        gen.writeEndObject()

      case (StructType(ty), v: InternalRow) =>
        gen.writeStartObject()
        var i = 0
        while (i < ty.length) {
          val field = ty(i)
          val value = v.get(i, field.dataType)
          if (value != null) {
            gen.writeFieldName(field.name)
            valWriter(field.dataType, value)
          }
          i += 1
        }
        gen.writeEndObject()

      case (dt, v) =>
        sys.error(
          s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.")
    }

    valWriter(rowSchema, row)
  }
} 
Example 23
Source File: CommandUtils.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy]
object CommandUtils extends Logging {
  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 24
Source File: TaskResult.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]


  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
} 
Example 25
Source File: GroupedSumEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 26
Source File: GroupedCountEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 27
Source File: GroupedMeanEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 28
Source File: Settings.scala    From neuroflow   with Apache License 2.0
package neuroflow.core

import neuroflow.core.Network.LearningRate

import scala.collection.{Map, Set}


case class Settings[V]
                    (verbose           :  Boolean                      =  true,
                     learningRate      :  LearningRate[V]              =  { case (i, α) => α }: LearningRate[V],
                     updateRule        :  Update[V]                    =  Vanilla[V](),
                     precision         :  Double                       =  1E-3,
                     iterations        :  Int                          =  Int.MaxValue,
                     prettyPrint       :  Boolean                      =  false,
                     batchSize         :  Option[Int]                  =  None,
                     gcThreshold       :  Option[Long]                 =  None,
                     lossFuncOutput    :  Option[LossFuncOutput]       =  None,
                     waypoint          :  Option[Waypoint[V]]          =  None,
                     approximation     :  Option[Approximation[V]]     =  None,
                     regularization    :  Option[Regularization]       =  None,
                     partitions        :  Option[Set[Int]]             =  None,
                     specifics         :  Option[Map[String, V]]       =  None) extends Serializable 
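
Every field has a default, so a typical configuration overrides only a handful of them. A hedged sketch (the field names are the ones declared above; the decaying learning-rate function is made up):

import neuroflow.core.Settings

val settings = Settings[Double](
  learningRate = { case (iter, α) => α * 0.99 },   // shrink the rate slightly at every step
  precision    = 1E-4,
  iterations   = 10000,
  batchSize    = Some(32))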
Example 29
Source File: PSVector.scala    From sona   with Apache License 2.0
package com.tencent.angel.sona.models

import java.util.concurrent.Future
import scala.collection.Map

import org.apache.spark.SparkException

import com.tencent.angel.ml.math2.vector.Vector
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.psf.get.base.{GetFunc, GetResult}
import com.tencent.angel.ml.matrix.psf.update.base.{UpdateFunc, VoidResult}
import com.tencent.angel.sona.context.PSContext

object PSVector {
  def longKeySparse(dim: Long,
                    maxRange: Long,
                    capacity: Int = 20,
                    rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
                    additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dim, capacity, maxRange, rowType, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int, range: Long, rowType: RowType,
             additionalConfiguration: Map[String, String]): PSVector = {
    PSContext.instance().createVector(dimension, rowType, capacity, range, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int = 20, rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
             additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dimension, capacity, dimension, rowType, additionalConfiguration)
  }
} 
Example 30
Source File: ModelWithDescriptor.scala    From kafka-with-akka-streams-kafka-streams-tutorial   with Apache License 2.0
package com.lightbend.scala.modelServer.model

import java.io.{DataInputStream, DataOutputStream}

import com.lightbend.model.modeldescriptor.ModelDescriptor

import scala.collection.Map
import com.lightbend.scala.modelServer.model.PMML.PMMLModel
import com.lightbend.scala.modelServer.model.tensorflow.TensorFlowModel

import scala.util.Try


case class ModelWithDescriptor(model: Model, descriptor: ModelToServe){}

object ModelWithDescriptor {

  private val factories = Map(
    ModelDescriptor.ModelType.PMML.name -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.name -> TensorFlowModel
  )

  private val factoriesInt = Map(
    ModelDescriptor.ModelType.PMML.index -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.index -> TensorFlowModel
  )

  def fromModelToServe(descriptor : ModelToServe): Try[ModelWithDescriptor] = Try{
    println(s"New model - $descriptor")
    factories.get(descriptor.modelType.name) match {
      case Some(factory) => ModelWithDescriptor(factory.create(descriptor),descriptor)
      case _ => throw new Throwable("Undefined model type")
    }
  }

  def readModel(input : DataInputStream) : Option[Model] = {
    input.readLong.toInt match{
      case length if length > 0 =>
        val `type` = input.readLong.toInt
        val bytes = new Array[Byte](length)
        input.read(bytes)
        factoriesInt.get(`type`) match {
          case Some(factory) => try {
            Some(factory.restore(bytes))
          }
          catch {
            case t: Throwable =>
              System.out.println("Error Deserializing model")
              t.printStackTrace()
              None
          }
          case _ => None
        }
      case _ => None
    }
  }

  def writeModel(output : DataOutputStream, model: Model) : Unit = {
    if(model == null)
      output.writeLong(0l)
    else {
      try {
        val bytes = model.toBytes
        output.writeLong(bytes.length)
        output.writeLong(model.getType)
        output.write(bytes)
      } catch {
        case t: Throwable =>
          System.out.println("Error Serializing model")
          t.printStackTrace()
      }
    }
  }
} 
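
A hedged round-trip sketch of the binary framing used by writeModel and readModel (model stands in for any concrete Model implementation already loaded in the program):

import com.lightbend.scala.modelServer.model.{Model, ModelWithDescriptor}
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream, DataOutputStream}

val buffer = new ByteArrayOutputStream()
ModelWithDescriptor.writeModel(new DataOutputStream(buffer), model)     // writes length, type, then the model bytes

val restored: Option[Model] = ModelWithDescriptor.readModel(
  new DataInputStream(new ByteArrayInputStream(buffer.toByteArray)))    // None if the length was 0 or restore failed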
Example 31
Source File: SpecHelper.scala    From peregrine   with Apache License 2.0
package io.peregrine

import com.twitter.finagle.http.{Request => FinagleRequest, Response => FinagleResponse}
import com.twitter.util.{Await, Future}
import org.jboss.netty.handler.codec.http.HttpMethod
import org.jboss.netty.util.CharsetUtil.UTF_8

import scala.collection.Map

class MockResponse(val originalResponse: FinagleResponse) {

  def status                  = originalResponse.getStatus()
  def code                    = originalResponse.getStatus().getCode
  def body                    = originalResponse.getContent().toString(UTF_8)
  def getHeader(name: String) = originalResponse.headers().get(name)
  def getHeaders              = originalResponse.headerMap

}

trait SpecHelper {

  def response  = new MockResponse(Await.result(lastResponse))
  var lastResponse: Future[FinagleResponse] = null

  def server: PeregrineServer

  def get(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.GET,path,params,headers)
  }

  def post(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.POST,path,params,headers,body)
  }

  def put(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.PUT,path,params,headers,body)
  }

  def delete(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.DELETE,path,params,headers)
  }

  def head(path:String,params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.HEAD,path,params,headers)
  }

  def patch(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.PATCH,path,params,headers)
  }

  def options(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.OPTIONS,path,params,headers,body)
  }

  def send(request: FinagleRequest) {
    executeRequest(request)
  }

  private def executeRequest(
                              method: HttpMethod,
                              path: String,
                              params: Map[String, String] = Map(),
                              headers: Map[String,String] = Map(),
                              body: AnyRef = null
                              ) {
    val app = MockApp(server)
    val result: MockResult = app.execute(method = method, path = path, params = params, headers = headers, body = body)
    lastResponse = result.response
  }

  private def executeRequest(request: FinagleRequest) {
    val app = MockApp(server)
    val result: MockResult = app.execute(request)
    lastResponse = result.response
  }

} 
Example 32
package org.sparksamples

import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object GenerateDataFeaturesFile{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val sc = Util.sc
    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    print("Mapping of second categorical feature column: " +  get_mapping(records, 3))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => Util.extractLabel(r) + "," +  Util.extractSumFeature(r, catLen, mappings))
    }

    val data_collection = data.collect()
    val d_iterator = data_collection.iterator
    while(d_iterator.hasNext) {
      val x = d_iterator.next
      println(x)
    }
    val first_point = data.first()
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    data.saveAsTextFile("./output/x_features" + date + ".csv")

    sc.stop()
  }

} 
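The Map[String, Long] produced by get_mapping above assigns every distinct categorical value an index. As a standalone illustration of how such a mapping is typically turned into a one-of-k encoded feature array (the mapping contents and the encode helper below are hypothetical, not part of the project's Util object), consider:

import scala.collection.Map

object OneOfKEncodingSketch extends App {
  // Hypothetical mapping, in the shape returned by get_mapping
  val seasonMapping: Map[String, Long] =
    Map("spring" -> 0L, "summer" -> 1L, "fall" -> 2L, "winter" -> 3L)

  // Set a single 1.0 at the index assigned to the observed value
  def encode(value: String, mapping: Map[String, Long]): Array[Double] = {
    val vec = new Array[Double](mapping.size)
    mapping.get(value).foreach(idx => vec(idx.toInt) = 1.0)
    vec
  }

  println(encode("summer", seasonMapping).mkString("[", ", ", "]")) // [0.0, 1.0, 0.0, 0.0]
}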
Example 33
Source File: GroupedCountEvaluator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 34
Source File: TopElementsAggregator.scala    From salt-core   with Apache License 2.0 5 votes vote down vote up
package software.uncharted.salt.core.analytic.collection

import software.uncharted.salt.core.analytic.Aggregator

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.{Map => MutableMap}
import scala.collection.mutable.PriorityQueue
import scala.reflect.ClassTag


class TopElementsAggregator[ET: ClassTag](elementLimit: Int)
extends Aggregator[Seq[ET], Map[ET, Int], List[(ET, Int)]] {

  def default(): Map[ET, Int] = {
    Map[ET, Int]()
  }

  override def add(current: Map[ET, Int], next: Option[Seq[ET]]): Map[ET, Int] = {
    if (next.isDefined) {
      // If our current map is mutable, add new data in directly.
      // If not, convert to a mutable map, and then add data in
      val sum = current match {
        case hm: MutableMap[ET, Int] => hm
        case _ => {
          // The current value isn't itself a mutable hashmap yet; convert to one.
          val hm = new HashMap[ET, Int]()
          hm ++= current
          hm
        }
      }
      next.get.foreach(t => sum.put(t, sum.getOrElse(t, 0) + 1))
      sum
    } else {
      current
    }
  }

  override def merge(left: Map[ET, Int], right: Map[ET, Int]): Map[ET, Int] = {
    // If either input map is mutable, merge the other into it.
    // If neither is, convert one to mutable, and add the other into it.
    val (to, from) = left match {
      case hm: MutableMap[ET, Int] => (hm, right)
      case _ =>
        right match {
          case hm: MutableMap[ET, Int] => (hm, left)
          case _ =>
            val hm = new HashMap[ET, Int]()
            hm ++= left
            (hm, right)
        }
    }
    from.foreach(t => {
      to.put(t._1, to.getOrElse(t._1, 0) + t._2)
    })
    to
  }

  override def finish(intermediate: Map[ET, Int]): List[(ET, Int)] = {
    val x = new PriorityQueue[(ET, Int)]()(Ordering.by(
      a => a._2
    ))
    intermediate.foreach(t => {
      x.enqueue(t)
    })
    var result = new ListBuffer[(ET, Int)]
    for (i <- 0 until Math.min(elementLimit, x.size)) {
      result.append(x.dequeue)
    }
    result.toList
  }
} 
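To see how the aggregator combines its Map[ET, Int] intermediates, here is a minimal sketch (assuming salt-core is on the classpath) that adds two batches, merges the partial counts, and keeps the two most frequent elements:

import software.uncharted.salt.core.analytic.collection.TopElementsAggregator

object TopElementsAggregatorSketch extends App {
  val agg = new TopElementsAggregator[String](2)

  // Build two partial count maps from separate batches of elements
  val left  = agg.add(agg.default(), Some(Seq("a", "b", "a")))
  val right = agg.add(agg.default(), Some(Seq("b", "c")))

  // Merge them and keep only the two most frequent elements
  println(agg.finish(agg.merge(left, right))) // e.g. List((a,2), (b,2))
}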
Example 35
Source File: TypeRewriter.scala    From Converter   with GNU General Public License v3.0 5 votes vote down vote up
package org.scalablytyped.converter.internal
package ts
package transforms

import scala.collection.Map

class TypeRewriter(base: TsTree) extends TreeTransformation[Map[TsType, TsType]] {
  override def leaveTsType(replacements: Map[TsType, TsType])(x: TsType): TsType =
    replacements.getOrElse(x, x)

  override def withTree(t: Map[TsType, TsType], tree: TsTree): Map[TsType, TsType] =
    if (tree === base) t
    else
      tree match {
        
        case HasTParams(tparams) =>
          t.filterKeys {
            case TsTypeRef(_, TsQIdent(IArray.exactlyOne(one: TsIdentSimple)), _) if tparams.exists(_.name === one) =>
              false
            case _ => true
          }
        case _ => t
      }
} 
Example 36
Source File: PairSyntax.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.syntax

import com.danielwestheide.kontextfrei.DCollectionOps
import org.apache.spark.Partitioner

import scala.collection.Map
import scala.reflect.ClassTag

class PairSyntax[DCollection[_], A: ClassTag, B: ClassTag](
    val self: DCollectionOps[DCollection],
    val coll: DCollection[(A, B)]) {

  final def keys: DCollection[A] = self.keys(coll)

  final def values: DCollection[B] = self.values(coll)

  final def cogroup[C: ClassTag](other: DCollection[(A, C)])
    : DCollection[(A, (Iterable[B], Iterable[C]))] =
    self.cogroup(coll)(other)

  final def leftOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (B, Option[C]))] =
    self.leftOuterJoin(coll)(other)

  final def rightOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], C))] =
    self.rightOuterJoin(coll)(other)

  final def fullOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], Option[C]))] =
    self.fullOuterJoin(coll)(other)

  final def mapValues[C: ClassTag](f: B => C): DCollection[(A, C)] =
    self.mapValues(coll)(f)

  final def flatMapValues[C: ClassTag](
      f: B => TraversableOnce[C]): DCollection[(A, C)] =
    self.flatMapValues(coll)(f)

  final def reduceByKey(f: (B, B) => B): DCollection[(A, B)] =
    self.reduceByKey(coll)(f)

  final def foldByKey(zeroValue: B)(f: (B, B) => B): DCollection[(A, B)] =
    self.foldByKey(coll)(zeroValue, f)

  final def aggregateByKey[C: ClassTag](zeroValue: C)(
      seqOp: (C, B) => C,
      combOp: (C, C) => C): DCollection[(A, C)] =
    self.aggregateByKey(coll)(zeroValue)(seqOp, combOp)

  final def combineByKey[C: ClassTag](
      createCombiner: B => C,
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): DCollection[(A, C)] =
    self.combineByKey(coll)(createCombiner)(mergeValue, mergeCombiners)

  final def countByKey(): Map[A, Long] = self.countByKey(coll)

  final def collectAsMap(): Map[A, B] = self.collectAsMap(coll)

  final def partitionBy(partitioner: Partitioner): DCollection[(A, B)] =
    self.partitionBy(coll)(partitioner)
} 
Example 37
Source File: RDDPairFunctions.scala    From kontextfrei   with Apache License 2.0 5 votes vote down vote up
package com.danielwestheide.kontextfrei.rdd

import com.danielwestheide.kontextfrei.DCollectionPairFunctions
import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.reflect.ClassTag

private[kontextfrei] trait RDDPairFunctions
    extends DCollectionPairFunctions[RDD] { this: RDDBase =>

  override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) {
    _.cogroup(y)
  }

  override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) {
    _.values
  }

  override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) {
    _.keys
  }

  override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) {
    _.leftOuterJoin(y)
  }

  override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) {
    _.rightOuterJoin(y)
  }

  override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) {
    _.fullOuterJoin(y)
  }

  override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) {
    _.mapValues(f)
  }

  override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag](
      x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) {
    _.flatMapValues(f)
  }

  override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])(
      f: (B, B) => B): RDD[(A, B)] = withSite(xs) {
    _.reduceByKey(f)
  }

  override final def foldByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) {
    _.foldByKey(zeroValue)(f)
  }

  override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C,
                                     combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.aggregateByKey(zeroValue)(seqOp, combOp)
  }

  override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag](
      xs: RDD[(A, B)])(createCombiner: B => C)(
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) {
    _.combineByKey(createCombiner, mergeValue, mergeCombiners)
  }

  override final def countByKey[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) {
    _.countByKey()
  }

  override final def collectAsMap[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)]): Map[A, B] = withSite(xs) {
    _.collectAsMap()
  }

  override final def partitionBy[A: ClassTag, B: ClassTag](
      xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) {
    _.partitionBy(partitioner)
  }
} 
Example 38
Source File: XmlFile.scala    From spark-xml   with Apache License 2.0 5 votes vote down vote up
package com.databricks.spark.xml.util

import java.io.CharArrayWriter
import java.nio.charset.Charset
import javax.xml.stream.XMLOutputFactory

import scala.collection.Map

import com.databricks.spark.xml.parsers.StaxXmlGenerator
import com.sun.xml.txw2.output.IndentingXMLStreamWriter
import org.apache.hadoop.io.{Text, LongWritable}

import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.sql.DataFrame
import com.databricks.spark.xml.{XmlOptions, XmlInputFormat}

private[xml] object XmlFile {
  val DEFAULT_INDENT = "    "

  def withCharset(
      context: SparkContext,
      location: String,
      charset: String,
      rowTag: String): RDD[String] = {
    // This just checks the charset's validity early, to keep behavior
    Charset.forName(charset)
    context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>")
    context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset)
    context.newAPIHadoopFile(location,
      classOf[XmlInputFormat],
      classOf[LongWritable],
      classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) }
  }

  
  def saveAsXmlFile(
      dataFrame: DataFrame,
      path: String,
      parameters: Map[String, String] = Map()): Unit = {
    val options = XmlOptions(parameters.toMap)
    val codecClass = CompressionCodecs.getCodecClass(options.codec)
    val rowSchema = dataFrame.schema
    val indent = XmlFile.DEFAULT_INDENT

    val xmlRDD = dataFrame.rdd.mapPartitions { iter =>
      val factory = XMLOutputFactory.newInstance()
      val writer = new CharArrayWriter()
      val xmlWriter = factory.createXMLStreamWriter(writer)
      val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter)
      indentingXmlWriter.setIndentStep(indent)

      new Iterator[String] {
        var firstRow: Boolean = true
        var lastRow: Boolean = true

        override def hasNext: Boolean = iter.hasNext || firstRow || lastRow

        override def next: String = {
          if (iter.nonEmpty) {
            if (firstRow) {
              indentingXmlWriter.writeStartElement(options.rootTag)
              firstRow = false
            }
            val xml = {
              StaxXmlGenerator(
                rowSchema,
                indentingXmlWriter,
                options)(iter.next())
              indentingXmlWriter.flush()
              writer.toString
            }
            writer.reset()
            xml
          } else {
            if (!firstRow) {
              lastRow = false
              indentingXmlWriter.writeEndElement()
              indentingXmlWriter.close()
              writer.toString
            } else {
              // This means the iterator was initially empty.
              firstRow = false
              lastRow = false
              ""
            }
          }
        }
      }
    }

    codecClass match {
      case null => xmlRDD.saveAsTextFile(path)
      case codec => xmlRDD.saveAsTextFile(path, codec)
    }
  }
} 
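XmlFile is private to spark-xml; from user code the same parameters end up in this Map through the DataFrame writer's options. A minimal sketch, assuming a local SparkSession and the spark-xml package on the classpath (the output path is illustrative):

import org.apache.spark.sql.SparkSession

object SaveXmlSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("xml-sketch").getOrCreate()
  import spark.implicits._

  val df = Seq((1, "scala"), (2, "spark")).toDF("id", "name")

  // Each key/value pair set here flows into the parameters Map handled by XmlFile
  df.write
    .format("com.databricks.spark.xml")
    .option("rootTag", "books")
    .option("rowTag", "book")
    .save("./output/books-xml")

  spark.stop()
}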
Example 39
Source File: MapSerializerModule.scala    From mango   with Apache License 2.0 5 votes vote down vote up
package com.kakao.shaded.jackson.module.scala.ser

import com.kakao.shaded.jackson.databind.`type`.{TypeFactory, MapType, MapLikeType}
import com.kakao.shaded.jackson.databind.jsontype.TypeSerializer
import com.kakao.shaded.jackson.databind.ser.Serializers
import com.kakao.shaded.jackson.databind.ser.std.StdDelegatingSerializer
import com.kakao.shaded.jackson.databind.util.StdConverter
import com.kakao.shaded.jackson.databind._
import com.kakao.shaded.jackson.module.scala.modifiers.MapTypeModifierModule
import scala.collection.JavaConverters._
import scala.collection.Map

private class MapConverter(inputType: JavaType, config: SerializationConfig)
  extends StdConverter[Map[_,_],java.util.Map[_,_]]
{
  def convert(value: Map[_,_]): java.util.Map[_,_] = {
    val m = if (config.isEnabled(SerializationFeature.WRITE_NULL_MAP_VALUES)) {
      value
    } else {
      value.filter(_._2 != None)
    }
    m.asJava
  }


  override def getInputType(factory: TypeFactory) = inputType

  override def getOutputType(factory: TypeFactory) =
    factory.constructMapType(classOf[java.util.Map[_,_]], inputType.getKeyType, inputType.getContentType)
      .withTypeHandler(inputType.getTypeHandler)
      .withValueHandler(inputType.getValueHandler)
}

private object MapSerializerResolver extends Serializers.Base {

  val BASE = classOf[collection.Map[_,_]]

  override def findMapLikeSerializer(config: SerializationConfig,
                                     mapLikeType : MapLikeType,
                                     beanDesc: BeanDescription,
                                     keySerializer: JsonSerializer[AnyRef],
                                     elementTypeSerializer: TypeSerializer,
                                     elementValueSerializer: JsonSerializer[AnyRef]): JsonSerializer[_] = {


    val rawClass = mapLikeType.getRawClass

    if (!BASE.isAssignableFrom(rawClass)) null
    else new StdDelegatingSerializer(new MapConverter(mapLikeType, config))
  }

}

trait MapSerializerModule extends MapTypeModifierModule {
  this += MapSerializerResolver
} 
Example 40
Source File: TruckAndTrafficJoinBolt.scala    From trucking-iot   with Apache License 2.0 5 votes vote down vote up
package com.orendainx.trucking.storm.bolts

import java.util

import com.orendainx.trucking.commons.models.{EnrichedTruckAndTrafficData, EnrichedTruckData, TrafficData}
import com.typesafe.scalalogging.Logger
import org.apache.storm.task.{OutputCollector, TopologyContext}
import org.apache.storm.topology.OutputFieldsDeclarer
import org.apache.storm.topology.base.BaseWindowedBolt
import org.apache.storm.tuple.{Fields, Values}
import org.apache.storm.windowing.TupleWindow

import scala.collection.JavaConverters._
import scala.collection.mutable.ListBuffer
import scala.collection.{Map, mutable}
import scala.language.implicitConversions


  private def processAndEmitData(truckDataPerRoute: Map[Int, ListBuffer[EnrichedTruckData]],
                                 trafficDataPerRoute: Map[Int, ListBuffer[TrafficData]]) {

    // For each EnrichedTruckData object, find the TrafficData object with the closest timestamp
    truckDataPerRoute.foreach { case (routeId, truckDataList) =>
      trafficDataPerRoute.get(routeId) match {
        case None => // No traffic data for this routeId, so drop/ignore truck data
        case Some(trafficDataList) =>
          truckDataList foreach { truckData =>
            trafficDataList.sortBy(data => math.abs(data.eventTime - truckData.eventTime)).headOption match {
              case None => // Window didn't capture any traffic data for this truck's route
              case Some(trafficData) =>

                val joinedData = EnrichedTruckAndTrafficData(truckData.eventTime, truckData.truckId, truckData.driverId, truckData.driverName,
                  truckData.routeId, truckData.routeName, truckData.latitude, truckData.longitude, truckData.speed,
                  truckData.eventType, truckData.foggy, truckData.rainy, truckData.windy, trafficData.congestionLevel)

                outputCollector.emit(new Values("EnrichedTruckAndTrafficData", joinedData))
            }
          }
      }
    }
  }

  override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("dataType", "data"))
} 
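The join above pairs every truck reading with the traffic reading on the same route whose timestamp is nearest. The following self-contained sketch reproduces just that step with simplified stand-in case classes (TruckPing and TrafficPing are illustrative, not the project's model classes):

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object ClosestTimestampJoinSketch extends App {
  case class TruckPing(routeId: Int, eventTime: Long, speed: Int)
  case class TrafficPing(routeId: Int, eventTime: Long, congestionLevel: Int)

  val trucksPerRoute: Map[Int, ListBuffer[TruckPing]] =
    Map(1 -> ListBuffer(TruckPing(1, 100L, 55), TruckPing(1, 250L, 60)))
  val trafficPerRoute: Map[Int, ListBuffer[TrafficPing]] =
    Map(1 -> ListBuffer(TrafficPing(1, 90L, 2), TrafficPing(1, 240L, 4)))

  trucksPerRoute.foreach { case (routeId, trucks) =>
    trafficPerRoute.get(routeId).foreach { traffic =>
      trucks.foreach { truck =>
        // Pick the traffic reading whose timestamp is closest to the truck ping
        traffic.sortBy(t => math.abs(t.eventTime - truck.eventTime)).headOption.foreach { closest =>
          println(s"route $routeId: truck@${truck.eventTime} joined with traffic@${closest.eventTime} " +
            s"(congestion ${closest.congestionLevel})")
        }
      }
    }
  }
}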
Example 41
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object LinearRegression{
  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept =true

    //LinearRegressionWithSGD.tr
    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD")

  }

} 
Example 42
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object LinearRegressionWithLog{



  def main(args: Array[String]) {

    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    //val step = 0.2
    val step = 0.025
    val intercept =true

    //LinearRegressionWithSGD.tr
    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features))))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = false
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD Log")

  }

} 
Example 43
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }

    val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    return (training, test)
  }

  def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint],
               categoricalFeaturesInfo: scala.Predef.Map[Int, Int],
                maxDepth :Int, maxBins: Int): Double = {
    val impurity = "variance"
    val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo,
      impurity,maxDepth, maxBins )

    val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features)))
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())
    return rmsle
  }

} 
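A typical way to use DecisionTreeUtil.evaluate is a small grid search whose scores are collected into a Map keyed by the parameter pair. This sketch assumes the project's data files and Util object are available and only reuses the helpers defined above:

package org.sparksamples.decisiontree

import scala.collection.Map

object DecisionTreeGridSearch {
  def main(args: Array[String]) {
    val (training, test) = DecisionTreeUtil.getTrainTestData()
    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()

    // RMSLE for every (maxDepth, maxBins) combination, keyed by the parameter pair
    val results: Map[(Int, Int), Double] = (for {
      maxDepth <- Seq(5, 10)
      maxBins  <- Seq(16, 32)
    } yield (maxDepth, maxBins) ->
        DecisionTreeUtil.evaluate(training, test, categoricalFeaturesInfo, maxDepth, maxBins)).toMap

    results.foreach { case ((depth, bins), rmsle) =>
      println(s"maxDepth=$depth, maxBins=$bins -> RMSLE $rmsle")
    }
  }
}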
Example 44
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeCategoricalFeaturesApp{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = true
    //val sc = new SparkContext("local[2]", "First Spark App")
    val sc = Util.sc

    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data_dt = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r)))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)


    def getCatFeatures(): scala.Predef.Map[Int, Int] = {

      var d = scala.Predef.Map[Int, Int]()

      for(a <- 2 until 10){
        d += (a-2 -> (get_mapping(records, a).size + 1))
        //d.put(a-2,get_mapping(records, a).size + 1)
      }
      return d

    }
    val cat_features = getCatFeatures()
    //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)])

    //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32
    val decisionTreeModel= DecisionTree.trainRegressor(data_dt, cat_features,  impurity, maxDepth, maxBins)
    //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
    //  impurity, maxDepth, maxBins )

    val preds = decisionTreeModel.predict(data_dt.map( p=> p.features))
    val actual = data.map( p=> p.label)
    val true_vs_predicted_dt = actual.zip(preds)
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + decisionTreeModel.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)
    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features")
  }

} 
Example 45
package org.sparksamples.decisiontree

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.DecisionTree
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object DecisionTreeWithLog{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val save = false
    val sc = Util.sc

    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    println("Feature vector length for categorical features:"+ catLen)
    println("Feature vector length for numerical features:" + numLen)
    println("Total feature vector length: " + totalLen)


    val data_dt = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r)))
    }
    val first_point = data_dt.first()
    println("Decision Tree feature vector:" + first_point.features.toString)
    println("Decision Tree feature vector length: " + first_point.features.size)

    val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]()
    val impurity = "variance"
    val maxDepth = 5
    val maxBins = 32

    val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo,
      impurity, maxDepth, maxBins )

    val preds = decisionTreeModel.predict(data_dt.map( p=> p.features))
    val preds_2 = preds.map(p=> Math.exp(p))
    val actual = data_dt.map( p=> Math.exp(p.label))
    val true_vs_predicted_dt = actual.zip(preds)

    if(save){
      val true_vs_predicted_csv = data_dt.map(p => p.label + " ,"  + decisionTreeModel.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv")
    }

    print("Decision Tree depth: " + decisionTreeModel.depth)
    print("Decision Tree number of nodes: " + decisionTreeModel.numNodes)

    Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log")
    Util.sc.stop()
  }

} 
Example 46
package org.sparksamples

import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD}
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object RidgeRegressionApp{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    //val sc = new SparkContext("local[2]", "First Spark App")
    val sc = Util.sc

    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    records.cache()
    //print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.1
    val intercept =false
    val rr = new RidgeRegressionWithSGD()
    rr.optimizer.setNumIterations(iterations)
    rr.optimizer.setStepSize(0.1)
    val rrModel = rr.run(data)
    val true_vs_predicted = data.map(p => (p.label, rrModel.predict(p.features)))
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }
    val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean()
    val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())

    println("Ridge Regression - Mean Squared Error: "  + mse)
    println("Ridge Regression  - Mean Absolute Error: " + mae)
    println("Ridge Regression  - Root Mean Squared Log Error:" + rmsle)
  }
} 
Example 47
Source File: JsonUtils.scala    From InteractiveGraph-neo4j   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
package org.grapheco.server.util

import com.google.gson._

import scala.collection.Map


object JsonUtils {
  def getPrimitiveValue(value: JsonPrimitive): Any = {
    (value.isBoolean, value.isNumber, value.isString) match {
      case (true, false, false) => value.getAsBoolean;
      case (false, true, false) => Some(value.getAsNumber).map(num =>
        if (num.toString.contains(".")) {
          num.doubleValue()
        }
        else {
          num.intValue()
        }
      ).get;
      case (false, false, true) => value.getAsString;
    }
  }

  val gson = new GsonBuilder()
    .setPrettyPrinting()
    .create();

  def parse(json: String): JsonElement = {
    new JsonParser().parse(json);
  }

  def stringify(e: JsonElement): String = {
    gson.toJson(e);
  }

  def stringify(e: Map[String, _]): String = {
    gson.toJson(asJsonObject(e));
  }

  def asJsonArray(arr: Array[_]) = {
    val ja = new JsonArray();
    arr.foreach(x => ja.add(asJsonElement(x)));
    ja;
  }

  def asJsonElement(v: Any): JsonElement = {
    if (v.isInstanceOf[Map[_, _]]) {
      asJsonObject(v.asInstanceOf[Map[String, _]]);
    }
    else if (v.isInstanceOf[Array[_]]) {
      asJsonArray(v.asInstanceOf[Array[_]]);
    }
    else {
      v match {
        case x: String =>
          new JsonPrimitive(x);
        case x: Number =>
          new JsonPrimitive(x);
        case x: Boolean =>
          new JsonPrimitive(x);
      }
    }
  }

  def asJsonObject(map: Map[String, _]) = {
    val jo = new JsonObject();
    map.foreach(en => {
      jo.add(en._1, asJsonElement(en._2));
    })
    jo;
  }
} 
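Given these helpers, serializing a scala.collection.Map to JSON is a one-liner. A minimal usage sketch (the values are illustrative):

import org.grapheco.server.util.JsonUtils

import scala.collection.Map

object JsonUtilsSketch extends App {
  val node: Map[String, Any] = Map(
    "id" -> 42,
    "label" -> "Person",
    "active" -> true,
    "tags" -> Array("neo4j", "graph")
  )
  // Produces pretty-printed JSON via the Gson instance configured in JsonUtils
  println(JsonUtils.stringify(node))
}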
Example 48
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object GradientBoostedTreesUtil {

  def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }


    val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)
    return (training, test)
  }

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }



  def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], iterations:Int, maxDepth:Int,
               maxBins: Int): Double ={

    var boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.setNumIterations(iterations)
    boostingStrategy.treeStrategy.setMaxDepth(maxDepth)
    boostingStrategy.treeStrategy.setMaxBins(maxBins)

    val model = GradientBoostedTrees.train(train, boostingStrategy)
//
//    @classmethod
//    @since("1.3.0")
//    def trainRegressor(cls, data, categoricalFeaturesInfo,
//                       loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3,
//                       maxBins=32):

    val true_vs_predicted = test.map(p => (p.label, model.predict(p.features)))
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())
    return rmsle
  }

} 
Example 49
package org.sparksamples.gradientboosted

import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.GradientBoostedTrees
import org.apache.spark.mllib.tree.configuration.BoostingStrategy
import org.apache.spark.rdd.RDD
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object GradientBoostedTreesApp{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    //val conf = new SparkConf().setMaster("local").setAppName("GradientBoostedTreesRegressionApp")
    val sc = Util.sc

    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    records.cache()
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Gradient Boosted Trees Model feature vector:" + first_point.features.toString)
    println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size)


    var boostingStrategy = BoostingStrategy.defaultParams("Regression")
    boostingStrategy.setNumIterations(3)// Note: Use more iterations in practice.
    boostingStrategy.treeStrategy.setMaxDepth(5)


    val model = GradientBoostedTrees.train(data, boostingStrategy)
    val true_vs_predicted = data.map(p => (p.label, model.predict(p.features)))
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }
    val save = true
    if(save){
      val true_vs_predicted_csv = data.map(p => p.label + " ,"  + model.predict(p.features))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv")
    }
    val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean()
    val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())

    println("Gradient Boosted Trees - Mean Squared Error: "  + mse)
    println("Gradient Boosted Trees - Mean Absolute Error: " + mae)
    println("Gradient Boosted Trees - Root Mean Squared Log Error:" + rmsle)
  }
} 
Example 50
package org.sparksamples

import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object CalculateStdDeviation{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {

    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => Util.extractFeatures(r, catLen, mappings))
    }
    //data.saveAsTextFile("./output/temp.txt")
    val count_columns = data.first().size

    val x = new Array[Double](count_columns)
    // compute the standard deviation of each feature column
    for (a <- 0 until count_columns) {
      x.update(a, data.map(r => r(a)).stdev())
    }
    for( a <- 0 to (count_columns -1) ){
      println(a  + " : " + x(a))
    }

    //val data_1_std_dev = data.map(r => r(1)).stdev()
    //println(data_1_std_dev)


  }

} 
Example 51
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object LinearRegressionWithIntercept{

  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val data1 = {
      records.map(r => Util.extractFeatures(r, catLen, mappings))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept =true

    val linReg = new LinearRegressionWithSGD().setIntercept(intercept)
    linReg.optimizer.setNumIterations(iterations).setStepSize(step)
    val linear_model = linReg.run(data)
    print(data.first());
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }
    val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean()
    val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())

    println("Linear Model - Mean Squared Error: "  + mse)
    println("Linear Model - Mean Absolute Error: " + mae)
    println("Linear Model - Root Mean Squared Log Error:" + rmsle)

  }

} 
Example 52
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object LinearRegression{
  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept =true

    //LinearRegressionWithSGD.tr
    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD")

  }

} 
Example 53
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer


object LinearRegressionWithLog{



  def main(args: Array[String]) {

    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2

    println(numData.toString())
    records.cache()
    print("Mapping of first categorical feature column: " +  Util.get_mapping(records, 2))
    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m =  Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings)))
    }
    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    //val step = 0.2
    val step = 0.025
    val intercept =true

    //LinearRegressionWithSGD.tr
    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)
    val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features))))
    val true_vs_predicted_csv = data.map(p => p.label + " ,"  + linear_model.predict(p.features))
    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = false
    if (save){
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD Log")

  }

} 
Example 54
package org.sparksamples

import org.apache.spark.mllib.regression.{IsotonicRegression, LabeledPoint}
import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object IsotonicRegressionApp{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val sc = Util.sc

    // read the raw CSV data and split each line into an array of fields
    val rawData = sc.textFile("../data/hour_noheader_1000.csv")
    val numData = rawData.count()
    val records = rawData.map(line => line.split(","))
    records.cache()

    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen


    val data = {
      records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    }
    val parsedData = records.map { r =>
      (Util.extractLabel(r), Util.extractSumFeature(r, catLen, mappings), 1.0)
    }

    val iterations = 10
    val step = 0.1
    val intercept =false

    val x = new IsotonicRegression().setIsotonic(false)
    val model = x.run(parsedData)

    val parsedData1: RDD[Double] = parsedData.map(r => r._2)
    //val model = GradientBoostedTrees.train(data, boostingStrategy)
    val true_vs_predicted = parsedData.map(p => (p._1, model.predict(p._2)))

    val save = true
    if(save){
      val true_vs_predicted_csv = parsedData.map(p => ( p._1+ "," + model.predict(p._2)))
      val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
      val date = format.format(new java.util.Date())
      true_vs_predicted_csv.saveAsTextFile("./output/isotonic_regression_" + date + ".csv")
    }
    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for(i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean()
    val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean())

    Util.calculatePrintMetrics(true_vs_predicted, "Isotonic Regression")

  }
} 
Example 55
Source File: CommandUtils.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils


  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    //       terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
} 
Example 56
Source File: GroupedCountEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
} 
Example 57
Source File: GroupedSumEvaluator.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Per-key running statistics (count, mean, variance) of merged values

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
                     (countEstimate * countEstimate * meanVar) +
                     (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
} 
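In the partial case above, each key's total is estimated as meanEstimate * countEstimate, and the uncertainty is propagated with the variance formula for a product of independent estimates, Var(X*Y) = E[X]^2 Var(Y) + E[Y]^2 Var(X) + Var(X) Var(Y). A small numeric check of that step, with made-up values:

// Made-up numbers, just to make the variance-propagation step above concrete.
val meanEstimate = 10.0
val meanVar = 0.5
val countEstimate = 200.0
val countVar = 40.0

val sumEstimate = meanEstimate * countEstimate             // 2000.0
val sumVar = (meanEstimate * meanEstimate * countVar) +    // 4000
             (countEstimate * countEstimate * meanVar) +   // 20000
             (meanVar * countVar)                          // 20  => 24020 total
val sumStdev = math.sqrt(sumVar)                           // ≈ 154.98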
Example 58
Source File: GroupedCountEvaluator.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T,Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
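As a worked example of the extrapolation above (values made up): with half the outputs merged (p = 0.5) and a partial count of 10 for some key, the estimated total is (10 + 1 - 0.5) / 0.5 = 21 and the variance is (10 + 1) * (1 - 0.5) / 0.25 = 22, so the standard deviation is about 4.69. At 95% confidence the normal quantile is about 1.96, giving an interval of roughly 21 ± 9.2, i.e. about [11.8, 30.2].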
Example 59
Source File: GroupedMeanEvaluator.scala    From SparkCore   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter


private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter]   // Per-key running statistics (count, mean, variance) of merged values

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 60
Source File: Cache.scala    From sangria   with Apache License 2.0 5 votes vote down vote up
package sangria.util

import scala.collection.{Map, Set}

trait Cache[Key, Value] {
  def size: Int

  def contains(key: Key): Boolean
  def apply(key: Key): Value
  def get(key: Key): Option[Value]
  def getOrElse(key: Key, default: => Value): Value
  def update(key: Key, value: Value): Unit
  def remove(key: Key): Unit
  def clear(): Unit

  // NOTE: `getOrElseUpdate` allows a race condition between value retrieval and cache update.
  // This is a deliberate decision to avoid any synchronization: recomputing a value several times is preferred to locking.
  def getOrElseUpdate(key: Key, fn: => Value): Value
  def find(fn: (Key, Value) => Boolean): Option[(Key, Value)]
  def mapToSet[R](fn: (Key, Value) => R): Set[R]
  def mapValues[R](fn: Value => R): Map[Key, R]
  def keyExists(fn: Key => Boolean): Boolean
  def forEachValue(fn: Value => Unit): Unit
  def removeKeys(fn: Key => Boolean): Unit
}

object Cache {
  def empty[Key, Value]: Cache[Key, Value] = emptyConcurrentHashMap[Key, Value]

  def emptyTrieMap[Key, Value] = new TrieMapCache[Key, Value]
  def emptyConcurrentHashMap[Key, Value] = new ConcurrentHashMapCache[Key, Value]
  
  def apply[Key, Value](elems: (Key, Value)*): Cache[Key, Value] = {
    val c = empty[Key, Value]
    elems.foreach { case (key, value) => c(key) = value }
    c
  }
}
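A hedged usage sketch of the trait above; the cached computation is made up. getOrElseUpdate gives simple memoization, with the caveat from the NOTE that the body may run more than once under contention:

// Illustrative memoization with the Cache above; the "expensive" work is made up.
val lengths = Cache.empty[String, Int]

def lengthOf(s: String): Int =
  lengths.getOrElseUpdate(s, {
    // Pretend this is expensive; per the NOTE above it may be recomputed
    // if two threads miss the cache at the same time.
    s.length
  })

lengthOf("sangria")   // computes and stores 7
lengthOf("sangria")   // normally served from the cache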