scala.collection.Map Scala Examples
The following examples show how to use scala.collection.Map.
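Before the project examples, a quick orientation: scala.collection.Map is the read-only supertype shared by both the mutable and the immutable map hierarchies, so importing it lets a method accept either kind of map when it only needs to read entries. The short sketch below is illustrative only; the method and value names are not taken from any of the projects that follow.

import scala.collection.Map

// Accepts any map, mutable or immutable, because scala.collection.Map
// sits above both hierarchies and exposes only read operations.
def describe(counts: Map[String, Int]): String =
  counts.map { case (k, v) => s"$k=$v" }.mkString(", ")

val immutableCounts = scala.collection.immutable.Map("a" -> 1, "b" -> 2)
val mutableCounts   = scala.collection.mutable.Map("c" -> 3)

describe(immutableCounts) // "a=1, b=2"
describe(mutableCounts)   // "c=3"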
Example 1
Source File: CommandUtils.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 2
Source File: GroupedCountEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 3
Source File: WordpieceTokenized.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}

import scala.collection.Map

object WordpieceTokenized extends Annotated[WordpieceTokenizedSentence] {

  override def annotatorType: String = AnnotatorType.WORDPIECE

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceTokenizedSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .toArray

    SentenceSplit.unpack(annotations).map(sentence => {
      tokens.filter(token =>
        token.begin >= sentence.start & token.end <= sentence.end
      ).map(token =>
        TokenPiece(wordpiece = token.result,
          token = token.metadata("token"),
          pieceId = token.metadata("pieceId").toInt,
          isWordStart = token.metadata("isWordStart").toBoolean,
          begin = token.begin,
          end = token.end
        )
      )
    }).filter(_.nonEmpty).map(tokens => WordpieceTokenizedSentence(tokens))
  }

  override def pack(sentences: Seq[WordpieceTokenizedSentence]): Seq[Annotation] = {
    var sentenceIndex = 0
    sentences.flatMap{sentence =>
      sentenceIndex += 1
      sentence.tokens.map{token =>
        Annotation(annotatorType, token.begin, token.end, token.wordpiece,
          Map("sentence" -> sentenceIndex.toString,
            "isWordStart" -> token.isWordStart.toString,
            "pieceId" -> token.pieceId.toString,
            "token" -> token.token)
        )
      }}
  }
}

case class WordpieceTokenizedSentence(tokens: Array[TokenPiece])

case class TokenPiece(wordpiece: String, token: String, pieceId: Int, isWordStart: Boolean, begin: Int, end: Int)
Example 4
Source File: NerConverter.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.ner

import com.johnsnowlabs.nlp.AnnotatorType.{CHUNK, DOCUMENT, NAMED_ENTITY, TOKEN}
import com.johnsnowlabs.nlp.annotators.common.NerTagged
import com.johnsnowlabs.nlp.{Annotation, AnnotatorModel, AnnotatorType, ParamsAndFeaturesReadable}
import org.apache.spark.ml.param.{BooleanParam, StringArrayParam}
import org.apache.spark.ml.util.Identifiable

import scala.collection.Map

  def setPreservePosition(value: Boolean): this.type = set(preservePosition, value)

  setDefault(
    preservePosition -> true
  )

  override def annotate(annotations: Seq[Annotation]): Seq[Annotation] = {
    val sentences = NerTagged.unpack(annotations)
    val docs = annotations.filter(a => a.annotatorType == AnnotatorType.DOCUMENT)
    val entities = sentences.zip(docs.zipWithIndex).flatMap { case (sentence, doc) =>
      NerTagsEncoding.fromIOB(sentence, doc._1, sentenceIndex=doc._2, $(preservePosition))
    }

    entities.filter(entity => get(whiteList).forall(validEntity => validEntity.contains(entity.entity))).
      zipWithIndex.map{case (entity, idx) =>
        Annotation(
          outputAnnotatorType,
          entity.start,
          entity.end,
          entity.text,
          Map("entity" -> entity.entity, "sentence" -> entity.sentenceId, "chunk" -> idx.toString)
        )
      }
  }

}

object NerConverter extends ParamsAndFeaturesReadable[NerConverter]
Example 5
Source File: DeltaTableOperations.scala From delta with Apache License 2.0 | 5 votes |
package io.delta.tables.execution

import scala.collection.Map

import org.apache.spark.sql.delta.{DeltaErrors, DeltaHistoryManager, DeltaLog, PreprocessTableUpdate}
import org.apache.spark.sql.delta.commands.{DeleteCommand, DeltaGenerateCommand, VacuumCommand}
import org.apache.spark.sql.delta.util.AnalysisHelper
import io.delta.tables.DeltaTable

import org.apache.spark.sql.{functions, Column, DataFrame, Dataset}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.plans.logical._

trait DeltaTableOperations extends AnalysisHelper { self: DeltaTable =>

  protected def executeDelete(condition: Option[Expression]): Unit = improveUnsupportedOpError {
    val delete = DeleteFromTable(self.toDF.queryExecution.analyzed, condition)
    toDataset(sparkSession, delete)
  }

  protected def executeHistory(deltaLog: DeltaLog, limit: Option[Int]): DataFrame = {
    val history = new DeltaHistoryManager(deltaLog)
    val spark = self.toDF.sparkSession
    spark.createDataFrame(history.getHistory(limit))
  }

  protected def executeGenerate(tblIdentifier: String, mode: String): Unit = {
    val tableId: TableIdentifier = sparkSession
      .sessionState
      .sqlParser
      .parseTableIdentifier(tblIdentifier)
    val generate = DeltaGenerateCommand(mode, tableId)
    generate.run(sparkSession)
  }

  protected def executeUpdate(
      set: Map[String, Column],
      condition: Option[Column]): Unit = improveUnsupportedOpError {
    val assignments = set.map { case (targetColName, column) =>
      Assignment(UnresolvedAttribute.quotedString(targetColName), column.expr)
    }.toSeq
    val update = UpdateTable(self.toDF.queryExecution.analyzed, assignments, condition.map(_.expr))
    toDataset(sparkSession, update)
  }

  protected def executeVacuum(
      deltaLog: DeltaLog,
      retentionHours: Option[Double]): DataFrame = {
    VacuumCommand.gc(sparkSession, deltaLog, false, retentionHours)
    sparkSession.emptyDataFrame
  }

  protected def toStrColumnMap(map: Map[String, String]): Map[String, Column] = {
    map.toSeq.map { case (k, v) => k -> functions.expr(v) }.toMap
  }

  protected def sparkSession = self.toDF.sparkSession
}
Example 6
Source File: ElasticsearchConnector.scala From hail with MIT License | 5 votes |
package is.hail.io

import org.apache.spark
import org.elasticsearch.spark.sql._

import scala.collection.JavaConverters._
import scala.collection.Map

object ElasticsearchConnector {

  def export(
    df: spark.sql.DataFrame, host: String, port: Int,
    index: String, indexType: String, blockSize: Int,
    config: java.util.HashMap[String, String], verbose: Boolean) {
    export(df, host, port, index, indexType, blockSize,
      Option(config).map(_.asScala.toMap).getOrElse(Map.empty[String, String]), verbose)
  }

  def export(df: spark.sql.DataFrame,
    host: String = "localhost", port: Int = 9200,
    index: String, indexType: String,
    blockSize: Int = 1000,
    config: Map[String, String], verbose: Boolean = true) {

    // config docs: https://www.elastic.co/guide/en/elasticsearch/hadoop/master/configuration.html
    val defaultConfig = Map(
      "es.nodes" -> host,
      "es.port" -> port.toString,
      "es.batch.size.entries" -> blockSize.toString,
      "es.index.auto.create" -> "true")

    val mergedConfig = if (config == null)
      defaultConfig
    else
      defaultConfig ++ config

    if (verbose)
      println(s"Config ${ mergedConfig }")

    df.saveToEs(s"${ index }/${ indexType }", mergedConfig)
  }
}
Example 7
Source File: CommandUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 8
Source File: GroupedCountEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 9
Source File: JacksonGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.json

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {

  def apply(rowSchema: StructType, gen: JsonGenerator)(row: Row): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v: String) => gen.writeString(v)
      case (TimestampType, v: java.sql.Timestamp) => gen.writeString(v.toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: java.math.BigDecimal) => gen.writeNumber(v)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v) => gen.writeString(v.toString)
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, udt.serialize(v))

      case (ArrayType(ty, _), v: Seq[_]) =>
        gen.writeStartArray()
        v.foreach(valWriter(ty, _))
        gen.writeEndArray()

      case (MapType(kv, vv, _), v: Map[_, _]) =>
        gen.writeStartObject()
        v.foreach { p =>
          gen.writeFieldName(p._1.toString)
          valWriter(vv, p._2)
        }
        gen.writeEndObject()

      case (StructType(ty), v: Row) =>
        gen.writeStartObject()
        ty.zip(v.toSeq).foreach {
          case (_, null) =>
          case (field, v) =>
            gen.writeFieldName(field.name)
            valWriter(field.dataType, v)
        }
        gen.writeEndObject()
    }

    valWriter(rowSchema, row)
  }
}
Example 10
Source File: CommandUtils.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}
import java.lang.System._

import scala.collection.JavaConversions._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 11
Source File: GroupedSumEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
}
Example 12
Source File: GroupedCountEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 13
Source File: GroupedMeanEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 14
Source File: JsonUtils.scala From OUTDATED_ledger-wallet-android with MIT License | 5 votes |
package co.ledger.wallet.core.utils

import org.json.{JSONArray, JSONObject}

import scala.collection.Map

trait JsonUtils {

  implicit def Map2JsonObject[T](map: Map[String, T]): JSONObject = {
    val json = new JSONObject()
    map foreach {case (key, value) =>
      value match {
        case string: String => json.put(key, string)
        case double: Double => json.put(key, double)
        case float: Float => json.put(key, float)
        case boolean: Boolean => json.put(key, boolean)
        case jsonObject: JSONObject => json.put(key, jsonObject)
        case jsonArray: JSONArray => json.put(key, jsonArray)
        case map: Map[_, _] => json.put(key, Map2JsonObject(map.asInstanceOf[Map[String, _]]))
        case array: Array[_] => json.put(key, Array2JsonArray(array))
        case _ => json.put(key, value.toString)
      }
    }
    json
  }

  implicit def Array2JsonArray[T](array: Array[T]): JSONArray = {
    val json = new JSONArray()
    array foreach {
      case string: String => json.put(string)
      case double: Double => json.put(double)
      case float: Float => json.put(float)
      case boolean: Boolean => json.put(boolean)
      case jsonObject: JSONObject => json.put(jsonObject)
      case jsonArray: JSONArray => json.put(jsonArray)
      case map: Map[_, _] => json.put(Map2JsonObject(map.asInstanceOf[Map[String, _]]))
      case array: Array[AnyRef] => json.put(Array2JsonArray(array))
      case value => json.put(value.toString)
    }
    json
  }

  implicit class JsonStringContext(val c: StringContext) {
    def json(args: Any*): JSONObject = {
      val strings = c.parts.iterator
      val arguments = args.iterator
      val string = new StringBuffer(strings.next())
      while (strings.hasNext) {
        arguments.next() match {
          case charSequence: CharSequence => string.append("\"" + charSequence.toString + "\"")
          case arg => string.append(arg.toString)
        }
        string.append(strings.next())
      }
      new JSONObject(string.toString)
    }
  }
}

object JsonUtils extends JsonUtils
Example 15
Source File: TaskResult.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 16
Source File: GroupedSumEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high)
      }
      result
    }
  }
}
Example 17
Source File: SentenceWithEmbeddings.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.annotators.common

import com.johnsnowlabs.nlp.{Annotation, AnnotatorType}

import scala.collection.Map

case class WordpieceEmbeddingsSentence
(
  tokens: Array[TokenPieceEmbeddings],
  sentenceId: Int
)

case class TokenPieceEmbeddings(wordpiece: String, token: String, pieceId: Int,
                                isWordStart: Boolean,
                                isOOV: Boolean,
                                embeddings: Array[Float],
                                begin: Int, end: Int)

object TokenPieceEmbeddings {
  def apply(piece: TokenPiece, embeddings: Array[Float]): TokenPieceEmbeddings = {
    TokenPieceEmbeddings(
      wordpiece = piece.wordpiece,
      token = piece.token,
      pieceId = piece.pieceId,
      isWordStart = piece.isWordStart,
      isOOV = false, // FIXME: I think BERT wont have OOV, this "constructor" is called from TensorFlowBert
      embeddings = embeddings,
      begin = piece.begin,
      end = piece.end)
  }

  def apply(wordpiece: String, token: String, pieceId: Int, isWordStart: Boolean,
            embeddings: Option[Array[Float]], zeroArray: Array[Float], begin: Int, end: Int): TokenPieceEmbeddings = {
    val vector = embeddings.getOrElse(zeroArray)
    val oov = embeddings match { case Some(_) => false; case default => true; }
    TokenPieceEmbeddings(
      wordpiece = wordpiece,
      token = token,
      pieceId = pieceId,
      isWordStart = isWordStart,
      isOOV = oov,
      embeddings = vector,
      begin = begin,
      end = end)
  }
}

object WordpieceEmbeddingsSentence extends Annotated[WordpieceEmbeddingsSentence] {
  override def annotatorType: String = AnnotatorType.WORD_EMBEDDINGS

  override def unpack(annotations: Seq[Annotation]): Seq[WordpieceEmbeddingsSentence] = {
    val tokens = annotations
      .filter(_.annotatorType == annotatorType)
      .groupBy(_.metadata("sentence").toInt)

    tokens.map{case (idx: Int, sentenceTokens: Seq[Annotation]) =>
      val tokensWithSentence = sentenceTokens.map { token =>
        new TokenPieceEmbeddings(
          wordpiece = token.result,
          token = token.metadata("token"),
          pieceId = token.metadata("pieceId").toInt,
          isWordStart = token.metadata("isWordStart").toBoolean,
          isOOV = token.metadata.getOrElse("isOOV", "false").toBoolean,
          embeddings = token.embeddings,
          begin = token.begin,
          end = token.end
        )
      }.toArray

      WordpieceEmbeddingsSentence(tokensWithSentence, idx)
    }.toSeq.sortBy(_.sentenceId)
  }

  override def pack(sentences: Seq[WordpieceEmbeddingsSentence]): Seq[Annotation] = {
    sentences.flatMap{sentence =>
      var isFirstToken = true
      sentence.tokens.map{ token =>
        // Store embeddings for token
        val embeddings = token.embeddings
        isFirstToken = false
        Annotation(annotatorType, token.begin, token.end, token.token,
          Map("sentence" -> sentence.sentenceId.toString,
            "token" -> token.token,
            "pieceId" -> token.pieceId.toString,
            "isWordStart" -> token.isWordStart.toString,
            "isOOV" -> token.isOOV.toString
          ),
          embeddings
        )
      }
    }
  }
}
Example 18
Source File: GroupedMeanEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(entry.getKey) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 19
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

}
Example 20
Source File: CommandUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 21
Source File: GroupedCountEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 22
Source File: JacksonGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils}

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {

  def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v) => gen.writeString(v.toString)
      case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString)
      // For UDT values, they should be in the SQL type's corresponding value type.
      // We should not see values in the user-defined class at here.
      // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is
      // an ArrayData at here, instead of a Vector.
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v)

      case (ArrayType(ty, _), v: ArrayData) =>
        gen.writeStartArray()
        v.foreach(ty, (_, value) => valWriter(ty, value))
        gen.writeEndArray()

      case (MapType(kt, vt, _), v: MapData) =>
        gen.writeStartObject()
        v.foreach(kt, vt, { (k, v) =>
          gen.writeFieldName(k.toString)
          valWriter(vt, v)
        })
        gen.writeEndObject()

      case (StructType(ty), v: InternalRow) =>
        gen.writeStartObject()
        var i = 0
        while (i < ty.length) {
          val field = ty(i)
          val value = v.get(i, field.dataType)
          if (value != null) {
            gen.writeFieldName(field.name)
            valWriter(field.dataType, value)
          }
          i += 1
        }
        gen.writeEndObject()

      case (dt, v) =>
        sys.error(
          s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.")
    }

    valWriter(rowSchema, row)
  }
}
Example 23
Source File: CommandUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.Logging
import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 24
Source File: TaskResult.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

  def value(): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value.
      val resultSer = SparkEnv.get.serializer.newInstance()
      valueObject = resultSer.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 25
Source File: GroupedSumEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val sum = entry.getValue.sum
        result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val meanEstimate = counter.mean
        val meanVar = counter.sampleVariance / counter.count
        val countEstimate = (counter.count + 1 - p) / p
        val countVar = (counter.count + 1) * (1 - p) / (p * p)
        val sumEstimate = meanEstimate * countEstimate
        val sumVar = (meanEstimate * meanEstimate * countVar) +
          (countEstimate * countEstimate * meanVar) +
          (meanVar * countVar)
        val sumStdev = math.sqrt(sumVar)
        val confFactor = studentTCacher.get(counter.count)
        val low = sumEstimate - confFactor * sumStdev
        val high = sumEstimate + confFactor * sumStdev
        result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 26
Source File: GroupedCountEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 27
Source File: GroupedMeanEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap

import org.apache.spark.util.StatCounter

private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new JHashMap[T, StatCounter] // Sum of counts for each key

  override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) {
    outputsMerged += 1
    val iter = taskResult.entrySet.iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      val old = sums.get(entry.getKey)
      if (old != null) {
        old.merge(entry.getValue)
      } else {
        sums.put(entry.getKey, entry.getValue)
      }
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val mean = entry.getValue.mean
        result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val studentTCacher = new StudentTCacher(confidence)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      val iter = sums.entrySet.iterator()
      while (iter.hasNext) {
        val entry = iter.next()
        val counter = entry.getValue
        val mean = counter.mean
        val stdev = math.sqrt(counter.sampleVariance / counter.count)
        val confFactor = studentTCacher.get(counter.count)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 28
Source File: Settings.scala From neuroflow with Apache License 2.0 | 5 votes |
package neuroflow.core

import neuroflow.core.Network.LearningRate

import scala.collection.{Map, Set}

case class Settings[V]
  (verbose           :  Boolean                      =  true,
   learningRate      :  LearningRate[V]              =  { case (i, α) => α }: LearningRate[V],
   updateRule        :  Update[V]                    =  Vanilla[V](),
   precision         :  Double                       =  1E-3,
   iterations        :  Int                          =  Int.MaxValue,
   prettyPrint       :  Boolean                      =  false,
   batchSize         :  Option[Int]                  =  None,
   gcThreshold       :  Option[Long]                 =  None,
   lossFuncOutput    :  Option[LossFuncOutput]       =  None,
   waypoint          :  Option[Waypoint[V]]          =  None,
   approximation     :  Option[Approximation[V]]     =  None,
   regularization    :  Option[Regularization]       =  None,
   partitions        :  Option[Set[Int]]             =  None,
   specifics         :  Option[Map[String, V]]       =  None) extends Serializable
Example 29
Source File: PSVector.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.models

import java.util.concurrent.Future

import scala.collection.Map

import org.apache.spark.SparkException

import com.tencent.angel.ml.math2.vector.Vector
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.psf.get.base.{GetFunc, GetResult}
import com.tencent.angel.ml.matrix.psf.update.base.{UpdateFunc, VoidResult}
import com.tencent.angel.sona.context.PSContext

  def longKeySparse(dim: Long,
                    maxRange: Long,
                    capacity: Int = 20,
                    rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
                    additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dim, capacity, maxRange, rowType, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int, range: Long, rowType: RowType,
             additionalConfiguration: Map[String, String]): PSVector = {
    PSContext.instance().createVector(dimension, rowType, capacity, range, additionalConfiguration)
  }

  def sparse(dimension: Long, capacity: Int = 20,
             rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
             additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dimension, capacity, dimension, rowType, additionalConfiguration)
  }

}
Example 30
Source File: ModelWithDescriptor.scala From kafka-with-akka-streams-kafka-streams-tutorial with Apache License 2.0 | 5 votes |
package com.lightbend.scala.modelServer.model

import java.io.{DataInputStream, DataOutputStream}

import com.lightbend.model.modeldescriptor.ModelDescriptor

import scala.collection.Map

import com.lightbend.scala.modelServer.model.PMML.PMMLModel
import com.lightbend.scala.modelServer.model.tensorflow.TensorFlowModel

import scala.util.Try

case class ModelWithDescriptor(model: Model, descriptor: ModelToServe){}

object ModelWithDescriptor {

  private val factories = Map(
    ModelDescriptor.ModelType.PMML.name -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.name -> TensorFlowModel
  )

  private val factoriesInt = Map(
    ModelDescriptor.ModelType.PMML.index -> PMMLModel,
    ModelDescriptor.ModelType.TENSORFLOW.index -> TensorFlowModel
  )

  def fromModelToServe(descriptor : ModelToServe): Try[ModelWithDescriptor] = Try{
    println(s"New model - $descriptor")
    factories.get(descriptor.modelType.name) match {
      case Some(factory) => ModelWithDescriptor(factory.create(descriptor),descriptor)
      case _ => throw new Throwable("Undefined model type")
    }
  }

  def readModel(input : DataInputStream) : Option[Model] = {
    input.readLong.toInt match{
      case length if length > 0 =>
        val `type` = input.readLong.toInt
        val bytes = new Array[Byte](length)
        input.read(bytes)
        factoriesInt.get(`type`) match {
          case Some(factory) => try {
            Some(factory.restore(bytes))
          } catch {
            case t: Throwable =>
              System.out.println("Error Deserializing model")
              t.printStackTrace()
              None
          }
          case _ => None
        }
      case _ => None
    }
  }

  def writeModel(output : DataOutputStream, model: Model) : Unit = {
    if(model == null)
      output.writeLong(0l)
    else {
      try {
        val bytes = model.toBytes
        output.writeLong(bytes.length)
        output.writeLong(model.getType)
        output.write(bytes)
      } catch {
        case t: Throwable =>
          System.out.println("Error Serializing model")
          t.printStackTrace()
      }
    }
  }
}
Example 31
Source File: SpecHelper.scala From peregrine with Apache License 2.0 | 5 votes |
package io.peregrine

import com.twitter.finagle.http.{Request => FinagleRequest, Response => FinagleResponse}
import com.twitter.util.{Await, Future}
import org.jboss.netty.handler.codec.http.HttpMethod
import org.jboss.netty.util.CharsetUtil.UTF_8

import scala.collection.Map

class MockResponse(val originalResponse: FinagleResponse) {

  def status                  = originalResponse.getStatus()
  def code                    = originalResponse.getStatus().getCode
  def body                    = originalResponse.getContent().toString(UTF_8)
  def getHeader(name: String) = originalResponse.headers().get(name)
  def getHeaders              = originalResponse.headerMap
}

trait SpecHelper {

  def response = new MockResponse(Await.result(lastResponse))

  var lastResponse: Future[FinagleResponse] = null

  def server: PeregrineServer

  def get(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.GET,path,params,headers)
  }

  def post(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.POST,path,params,headers,body)
  }

  def put(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.PUT,path,params,headers,body)
  }

  def delete(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.DELETE,path,params,headers)
  }

  def head(path:String,params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.HEAD,path,params,headers)
  }

  def patch(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map()) {
    executeRequest(HttpMethod.PATCH,path,params,headers)
  }

  def options(path:String, params:Map[String,String]=Map(), headers:Map[String,String]=Map(), body:AnyRef=null) {
    executeRequest(HttpMethod.OPTIONS,path,params,headers,body)
  }

  def send(request: FinagleRequest) {
    executeRequest(request)
  }

  private def executeRequest(
    method: HttpMethod,
    path: String,
    params: Map[String, String] = Map(),
    headers: Map[String,String] = Map(),
    body: AnyRef = null
  ) {
    val app = MockApp(server)
    val result: MockResult = app.execute(method = method, path = path, params = params, headers = headers, body = body)
    lastResponse = result.response
  }

  private def executeRequest(request: FinagleRequest) {
    val app = MockApp(server)
    val result: MockResult = app.execute(request)
    lastResponse = result.response
  }
}
Example 32
Source File: GenerateDataFeaturesFile.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples

import org.apache.spark.rdd.RDD

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object GenerateDataFeaturesFile{

  def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = {
    return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap()
  }

  def main(args: Array[String]) {
    val sc = Util.sc
    // we take the raw data in CSV format and convert it into a set of records
    // of the form (user, product, price)
    val rawData = sc.textFile("../data/hour_noheader.csv")
    val numData = rawData.count()

    val records = rawData.map(line => line.split(","))
    val first = records.first()

    println(numData.toInt)
    records.cache()
    print("Mapping of first categorical feature column: " + get_mapping(records, 2))
    print("Mapping of second categorical feature column: " + get_mapping(records, 3))

    var list = new ListBuffer[Map[String, Long]]()
    for( i <- 2 to 9){
      val m = get_mapping(records, i)
      list += m
    }
    val mappings = list.toList
    var catLen = 0
    mappings.foreach( m => (catLen +=m.size))

    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:"+ catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = {
      records.map(r => Util.extractLabel(r) + "," + Util.extractSumFeature(r, catLen, mappings))
    }

    val data_collection = data.collect()
    val d_iterator = data_collection.iterator
    while(d_iterator.hasNext) {
      val x = d_iterator.next
      println(x)
    }
    val first_point = data.first()

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    data.saveAsTextFile("./output/x_features" + date + ".csv")

    sc.stop()
  }
}
Example 33
Source File: GroupedCountEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 34
Source File: TopElementsAggregator.scala From salt-core with Apache License 2.0 | 5 votes |
package software.uncharted.salt.core.analytic.collection

import software.uncharted.salt.core.analytic.Aggregator

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.{Map => MutableMap}
import scala.collection.mutable.PriorityQueue
import scala.reflect.ClassTag

class TopElementsAggregator[ET: ClassTag](elementLimit: Int)
  extends Aggregator[Seq[ET], Map[ET, Int], List[(ET, Int)]] {

  def default(): Map[ET, Int] = {
    Map[ET, Int]()
  }

  override def add(current: Map[ET, Int], next: Option[Seq[ET]]): Map[ET, Int] = {
    if (next.isDefined) {
      // If our current map is mutable, add new data in directly.
      // If not, convert to a mutable map, and then add data in
      val sum = current match {
        case hm: MutableMap[ET, Int] => hm
        case _ => {
          // The current value isn't itself a mutable hashmap yet; convert to one.
          val hm = new HashMap[ET, Int]()
          hm ++= current
          hm
        }
      }
      next.get.foreach(t => sum.put(t, sum.getOrElse(t, 0) + 1))
      sum
    } else {
      current
    }
  }

  override def merge(left: Map[ET, Int], right: Map[ET, Int]): Map[ET, Int] = {
    // If either input map is mutable, merge the other into it.
    // If neither is, convert one to mutable, and add the other into it.
    val (to, from) = left match {
      case hm: MutableMap[ET, Int] => (hm, right)
      case _ =>
        right match {
          case hm: MutableMap[ET, Int] => (hm, left)
          case _ =>
            val hm = new HashMap[ET, Int]()
            hm ++= left
            (hm, right)
        }
    }
    from.foreach(t => {
      to.put(t._1, to.getOrElse(t._1, 0) + t._2)
    })
    to
  }

  override def finish(intermediate: Map[ET, Int]): List[(ET, Int)] = {
    val x = new PriorityQueue[(ET, Int)]()(Ordering.by( a => a._2 ))
    intermediate.foreach(t => {
      x.enqueue(t)
    })
    var result = new ListBuffer[(ET, Int)]
    for (i <- 0 until Math.min(elementLimit, x.size)) {
      result.append(x.dequeue)
    }
    result.toList
  }
}
Example 35
Source File: TypeRewriter.scala From Converter with GNU General Public License v3.0 | 5 votes |
package org.scalablytyped.converter.internal
package ts
package transforms

import scala.collection.Map

class TypeRewriter(base: TsTree) extends TreeTransformation[Map[TsType, TsType]] {
  override def leaveTsType(replacements: Map[TsType, TsType])(x: TsType): TsType =
    replacements.getOrElse(x, x)

  override def withTree(t: Map[TsType, TsType], tree: TsTree): Map[TsType, TsType] =
    if (tree === base) t
    else
      tree match {
        case HasTParams(tparams) =>
          t.filterKeys {
            case TsTypeRef(_, TsQIdent(IArray.exactlyOne(one: TsIdentSimple)), _)
                if tparams.exists(_.name === one) =>
              false
            case _ => true
          }
        case _ => t
      }
}
Example 36
Source File: PairSyntax.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.syntax

import com.danielwestheide.kontextfrei.DCollectionOps
import org.apache.spark.Partitioner

import scala.collection.Map
import scala.reflect.ClassTag

class PairSyntax[DCollection[_], A: ClassTag, B: ClassTag](
    val self: DCollectionOps[DCollection],
    val coll: DCollection[(A, B)]) {

  final def keys: DCollection[A] = self.keys(coll)

  final def values: DCollection[B] = self.values(coll)

  final def cogroup[C: ClassTag](other: DCollection[(A, C)])
    : DCollection[(A, (Iterable[B], Iterable[C]))] =
    self.cogroup(coll)(other)

  final def leftOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (B, Option[C]))] =
    self.leftOuterJoin(coll)(other)

  final def rightOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], C))] =
    self.rightOuterJoin(coll)(other)

  final def fullOuterJoin[C: ClassTag](
      other: DCollection[(A, C)]): DCollection[(A, (Option[B], Option[C]))] =
    self.fullOuterJoin(coll)(other)

  final def mapValues[C: ClassTag](f: B => C): DCollection[(A, C)] =
    self.mapValues(coll)(f)

  final def flatMapValues[C: ClassTag](
      f: B => TraversableOnce[C]): DCollection[(A, C)] =
    self.flatMapValues(coll)(f)

  final def reduceByKey(f: (B, B) => B): DCollection[(A, B)] =
    self.reduceByKey(coll)(f)

  final def foldByKey(zeroValue: B)(f: (B, B) => B): DCollection[(A, B)] =
    self.foldByKey(coll)(zeroValue, f)

  final def aggregateByKey[C: ClassTag](zeroValue: C)(
      seqOp: (C, B) => C,
      combOp: (C, C) => C): DCollection[(A, C)] =
    self.aggregateByKey(coll)(zeroValue)(seqOp, combOp)

  final def combineByKey[C: ClassTag](
      createCombiner: B => C,
      mergeValue: (C, B) => C,
      mergeCombiners: (C, C) => C): DCollection[(A, C)] =
    self.combineByKey(coll)(createCombiner)(mergeValue, mergeCombiners)

  final def countByKey(): Map[A, Long] = self.countByKey(coll)

  final def collectAsMap(): Map[A, B] = self.collectAsMap(coll)

  final def partitionBy(partitioner: Partitioner): DCollection[(A, B)] =
    self.partitionBy(coll)(partitioner)
}
Example 37
Source File: RDDPairFunctions.scala From kontextfrei with Apache License 2.0 | 5 votes |
package com.danielwestheide.kontextfrei.rdd import com.danielwestheide.kontextfrei.DCollectionPairFunctions import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import scala.collection.Map import scala.reflect.ClassTag private[kontextfrei] trait RDDPairFunctions extends DCollectionPairFunctions[RDD] { this: RDDBase => override final def cogroup[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Iterable[B], Iterable[C]))] = withSite(x) { _.cogroup(y) } override final def values[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[B] = withSite(x) { _.values } override final def keys[A: ClassTag, B: ClassTag](x: RDD[(A, B)]): RDD[A] = withSite(x) { _.keys } override final def leftOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (B, Option[C]))] = withSite(x) { _.leftOuterJoin(y) } override final def rightOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], C))] = withSite(x) { _.rightOuterJoin(y) } override final def fullOuterJoin[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(y: RDD[(A, C)]): RDD[(A, (Option[B], Option[C]))] = withSite(x) { _.fullOuterJoin(y) } override final def mapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => C): RDD[(A, C)] = withSite(x) { _.mapValues(f) } override final def flatMapValues[A: ClassTag, B: ClassTag, C: ClassTag]( x: RDD[(A, B)])(f: B => TraversableOnce[C]): RDD[(A, C)] = withSite(x) { _.flatMapValues(f) } override final def reduceByKey[A: ClassTag, B: ClassTag](xs: RDD[(A, B)])( f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.reduceByKey(f) } override final def foldByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(zeroValue: B, f: (B, B) => B): RDD[(A, B)] = withSite(xs) { _.foldByKey(zeroValue)(f) } override final def aggregateByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(zeroValue: C)(seqOp: (C, B) => C, combOp: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.aggregateByKey(zeroValue)(seqOp, combOp) } override final def combineByKey[A: ClassTag, B: ClassTag, C: ClassTag]( xs: RDD[(A, B)])(createCombiner: B => C)( mergeValue: (C, B) => C, mergeCombiners: (C, C) => C): RDD[(A, C)] = withSite(xs) { _.combineByKey(createCombiner, mergeValue, mergeCombiners) } override final def countByKey[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, Long] = withSite(xs) { _.countByKey() } override final def collectAsMap[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)]): Map[A, B] = withSite(xs) { _.collectAsMap() } override final def partitionBy[A: ClassTag, B: ClassTag]( xs: RDD[(A, B)])(partitioner: Partitioner): RDD[(A, B)] = withSite(xs) { _.partitionBy(partitioner) } }
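On real RDDs the same two calls are the ones that leave the distributed world: countByKey and collectAsMap both hand back a scala.collection.Map on the driver. A minimal local-mode sketch, assuming Spark is on the classpath:

import org.apache.spark.sql.SparkSession
import scala.collection.Map

object RddPairSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pair-sketch").getOrCreate()
    val rdd = spark.sparkContext.parallelize(Seq("a" -> 1, "b" -> 2, "a" -> 3))

    val counts: Map[String, Long] = rdd.countByKey()   // driver-side Map, so keep keys low-cardinality
    val asMap: Map[String, Int]   = rdd.collectAsMap() // one arbitrary value kept per duplicate key

    println(counts)
    println(asMap)
    spark.stop()
  }
}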
Example 38
Source File: XmlFile.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml.util import java.io.CharArrayWriter import java.nio.charset.Charset import javax.xml.stream.XMLOutputFactory import scala.collection.Map import com.databricks.spark.xml.parsers.StaxXmlGenerator import com.sun.xml.txw2.output.IndentingXMLStreamWriter import org.apache.hadoop.io.{Text, LongWritable} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.sql.DataFrame import com.databricks.spark.xml.{XmlOptions, XmlInputFormat} private[xml] object XmlFile { val DEFAULT_INDENT = " " def withCharset( context: SparkContext, location: String, charset: String, rowTag: String): RDD[String] = { // This just checks the charset's validity early, to keep behavior Charset.forName(charset) context.hadoopConfiguration.set(XmlInputFormat.START_TAG_KEY, s"<$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.END_TAG_KEY, s"</$rowTag>") context.hadoopConfiguration.set(XmlInputFormat.ENCODING_KEY, charset) context.newAPIHadoopFile(location, classOf[XmlInputFormat], classOf[LongWritable], classOf[Text]).map { case (_, text) => new String(text.getBytes, 0, text.getLength, charset) } } def saveAsXmlFile( dataFrame: DataFrame, path: String, parameters: Map[String, String] = Map()): Unit = { val options = XmlOptions(parameters.toMap) val codecClass = CompressionCodecs.getCodecClass(options.codec) val rowSchema = dataFrame.schema val indent = XmlFile.DEFAULT_INDENT val xmlRDD = dataFrame.rdd.mapPartitions { iter => val factory = XMLOutputFactory.newInstance() val writer = new CharArrayWriter() val xmlWriter = factory.createXMLStreamWriter(writer) val indentingXmlWriter = new IndentingXMLStreamWriter(xmlWriter) indentingXmlWriter.setIndentStep(indent) new Iterator[String] { var firstRow: Boolean = true var lastRow: Boolean = true override def hasNext: Boolean = iter.hasNext || firstRow || lastRow override def next: String = { if (iter.nonEmpty) { if (firstRow) { indentingXmlWriter.writeStartElement(options.rootTag) firstRow = false } val xml = { StaxXmlGenerator( rowSchema, indentingXmlWriter, options)(iter.next()) indentingXmlWriter.flush() writer.toString } writer.reset() xml } else { if (!firstRow) { lastRow = false indentingXmlWriter.writeEndElement() indentingXmlWriter.close() writer.toString } else { // This means the iterator was initially empty. firstRow = false lastRow = false "" } } } } } codecClass match { case null => xmlRDD.saveAsTextFile(path) case codec => xmlRDD.saveAsTextFile(path, codec) } } }
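The per-partition iterator above wraps one XMLStreamWriter around many rows so the root element is opened and closed only once. A stripped-down sketch of that writer usage with the plain javax.xml.stream API follows; there is no Spark, no indenting wrapper, and the element names are hypothetical.

import java.io.StringWriter
import javax.xml.stream.XMLOutputFactory

object StaxSketch {
  def main(args: Array[String]): Unit = {
    val writer = new StringWriter()
    val xml = XMLOutputFactory.newInstance().createXMLStreamWriter(writer)

    xml.writeStartElement("ROWS")          // root element, written once
    Seq("a", "b").foreach { value =>
      xml.writeStartElement("ROW")         // one element per row
      xml.writeCharacters(value)
      xml.writeEndElement()
    }
    xml.writeEndElement()                  // close the root
    xml.flush()

    println(writer.toString) // <ROWS><ROW>a</ROW><ROW>b</ROW></ROWS>
  }
}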
Example 39
Source File: MapSerializerModule.scala From mango with Apache License 2.0 | 5 votes |
package com.kakao.shaded.jackson.module.scala.ser import com.kakao.shaded.jackson.databind.`type`.{TypeFactory, MapType, MapLikeType} import com.kakao.shaded.jackson.databind.jsontype.TypeSerializer import com.kakao.shaded.jackson.databind.ser.Serializers import com.kakao.shaded.jackson.databind.ser.std.StdDelegatingSerializer import com.kakao.shaded.jackson.databind.util.StdConverter import com.kakao.shaded.jackson.databind._ import com.kakao.shaded.jackson.module.scala.modifiers.MapTypeModifierModule import scala.collection.JavaConverters._ import scala.collection.Map private class MapConverter(inputType: JavaType, config: SerializationConfig) extends StdConverter[Map[_,_],java.util.Map[_,_]] { def convert(value: Map[_,_]): java.util.Map[_,_] = { val m = if (config.isEnabled(SerializationFeature.WRITE_NULL_MAP_VALUES)) { value } else { value.filter(_._2 != None) } m.asJava } override def getInputType(factory: TypeFactory) = inputType override def getOutputType(factory: TypeFactory) = factory.constructMapType(classOf[java.util.Map[_,_]], inputType.getKeyType, inputType.getContentType) .withTypeHandler(inputType.getTypeHandler) .withValueHandler(inputType.getValueHandler) } private object MapSerializerResolver extends Serializers.Base { val BASE = classOf[collection.Map[_,_]] override def findMapLikeSerializer(config: SerializationConfig, mapLikeType : MapLikeType, beanDesc: BeanDescription, keySerializer: JsonSerializer[AnyRef], elementTypeSerializer: TypeSerializer, elementValueSerializer: JsonSerializer[AnyRef]): JsonSerializer[_] = { val rawClass = mapLikeType.getRawClass if (!BASE.isAssignableFrom(rawClass)) null else new StdDelegatingSerializer(new MapConverter(mapLikeType, config)) } } trait MapSerializerModule extends MapTypeModifierModule { this += MapSerializerResolver }
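The converter's job is small: optionally drop None values and expose the result as a java.util.Map so Jackson's standard map serializer can take over. A self-contained sketch of just that conversion step, using plain JavaConverters and no shaded Jackson types:

import scala.collection.JavaConverters._
import scala.collection.Map

object MapConvertSketch {
  // Mirrors the convert method above: keep None entries only when "write null map values" is on.
  def toJava(value: Map[String, Any], writeNulls: Boolean): java.util.Map[String, Any] = {
    val filtered = if (writeNulls) value else value.filter(_._2 != None)
    filtered.asJava
  }

  def main(args: Array[String]): Unit =
    println(toJava(Map("a" -> 1, "b" -> None), writeNulls = false)) // {a=1}
}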
Example 40
Source File: TruckAndTrafficJoinBolt.scala From trucking-iot with Apache License 2.0 | 5 votes |
package com.orendainx.trucking.storm.bolts import java.util import com.orendainx.trucking.commons.models.{EnrichedTruckAndTrafficData, EnrichedTruckData, TrafficData} import com.typesafe.scalalogging.Logger import org.apache.storm.task.{OutputCollector, TopologyContext} import org.apache.storm.topology.OutputFieldsDeclarer import org.apache.storm.topology.base.BaseWindowedBolt import org.apache.storm.tuple.{Fields, Values} import org.apache.storm.windowing.TupleWindow import scala.collection.JavaConverters._ import scala.collection.mutable.ListBuffer import scala.collection.{Map, mutable} import scala.language.implicitConversions private def processAndEmitData(truckDataPerRoute: Map[Int, ListBuffer[EnrichedTruckData]], trafficDataPerRoute: Map[Int, ListBuffer[TrafficData]]) { // For each EnrichedTruckData object, find the TrafficData object with the closest timestamp truckDataPerRoute.foreach { case (routeId, truckDataList) => trafficDataPerRoute.get(routeId) match { case None => // No traffic data for this routeId, so drop/ignore truck data case Some(trafficDataList) => truckDataList foreach { truckData => trafficDataList.sortBy(data => math.abs(data.eventTime - truckData.eventTime)).headOption match { case None => // Window didn't capture any traffic data for this truck's route case Some(trafficData) => val joinedData = EnrichedTruckAndTrafficData(truckData.eventTime, truckData.truckId, truckData.driverId, truckData.driverName, truckData.routeId, truckData.routeName, truckData.latitude, truckData.longitude, truckData.speed, truckData.eventType, truckData.foggy, truckData.rainy, truckData.windy, trafficData.congestionLevel) outputCollector.emit(new Values("EnrichedTruckAndTrafficData", joinedData)) } } } } } override def declareOutputFields(declarer: OutputFieldsDeclarer): Unit = declarer.declare(new Fields("dataType", "data")) }
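processAndEmitData relies on members of the enclosing bolt (notably outputCollector) that this excerpt does not show; the core of the method is a per-route, closest-timestamp pairing of two Maps. A self-contained sketch of that pairing with hypothetical minimal case classes:

import scala.collection.Map

object ClosestJoinSketch {
  final case class Truck(routeId: Int, eventTime: Long)
  final case class Traffic(routeId: Int, eventTime: Long, congestionLevel: Int)

  // For every truck event, pick the traffic event on the same route with the nearest timestamp.
  def join(trucks: Map[Int, Seq[Truck]], traffic: Map[Int, Seq[Traffic]]): Seq[(Truck, Traffic)] =
    trucks.toSeq.flatMap { case (routeId, truckList) =>
      traffic.get(routeId).toSeq.flatMap { trafficList =>
        truckList.flatMap { t =>
          trafficList.sortBy(d => math.abs(d.eventTime - t.eventTime)).headOption.map(best => (t, best))
        }
      }
    }

  def main(args: Array[String]): Unit = {
    val trucks  = Map(1 -> Seq(Truck(1, 100L)))
    val traffic = Map(1 -> Seq(Traffic(1, 90L, 3), Traffic(1, 300L, 5)))
    join(trucks, traffic).foreach(println) // (Truck(1,100),Traffic(1,90,3))
  }
}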
Example 41
Source File: LinearRegression.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegression{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD") } }
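The Util.get_mapping and Util.extractFeatures helpers used above are not shown on this page; the pattern is to build a Map[String, Long] from each categorical column's distinct values to indices, then one-hot encode with it. The following is an assumed, self-contained reconstruction of that idea on plain collections, not the book's actual Util code.

import scala.collection.Map

object CategoricalMappingSketch {
  // Distinct values of one column mapped to stable indices (the shape get_mapping returns).
  def getMapping(rows: Seq[Array[String]], idx: Int): Map[String, Long] =
    rows.map(_(idx)).distinct.zipWithIndex.map { case (v, i) => v -> i.toLong }.toMap

  // One-hot encode a single column value using its mapping.
  def oneHot(value: String, mapping: Map[String, Long]): Array[Double] = {
    val vec = Array.fill(mapping.size)(0.0)
    vec(mapping(value).toInt) = 1.0
    vec
  }

  def main(args: Array[String]): Unit = {
    val rows = Seq(Array("1", "spring"), Array("2", "summer"), Array("3", "spring"))
    val m = getMapping(rows, 1)
    println(m)                          // Map(spring -> 0, summer -> 1)
    println(oneHot("summer", m).toList) // List(0.0, 1.0)
  }
}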
Example 42
Source File: LinearRegressionWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithLog{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 //val step = 0.2 val step = 0.025 val intercept =true //LinearRegressionWithSGD.tr val linear_model = LinearRegressionWithSGD.train(data, iterations, step) val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features)))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = false if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressioWithSGD Log") } }
Example 43
Source File: DecisionTreeUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val splits = data_dt.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], categoricalFeaturesInfo: scala.Predef.Map[Int, Int], maxDepth :Int, maxBins: Int): Double = { val impurity = "variance" val decisionTreeModel = DecisionTree.trainRegressor(train, categoricalFeaturesInfo, impurity,maxDepth, maxBins ) val true_vs_predicted = test.map(p => (p.label, decisionTreeModel.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
Example 44
Source File: DecisionTreeCategoricalFeaturesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeCategoricalFeaturesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val save = true //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) val first = records.first() println(numData.toInt) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen println("Feature vector length for categorical features:"+ catLen) println("Feature vector length for numerical features:" + numLen) println("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data_dt = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extract_features_dt(r))) } val first_point = data_dt.first() println("Decision Tree feature vector:" + first_point.features.toString) println("Decision Tree feature vector length: " + first_point.features.size) def getCatFeatures(): scala.Predef.Map[Int, Int] = { var d = scala.Predef.Map[Int, Int]() for(a <- 2 until 10){ d += (a-2 -> (get_mapping(records, a).size + 1)) //d.put(a-2,get_mapping(records, a).size + 1) } return d } val cat_features = getCatFeatures() //dict([(i - 2, len(get_mapping(records, i)) + 1) for i in range(2,10)]) //val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val decisionTreeModel= DecisionTree.trainRegressor(data_dt, cat_features, impurity, maxDepth, maxBins) //val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo, // impurity, maxDepth, maxBins ) val preds = decisionTreeModel.predict(data_dt.map( p=> p.features)) val actual = data.map( p=> p.label) val true_vs_predicted_dt = actual.zip(preds) val true_vs_predicted_csv = data.map(p => p.label + " ," + decisionTreeModel.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) if (save){ true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_categorical_" + date + ".csv") } print("Decision Tree depth: " + decisionTreeModel.depth) print("Decision Tree number of nodes: " + decisionTreeModel.numNodes) Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree Categorical Features") } }
Example 45
Source File: DecisionTreeWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.decisiontree import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object DecisionTreeWithLog{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val save = false val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) val first = records.first() println(numData.toInt) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen println("Feature vector length for categorical features:"+ catLen) println("Feature vector length for numerical features:" + numLen) println("Total feature vector length: " + totalLen) val data_dt = { records.map(r => LabeledPoint(Math.log(Util.extractLabel(r)), Util.extract_features_dt(r))) } val first_point = data_dt.first() println("Decision Tree feature vector:" + first_point.features.toString) println("Decision Tree feature vector length: " + first_point.features.size) val categoricalFeaturesInfo = scala.Predef.Map[Int, Int]() val impurity = "variance" val maxDepth = 5 val maxBins = 32 val decisionTreeModel = DecisionTree.trainRegressor(data_dt, categoricalFeaturesInfo, impurity, maxDepth, maxBins ) val preds = decisionTreeModel.predict(data_dt.map( p=> p.features)) val preds_2 = preds.map(p=> Math.exp(p)) val actual = data_dt.map( p=> Math.exp(p.label)) val true_vs_predicted_dt = actual.zip(preds) if(save){ val true_vs_predicted_csv = data_dt.map(p => p.label + " ," + decisionTreeModel.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/decision_tree_" + date + ".csv") } print("Decision Tree depth: " + decisionTreeModel.depth) print("Decision Tree number of nodes: " + decisionTreeModel.numNodes) Util.calculatePrintMetrics(true_vs_predicted_dt, "Decision Tree With Log") Util.sc.stop() } }
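One detail worth flagging in the log-target variant above: preds_2 (the exponentiated predictions) is computed but actual.zip(preds) then pairs the exponentiated labels with the raw log-space predictions, so the metrics mix scales; this looks like a slip rather than intent, though only the original authors can confirm. A tiny Spark-free sketch of the intended round trip, with a hypothetical stand-in for the model's predict:

object LogTargetSketch {
  def main(args: Array[String]): Unit = {
    val labels    = Seq(12.0, 45.0)
    val logLabels = labels.map(math.log)   // the model is trained on log(label)
    val logPreds  = logLabels.map(_ + 0.1) // hypothetical stand-in for decisionTreeModel.predict
    // Compare in the original scale: exponentiate the predictions before zipping with the raw labels.
    val trueVsPredicted = labels.zip(logPreds.map(math.exp))
    trueVsPredicted.foreach(println)
  }
}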
Example 46
Source File: RidgeRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{LabeledPoint, RidgeRegressionWithSGD} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object RidgeRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val sc = new SparkContext("local[2]", "First Spark App") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() //print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.1 val intercept =false val rr = new RidgeRegressionWithSGD() rr.optimizer.setNumIterations(iterations) rr.optimizer.setStepSize(0.1) val rrModel = rr.run(data) val true_vs_predicted = data.map(p => (p.label, rrModel.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Ridge Regression - Mean Squared Error: " + mse) println("Ridge Regression - Mean Absolute Error: " + mae) println("Ridge Regression - Root Mean Squared Log Error:" + rmsle) } }
Example 47
Source File: JsonUtils.scala From InteractiveGraph-neo4j with BSD 2-Clause "Simplified" License | 5 votes |
package org.grapheco.server.util import com.google.gson._ import scala.collection.Map object JsonUtils { def getPrimitiveValue(value: JsonPrimitive): Any = { (value.isBoolean, value.isNumber, value.isString) match { case (true, false, false) => value.getAsBoolean; case (false, true, false) => Some(value.getAsNumber).map(num => if (num.toString.contains(".")) { num.doubleValue() } else { num.intValue() } ).get; case (false, false, true) => value.getAsString; } } val gson = new GsonBuilder() .setPrettyPrinting() .create(); def parse(json: String): JsonElement = { new JsonParser().parse(json); } def stringify(e: JsonElement): String = { gson.toJson(e); } def stringify(e: Map[String, _]): String = { gson.toJson(asJsonObject(e)); } def asJsonArray(arr: Array[_]) = { val ja = new JsonArray(); arr.foreach(x => ja.add(asJsonElement(x))); ja; } def asJsonElement(v: Any): JsonElement = { if (v.isInstanceOf[Map[_, _]]) { asJsonObject(v.asInstanceOf[Map[String, _]]); } else if (v.isInstanceOf[Array[_]]) { asJsonArray(v.asInstanceOf[Array[_]]); } else { v match { case x: String => new JsonPrimitive(x); case x: Number => new JsonPrimitive(x); case x: Boolean => new JsonPrimitive(x); } } } def asJsonObject(map: Map[String, _]) = { val jo = new JsonObject(); map.foreach(en => { jo.add(en._1, asJsonElement(en._2)); }) jo; } }
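Assuming the JsonUtils object above is on the classpath, nested scala.collection.Maps, arrays and primitives serialize with a single stringify call; the payload below is hypothetical.

import org.grapheco.server.util.JsonUtils
import scala.collection.Map

object JsonUtilsUsage {
  def main(args: Array[String]): Unit = {
    val payload: Map[String, Any] = Map(
      "name"   -> "n1",
      "weight" -> 3.5,
      "tags"   -> Array("graph", "demo"),
      "nested" -> Map[String, Any]("ok" -> true)
    )
    println(JsonUtils.stringify(payload)) // pretty-printed JSON object
  }
}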
Example 48
Source File: GradientBoostedTreesUtil.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesUtil { def getTrainTestData(): (RDD[LabeledPoint], RDD[LabeledPoint]) = { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val splits = data.randomSplit(Array(0.8, 0.2), seed = 11L) val training = splits(0).cache() val test = splits(1) return (training, test) } def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def evaluate(train: RDD[LabeledPoint],test: RDD[LabeledPoint], iterations:Int, maxDepth:Int, maxBins: Int): Double ={ var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(iterations) boostingStrategy.treeStrategy.setMaxDepth(maxDepth) boostingStrategy.treeStrategy.setMaxBins(maxBins) val model = GradientBoostedTrees.train(train, boostingStrategy) // // @classmethod // @since("1.3.0") // def trainRegressor(cls, data, categoricalFeaturesInfo, // loss="leastSquaresError", numIterations=100, learningRate=0.1, maxDepth=3, // maxBins=32): val true_vs_predicted = test.map(p => (p.label, model.predict(p.features))) val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) return rmsle } }
Example 49
Source File: GradientBoostedTreesApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gradientboosted import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.GradientBoostedTrees import org.apache.spark.mllib.tree.configuration.BoostingStrategy import org.apache.spark.rdd.RDD import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object GradientBoostedTreesApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { //val conf = new SparkConf().setMaster("local").setAppName("GradientBoostedTreesRegressionApp") val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val first_point = data.first() println("Gradient Boosted Trees Model feature vector:" + first_point.features.toString) println("Gradient Boosted Trees Model feature vector length: " + first_point.features.size) var boostingStrategy = BoostingStrategy.defaultParams("Regression") boostingStrategy.setNumIterations(3)// Note: Use more iterations in practice. boostingStrategy.treeStrategy.setMaxDepth(5) val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = data.map(p => (p.label, model.predict(p.features))) val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val save = true if(save){ val true_vs_predicted_csv = data.map(p => p.label + " ," + model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/gradient_boosted_trees_" + date + ".csv") } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Gradient Boosted Trees - Mean Squared Error: " + mse) println("Gradient Boosted Trees - Mean Absolute Error: " + mae) println("Gradient Boosted Trees - Root Mean Squared Log Error:" + rmsle) } }
Example 50
Source File: CalculateStdDeviation.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object CalculateStdDeviation{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => Util.extractFeatures(r, catLen, mappings)) } //data.saveAsTextFile("./output/temp.txt") val count_columns = data.first().size var a = 0; var x = new Array[Double](count_columns) // for loop execution with a range for( a <- 0 to (count_columns -1) ){ val stddev = data.map(r => r(a)).stdev() //println(a + ": " + ); x.update(a,stddev) } for( a <- 0 to (count_columns -1) ){ println(a + " : " + x(a)) } //val data_1_std_dev = data.map(r => r(1)).stdev() //println(data_1_std_dev) } }
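What data.map(r => r(a)).stdev() computes is a population standard deviation per column; a Spark-free sketch of the same column loop, for clarity:

object StdDevSketch {
  // Population standard deviation, matching RDD.stdev (divide by n, not n - 1).
  def stdev(xs: Seq[Double]): Double = {
    val mean = xs.sum / xs.size
    math.sqrt(xs.map(x => (x - mean) * (x - mean)).sum / xs.size)
  }

  def main(args: Array[String]): Unit = {
    val rows = Seq(Array(1.0, 10.0), Array(2.0, 20.0), Array(3.0, 30.0))
    val byColumn = rows.head.indices.map(i => stdev(rows.map(_(i))))
    byColumn.zipWithIndex.foreach { case (s, i) => println(s"$i : $s") }
  }
}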
Example 51
Source File: LinearRegressionWithIntercept.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.linearregression import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.sparksamples.Util import scala.collection.Map import scala.collection.mutable.ListBuffer object LinearRegressionWithIntercept{ def main(args: Array[String]) { val recordsArray = Util.getRecords() val records = recordsArray._1 val first = records.first() val numData = recordsArray._2 println(numData.toString()) records.cache() print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2)) var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = Util.get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen print("Feature vector length for categorical features:"+ catLen) print("Feature vector length for numerical features:" + numLen) print("Total feature vector length: " + totalLen) val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val data1 = { records.map(r => Util.extractFeatures(r, catLen, mappings)) } val first_point = data.first() println("Linear Model feature vector:" + first_point.features.toString) println("Linear Model feature vector length: " + first_point.features.size) val iterations = 10 val step = 0.025 val intercept =true val linReg = new LinearRegressionWithSGD().setIntercept(intercept) linReg.optimizer.setNumIterations(iterations).setStepSize(step) val linear_model = linReg.run(data) print(data.first()); val x = linear_model.predict(data.first().features) val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features))) val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features)) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) val save = true if (save){ true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) println("Linear Model - Mean Squared Error: " + mse) println("Linear Model - Mean Absolute Error: " + mae) println("Linear Model - Root Mean Squared Log Error:" + rmsle) } }
Example 54
Source File: IsotonicRegressionApp.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.regression.{IsotonicRegression, LabeledPoint} import org.apache.spark.rdd.RDD import scala.collection.Map import scala.collection.mutable.ListBuffer object IsotonicRegressionApp{ def get_mapping(rdd :RDD[Array[String]], idx: Int) : Map[String, Long] = { return rdd.map( fields=> fields(idx)).distinct().zipWithIndex().collectAsMap() } def main(args: Array[String]) { val sc = Util.sc // we take the raw data in CSV format and convert it into a set of records // of the form (user, product, price) val rawData = sc.textFile("../data/hour_noheader_1000.csv") val numData = rawData.count() val records = rawData.map(line => line.split(",")) records.cache() var list = new ListBuffer[Map[String, Long]]() for( i <- 2 to 9){ val m = get_mapping(records, i) list += m } val mappings = list.toList var catLen = 0 mappings.foreach( m => (catLen +=m.size)) val numLen = records.first().slice(11, 15).size val totalLen = catLen + numLen val data = { records.map(r => LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings))) } val parsedData = records.map { r => (Util.extractLabel(r), Util.extractSumFeature(r, catLen, mappings), 1.0) } val iterations = 10 val step = 0.1 val intercept =false val x = new IsotonicRegression().setIsotonic(false) val model = x.run(parsedData) val parsedData1: RDD[Double] = parsedData.map(r => r._2) //val model = GradientBoostedTrees.train(data, boostingStrategy) val true_vs_predicted = parsedData.map(p => (p._1, model.predict(p._2))) val save = true if(save){ val true_vs_predicted_csv = parsedData.map(p => ( p._1+ "," + model.predict(p._2))) val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss") val date = format.format(new java.util.Date()) true_vs_predicted_csv.saveAsTextFile("./output/isotonic_regression_" + date + ".csv") } val true_vs_predicted_take5 = true_vs_predicted.take(5) for(i <- 0 until 5) { println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i)) } val mse = true_vs_predicted.map{ case(t, p) => Util.squaredError(t, p)}.mean() val mae = true_vs_predicted.map{ case(t, p) => Util.absError(t, p)}.mean() val rmsle = Math.sqrt(true_vs_predicted.map{ case(t, p) => Util.squaredLogError(t, p)}.mean()) Util.calculatePrintMetrics(true_vs_predicted, "Isotonic Regression") } }
Example 55
Source File: CommandUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import java.io.{File, FileOutputStream, InputStream, IOException} import scala.collection.JavaConverters._ import scala.collection.Map import org.apache.spark.SecurityManager import org.apache.spark.deploy.Command import org.apache.spark.internal.Logging import org.apache.spark.launcher.WorkerCommandBuilder import org.apache.spark.util.Utils def redirectStream(in: InputStream, file: File) { val out = new FileOutputStream(file, true) // TODO: It would be nice to add a shutdown hook here that explains why the output is // terminating. Otherwise if the worker dies the executor logs will silently stop. new Thread("redirect output to " + file) { override def run() { try { Utils.copyStream(in, out, true) } catch { case e: IOException => logInfo("Redirection to " + file + " closed: " + e.getMessage) } } }.start() } }
Example 56
Source File: GroupedCountEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
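The interesting branch is the partial one: with only a fraction p of task outputs merged, each key's running sum is scaled up to estimate the final count, and CountEvaluator.bound (not shown here) attaches the confidence interval. A rough, self-contained sketch of just the scaling step:

import scala.collection.Map

object PartialCountSketch {
  // With outputsMerged of totalOutputs seen, sum / p is the natural point estimate of the final count.
  def extrapolate(sums: Map[String, Long], outputsMerged: Int, totalOutputs: Int): Map[String, Double] = {
    val p = outputsMerged.toDouble / totalOutputs
    sums.map { case (key, sum) => key -> sum / p }
  }

  def main(args: Array[String]): Unit =
    println(extrapolate(Map("a" -> 50L, "b" -> 10L), outputsMerged = 2, totalOutputs = 4)) // Map(a -> 100.0, b -> 20.0)
}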
Example 57
Source File: GroupedSumEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 58
Source File: GroupedCountEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T,Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result(key) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(key) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 59
Source File: GroupedMeanEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 60
Source File: Cache.scala From sangria with Apache License 2.0 | 5 votes |
package sangria.util

import scala.collection.{Map, Set}

trait Cache[Key, Value] {
  def size: Int

  def contains(key: Key): Boolean
  def apply(key: Key): Value
  def get(key: Key): Option[Value]
  def getOrElse(key: Key, default: => Value): Value
  def update(key: Key, value: Value): Unit
  def remove(key: Key): Unit
  def clear(): Unit

  // NOTE: that `getOrElseUpdate` allows a race condition between value retrieval and cache update.
  // It is an explicit decision to avoid any kind of synchronization (it is preferred to recompute value multiple times than to synchronize)
  def getOrElseUpdate(key: Key, fn: => Value): Value

  def find(fn: (Key, Value) => Boolean): Option[(Key, Value)]
  def mapToSet[R](fn: (Key, Value) => R): Set[R]
  def mapValues[R](fn: Value => R): Map[Key, R]
  def keyExists(fn: Key => Boolean): Boolean
  def forEachValue(fn: Value => Unit): Unit
  def removeKeys(fn: Key => Boolean): Unit
}

object Cache {
  def empty[Key, Value]: Cache[Key, Value] = emptyConcurrentHashMap[Key, Value]

  def emptyTrieMap[Key, Value] = new TrieMapCache[Key, Value]
  def emptyConcurrentHashMap[Key, Value] = new ConcurrentHashMapCache[Key, Value]

  def apply[Key, Value](elems: (Key, Value)*) = {
    val c = empty[Key, Value]
    elems.foreach { case (key, value) => c(key) = value }
    c
  }
}
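The NOTE on getOrElseUpdate is the design-relevant part: lookup and insert are deliberately not synchronized, so a value may be computed more than once under contention. A sketch of what a TrieMap-backed implementation of that one method can look like; sangria's actual TrieMapCache is not shown on this page.

import scala.collection.concurrent.TrieMap

object CacheSketch {
  final class SimpleCache[K, V] {
    private val underlying = TrieMap.empty[K, V]

    // No lock: concurrent callers may both evaluate fn; putIfAbsent keeps whichever value landed first.
    def getOrElseUpdate(key: K, fn: => V): V =
      underlying.getOrElse(key, {
        val computed = fn
        underlying.putIfAbsent(key, computed).getOrElse(computed)
      })
  }

  def main(args: Array[String]): Unit = {
    val cache = new SimpleCache[String, Int]
    println(cache.getOrElseUpdate("answer", { println("computing"); 42 })) // prints "computing", then 42
    println(cache.getOrElseUpdate("answer", { println("computing"); 42 })) // 42, served from the cache
  }
}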