org.apache.spark.broadcast.Broadcast Scala Examples
The following examples show how to use org.apache.spark.broadcast.Broadcast.
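Before the examples, here is a minimal, self-contained sketch of the typical Broadcast life cycle: the driver creates the broadcast with SparkContext.broadcast, executors read it through .value, and unpersist releases the cached copies. The object name, the local[*] master, and the toy lookup table are illustrative only and are not taken from any of the projects listed below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast

object BroadcastQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-quick-start").setMaster("local[*]"))
    // Ship a read-only lookup table to every executor once, instead of re-sending it with every task.
    val lookup: Broadcast[Map[String, Int]] = sc.broadcast(Map("a" -> 1, "b" -> 2))
    val total = sc.parallelize(Seq("a", "b", "c"))
      .map(key => lookup.value.getOrElse(key, 0)) // executors read the broadcast value; they never mutate it
      .reduce(_ + _)
    println(s"total = $total") // 3
    lookup.unpersist() // release executor-side copies once the value is no longer needed
    sc.stop()
  }
}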
Example 1
Source File: ResultTask.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 2
Source File: HingeAggregator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.aggregator

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg._

  def add(instance: Instance): this.type = {
    instance match { case Instance(label, weight, features) =>
      require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." +
        s" Expecting $numFeatures but got ${features.size}.")
      require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")

      if (weight == 0.0) return this
      val localFeaturesStd = bcFeaturesStd.value
      val localCoefficients = coefficientsArray
      val localGradientSumArray = gradientSumArray

      val dotProduct = {
        var sum = 0.0
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            sum += localCoefficients(index) * value / localFeaturesStd(index)
          }
        }
        if (fitIntercept) sum += localCoefficients(numFeaturesPlusIntercept - 1)
        sum
      }
      // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
      // Therefore the gradient is -(2y - 1)*x
      val labelScaled = 2 * label - 1.0
      val loss = if (1.0 > labelScaled * dotProduct) {
        (1.0 - labelScaled * dotProduct) * weight
      } else {
        0.0
      }

      if (1.0 > labelScaled * dotProduct) {
        val gradientScale = -labelScaled * weight
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            localGradientSumArray(index) += value * gradientScale / localFeaturesStd(index)
          }
        }
        if (fitIntercept) {
          localGradientSumArray(localGradientSumArray.length - 1) += gradientScale
        }
      }

      lossSum += loss
      weightSum += weight
      this
    }
  }
}
Example 3
Source File: RDDLossFunction.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.loss

import scala.reflect.ClassTag

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.DiffFunction

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregator
import org.apache.spark.rdd.RDD

private[ml] class RDDLossFunction[
    T: ClassTag,
    Agg <: DifferentiableLossAggregator[T, Agg]: ClassTag](
    instances: RDD[T],
    getAggregator: (Broadcast[Vector] => Agg),
    regularization: Option[DifferentiableRegularization[Vector]],
    aggregationDepth: Int = 2)
  extends DiffFunction[BDV[Double]] {

  override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = {
    val bcCoefficients = instances.context.broadcast(Vectors.fromBreeze(coefficients))
    val thisAgg = getAggregator(bcCoefficients)
    val seqOp = (agg: Agg, x: T) => agg.add(x)
    val combOp = (agg1: Agg, agg2: Agg) => agg1.merge(agg2)
    val newAgg = instances.treeAggregate(thisAgg)(seqOp, combOp, aggregationDepth)
    val gradient = newAgg.gradient
    val regLoss = regularization.map { regFun =>
      val (regLoss, regGradient) = regFun.calculate(Vectors.fromBreeze(coefficients))
      BLAS.axpy(1.0, regGradient, gradient)
      regLoss
    }.getOrElse(0.0)
    bcCoefficients.destroy(blocking = false)
    (newAgg.loss + regLoss, gradient.asBreeze.toDenseVector)
  }
}
Example 4
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {
  @transient val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionMethod[VD, ED], numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }
}
Example 5
Source File: ShortestPathLengthsFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {
  def body() = {
    val shortestPaths =
      if (bucketSize == -1l)
        ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph, AllPathPredicate, treatAsUndirected)
      else
        ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph, (g: Graph[_, _]) => bucketSize, treatAsUndirected)
    val size: Broadcast[VertexId] = ctx.broadcast(partitionedGraph.numVertices)
    partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
    ctx.stop()
  }
}

private object Util {
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId, oldValue: String, pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]]) = {
    pathsOption.flatMap((paths) => {
      var entries = paths.entrySet().toList.sortBy(_.getKey)
      val out = new StringBuilder()
      out ++= s"${oldValue},"
      var a = 0l
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        } else {
          out ++= "0,"
        }
        a += 1l
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
}
Example 6
Source File: FeatureExtraction.scala From meetup-stream with Apache License 2.0
package transformations

import scala.io.Source

import core._
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object FeatureExtraction {

  val localDictionary = Source
    .fromURL(getClass.getResource("/wordsEn.txt"))
    .getLines
    .zipWithIndex
    .toMap

  def breakToWords(description: String) = {
    val wordSelector = """[^\<\>\/]\b([a-zA-Z\d]{4,})\b""".r
    (wordSelector findAllIn description).map { _.trim.toLowerCase() }
  }

  def eventToVector(dictionary: Map[String, Int], description: String): Option[Vector] = {

    def popularWords(words: Iterator[String]) = {
      val initialWordCounts = collection.mutable.Map[String, Int]()
      val wordCounts = words.
        foldLeft(initialWordCounts) {
          case (wordCounts, word) => wordCounts + Tuple2(word, wordCounts.getOrElse(word, 0) + 1)
        }
      val wordsIndexes = wordCounts
        .flatMap {
          case (word, count) => dictionary.get(word).map { index => (index, count.toDouble) }
        }
      val topWords = wordsIndexes.toSeq.sortBy(-1 * _._2).take(10)
      topWords
    }

    val wordsIterator = breakToWords(description)
    val topWords = popularWords(wordsIterator)
    if (topWords.size == 10) Some(Vectors.sparse(dictionary.size, topWords)) else None
  }
}
Example 7
Source File: ResultTask.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int,
    internalAccumulators: Seq[Accumulator[Long]])
  extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // Record the deserialization start time.
    val deserializeStartTime = System.currentTimeMillis()
    // Get a closure serializer instance.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // Call ser.deserialize() to recover the RDD and the function from taskBinary.
    // Thread.currentThread.getContextClassLoader returns the context class loader of the current thread.
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    // Compute the deserialization time (_executorDeserializeTime).
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    // Record the task's taskMetrics.
    metrics = Some(context.taskMetrics)
    // Run the task: apply func to the iterator over this RDD partition.
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 8
Source File: NearestNeighbors.scala From SparkSMOTE with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

  def runNearestNeighbors(data: RDD[Array[(LabeledPoint, Int, Int)]],
      kNN: Int,
      sampleData: Array[(LabeledPoint, Int, Int)]): Array[(String, Array[((Int, Int), Double)])] = {

    val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_, _, kNN, sampleData)).groupByKey().map(x => (x._1, x._2.toArray.sortBy(r => r._2).take(kNN))).collect()

    globalNearestNeighborsByIndex
  }

  private def localNearestNeighbors(partitionIndex: Long,
      iter: Iterator[Array[(LabeledPoint, Int, Int)]],
      kNN: Int,
      sampleData: Array[(LabeledPoint, Int, Int)]): Iterator[(String, ((Int, Int), Double))] = {

    var result = List[(String, ((Int, Int), Double))]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataSize = sampleData.size - 1

    val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize + 1)(null)
    for {
      i1 <- 0 to sampleDataSize
    } kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

    for (i <- 0 to nLocal) {
      val currentPoint = dataArr(i)
      val features = currentPoint._1.features
      val rowId = currentPoint._3.toInt
      for (j <- 0 to sampleDataSize) {
        val samplePartitionId = sampleData(j)._2
        val sampleRowId = sampleData(j)._3
        val sampleFeatures = sampleData(j)._1.features
        if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
          val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
          if (distance < max(kLocalNeighbors(j).distanceVector)) {
            val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
            kLocalNeighbors(j).distanceVector(indexToReplace) = distance
            kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
          }
        }
      }
    }

    for (m <- 0 to sampleDataSize) {
      for (l <- 0 to kNN - 1) {
        val key = kLocalNeighbors(m).partitionId.toString + "," + kLocalNeighbors(m).sampleRowId.toString
        val tup = (partitionIndex.toInt, kLocalNeighbors(m).neighborRowId(l))
        result.::=(key, (tup, kLocalNeighbors(m).distanceVector(l)))
      }
    }
    result.iterator
  }
}
Example 9
Source File: loadData.scala From SparkSMOTE with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

  def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint, Int, Int)] = {
    val data = sc.textFile(path).filter { x => x.split(delimiter)(0).toDouble == 1.0 }.repartition(numPartitions).mapPartitions { x => Iterator(x.toArray) }
    val formatData = data.mapPartitionsWithIndex { (partitionId, iter) =>
      var result = List[(LabeledPoint, Int, Int)]()
      val dataArray = iter.next
      val dataArraySize = dataArray.size - 1
      var rowCount = dataArraySize
      for (i <- 0 to dataArraySize) {
        val parts = dataArray(i).split(delimiter)
        result.::=((LabeledPoint(parts(0).toDouble, DenseVector(parts.slice(1, numFeatures + 1)).map(_.toDouble)), partitionId.toInt, rowCount))
        rowCount = rowCount - 1
      }
      result.iterator
    }

    formatData
  }
}
Example 10
Source File: SMOTE.scala From SparkSMOTE with MIT License
package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

  def runSMOTE(sc: SparkContext,
      inPath: String,
      outPath: String,
      numFeatures: Int,
      oversamplingPctg: Double,
      kNN: Int,
      delimiter: String,
      numPartitions: Int): Unit = {

    val rand = new Random()

    val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)

    val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

    val numObs = dataArray.map(x => x.size).reduce(_ + _)

    println("Number of Filtered Observations " + numObs.toString)

    val roundPctg = oversamplingPctg
    val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2, r._3)) // without replacement
    println("Sample Data Count " + sampleData.size.toString)

    val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)

    var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt, x._1.split(",")(1).toInt, x._2(rand.nextInt(kNN)))).sortBy(r => (r._1, r._2))

    var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

    val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_, _, sampleDataNearestNeighbors, delimiter)).persist()
    println("Synthetic Data Count " + syntheticData.count.toString)
    val newData = syntheticData.union(sc.textFile(inPath))
    println("New Line Count " + newData.count.toString)
    newData.saveAsTextFile(outPath)
  }

  private def createSyntheticData(partitionIndex: Long,
      iter: Iterator[Array[(LabeledPoint, Int, Int)]],
      sampleDataNN: Array[(Int, Int, Int, LabeledPoint)],
      delimiter: String): Iterator[String] = {

    var result = List[String]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataNNSize = sampleDataNN.size - 1
    val rand = new Random()

    for (j <- 0 to sampleDataNNSize) {
      val partitionId = sampleDataNN(j)._1
      val neighborId = sampleDataNN(j)._3
      val sampleFeatures = sampleDataNN(j)._4.features

      if (partitionId == partitionIndex.toInt) {
        val currentPoint = dataArr(neighborId)
        val features = currentPoint._1.features
        sampleFeatures += (sampleFeatures - features) * rand.nextDouble
        result.::=("1.0" + delimiter + sampleFeatures.toArray.mkString(delimiter))
      }
    }
    result.iterator
  }
}
Example 11
Source File: RDDLossFunctionSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.loss

import org.apache.spark.SparkFunSuite
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD

class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var instances: RDD[Instance] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    instances = sc.parallelize(Seq(
      Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)),
      Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)),
      Instance(2.0, 0.3, Vectors.dense(4.0, 0.5))
    ))
  }

  test("regularization") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val regLossFun = new L2Regularization(0.1, (_: Int) => true, None)
    val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value)
    val lossNoReg = new RDDLossFunction(instances, getAgg, None)
    val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun))

    val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector)
    val (regLoss, regGrad) = regLossFun.calculate(coefficients)
    val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector)

    BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad)
    assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5)
    assert(loss1 + regLoss === loss2)
  }

  test("empty RDD") {
    val rdd = sc.parallelize(Seq.empty[Instance])
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(rdd, getAgg, None)
    withClue("cannot calculate cost for empty dataset") {
      intercept[IllegalArgumentException] {
        lossFun.calculate(coefficients.asBreeze.toDenseVector)
      }
    }
  }

  test("versus aggregating on an iterable") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(instances, getAgg, None)
    val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector)

    // just map the aggregator over the instances array
    val agg = new TestAggregator(2)(coefficients)
    instances.collect().foreach(agg.add)

    assert(loss === agg.loss)
    assert(Vectors.fromBreeze(grad) === agg.gradient)
  }
}
Example 12
Source File: ShuffleMapTask.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter

  def this(partitionId: Int) {
    this(0, null, new Partition { override def index: Int = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
Example 13
Source File: OTBroadcastHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent._
import scala.concurrent.duration._

case class OTBroadcastHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
        val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]]
    }
  }(BroadcastHashJoin.broadcastHashJoinExecutionContext)

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamedIter =>
      hashJoin(streamedIter, broadcastRelation.value)
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)(
      controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 14
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class MTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient
  private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  val watcher = controller.getWatcher

  @transient
  private lazy val broadcastFuture = future {
    // Note that we use .execute().collect() because we don't want to convert data to Scala types
    val input: Array[Row] = buildPlan.execute()
      .mapPartitions(HashedSet(_, keyGenerator())).collect()
    prevBatch match {
      case None =>
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        // TODO: fix this integrity error by supporting join whose both branches may grow
        val hashed = HashedSet(input.iterator)
        val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]]
        if (!previous.containsAll(hashed)) {
          watcher += -1
          logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)")
        }
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 15
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class OTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient
  private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute()
          .mapPartitions(HashedSet(_, keyGenerator())).collect()
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 16
Source File: ResultTask.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    metrics: TaskMetrics,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId,
    appId, appAttemptId)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext, user: String): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get(user).closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L

    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 17
Source File: RRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 18
Source File: MapPartitionsRWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {

  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // If the content of current DataFrame is serialized R data?
    val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map { row => rowToRBytes(row) },
          SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
}
Example 19
Source File: RepartitionedOrderedRDD2.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.annotations._
import is.hail.rvd.{PartitionBoundOrdering, RVD, RVDContext, RVDPartitioner, RVDType}
import is.hail.utils._
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

class OrderedDependency[T](
    oldPartitionerBc: Broadcast[RVDPartitioner],
    newIntervalListBc: Broadcast[IndexedSeq[Interval]],
    rdd: RDD[T]
) extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): Seq[Int] =
    oldPartitionerBc.value.queryInterval(newIntervalListBc.value(partitionId))
}

object RepartitionedOrderedRDD2 {
  def apply(prev: RVD, newRangeBounds: IndexedSeq[Interval]): ContextRDD[Long] =
    ContextRDD(new RepartitionedOrderedRDD2(prev, newRangeBounds))
}

class RepartitionedOrderedRDD2 private (prev: RVD, newRangeBounds: IndexedSeq[Interval])
  extends RDD[ContextRDD.ElementType[Long]](prev.crdd.sparkContext, Nil) { // Nil since we implement getDependencies

  val prevCRDD: ContextRDD[Long] = prev.boundary.crdd
  val typ: RVDType = prev.typ
  val kOrd: ExtendedOrdering = PartitionBoundOrdering(typ.kType.virtualType)

  val oldPartitionerBc: Broadcast[RVDPartitioner] = prev.partitioner.broadcast(prevCRDD.sparkContext)
  val newRangeBoundsBc: Broadcast[IndexedSeq[Interval]] = prevCRDD.sparkContext.broadcast(newRangeBounds)

  require(newRangeBounds.forall { i =>
    typ.kType.virtualType.relaxedTypeCheck(i.start) && typ.kType.virtualType.relaxedTypeCheck(i.end)
  })

  def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](newRangeBoundsBc.value.length) { i =>
      RepartitionedOrderedRDD2Partition(
        i,
        dependency.getParents(i).toArray.map(prevCRDD.partitions),
        newRangeBoundsBc.value(i))
    }
  }

  override def compute(partition: Partition, context: TaskContext): Iterator[RVDContext => Iterator[Long]] = {
    val ordPartition = partition.asInstanceOf[RepartitionedOrderedRDD2Partition]
    val pord = kOrd.intervalEndpointOrdering
    val range = ordPartition.range
    val ur = new UnsafeRow(typ.rowType)
    val key = new SelectFieldsRow(ur, typ.kFieldIdx)

    Iterator.single { (ctx: RVDContext) =>
      ordPartition.parents.iterator
        .flatMap { parentPartition =>
          prevCRDD.iterator(parentPartition, context).flatMap(_(ctx))
        }.dropWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lt(key, range.left)
        }.takeWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lteq(key, range.right)
        }
    }
  }

  val dependency = new OrderedDependency(oldPartitionerBc, newRangeBoundsBc, prevCRDD.rdd)

  override def getDependencies: Seq[Dependency[_]] = FastSeq(dependency)
}

case class RepartitionedOrderedRDD2Partition(
    index: Int,
    parents: Array[Partition],
    range: Interval
) extends Partition
Example 20
Source File: FalseLikes.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.util.Random

object FalseLikes {

  def numLikeUsersDistributionArray(trainPostsRDD: RDD[(BlogPost, Set[Long])]): Array[Int] = {
    val bins = (0 to 100).toArray.map(_.toDouble)
    bins.zip(trainPostsRDD.map(_._2.size).histogram(bins)).flatMap {
      case (bin, count) => Array.fill(count.toInt)(bin.toInt)
    }
  }

  def blogPostsWithNonLikeUsers(trainPostsRDD: RDD[(BlogPost, Set[Long])],
                                numLikeUsersDistributionArrayBV: Broadcast[Array[Int]],
                                userIds: Broadcast[Set[Long]]): RDD[(BlogPost, Set[Long])] =
    trainPostsRDD.map {
      case (blogPost, users) =>
        val sum = numLikeUsersDistributionArrayBV.value.groupBy(identity).mapValues(_.length).values.sum
        val randomNumber: Int = Random.nextInt(sum.toInt)
        val nUsers = numLikeUsersDistributionArrayBV.value(randomNumber)
        val nonLikeUsers: Array[Long] = (userIds.value -- users).toArray
        blogPost -> Array.fill(nUsers)(nonLikeUsers(Random.nextInt(nonLikeUsers.length))).toSet
    }
}
Example 21
Source File: QueryHamming.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD

class QueryHamming(minCosineSimilarity: Double,
                   dimensions: Int,
                   resultSize: Int,
                   broadcastCatalog: Boolean = true)
  extends QueryJoiner with Serializable {

  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt
    val randomMatrix = localRandomMatrix(dimensions, numFeatures)
    val querySignatures = matrixToBitSetSparse(queryMatrix, randomMatrix)
    val catalogSignatures = matrixToBitSetSparse(catalogMatrix, randomMatrix)

    var rddSignatures: RDD[SparseSignature] = null
    var broadcastSignatures: Broadcast[Array[SparseSignature]] = null

    if (broadcastCatalog) {
      rddSignatures = querySignatures
      broadcastSignatures = querySignatures.sparkContext.broadcast(catalogSignatures.collect)
    } else {
      rddSignatures = catalogSignatures
      broadcastSignatures = catalogSignatures.sparkContext.broadcast(querySignatures.collect)
    }

    val approximated = rddSignatures.mapPartitions { rddSignatureIterator =>
      val signaturesBC = broadcastSignatures.value
      rddSignatureIterator.flatMap { rddSignature =>
        signaturesBC.map { broadCastSignature =>
          val approximatedCosine = hammingToCosine(hamming(rddSignature.bitSet, broadCastSignature.bitSet), dimensions)
          if (broadcastCatalog)
            new MatrixEntry(rddSignature.index, broadCastSignature.index, approximatedCosine)
          else
            new MatrixEntry(broadCastSignature.index, rddSignature.index, approximatedCosine)
        }.filter(_.value >= minCosineSimilarity).sortBy(-_.value).take(resultSize)
      }
    }
    broadcastSignatures.unpersist(true)

    new CoordinateMatrix(approximated)
  }
}
Example 22
Source File: CompareTest.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.spark.compare

import hammerlab.bytes._
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.{ MaxReadSize, ReadsToCheck }
import org.hammerlab.bam.spark.Split
import org.hammerlab.bam.test.resources.bam1
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.BGZFBlocksToCheck
import org.hammerlab.hadoop.Configuration
import org.hammerlab.hadoop.splits.MaxSplitSize
import org.hammerlab.spark.test.suite.SparkSuite
import shapeless.LabelledGeneric

class CompareTest extends SparkSuite {

  val lg = LabelledGeneric[Result]

  def check(actual: Result, expected: Result): Unit = {
    actual.copy(hadoopBamMS = 0, sparkBamMS = 0) should be(expected)
  }

  implicit lazy val confBroadcast: Broadcast[Configuration] = sc.broadcast(ctx)

  test("230kb") {
    implicit val splitSize = MaxSplitSize(230.KB)

    val actual = Result(bam1)

    val expected =
      Result(
        3,
        3,
        Vector(
          Right(
            Split(
              Pos(239479, 311),
              Pos(471040, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479, 312),
              Pos(484396, 25)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )

    check(actual, expected)
  }

  test("115KB") {
    implicit val splitSize = MaxSplitSize(115.KB)

    check(
      Result(bam1),
      Result(
        5,
        5,
        Vector(
          Right(
            Split(
              Pos(239479, 311),
              Pos(353280, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479, 312),
              Pos(361204, 42)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )
    )
  }
}
Example 23
Source File: IndexedRecordPositions.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check.indexed

import caseapp.{ ValueDescription, HelpMessage ⇒ M, Name ⇒ O }
import hammerlab.path._
import magic_rdds.ordered._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bgzf.Pos
import org.hammerlab.magic.rdd.ordered.SortedRDD
import org.hammerlab.magic.rdd.ordered.SortedRDD.{ Bounds, bounds }

import scala.collection.immutable.SortedSet

  def apply(path: Path)(
      implicit
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]]
  ): IndexedRecordPositions = {
    val reads =
      sc
        .textFile(path.toString)
        .map(
          line ⇒
            line.split(",") match {
              case Array(a, b) ⇒
                Pos(a.toLong, b.toInt)
              case _ ⇒
                throw new IllegalArgumentException(
                  s"Bad record-pos line: $line"
                )
            }
        )
        .filter {
          case Pos(blockPos, _) ⇒
            rangesBroadcast
              .value
              .forall(_.contains(blockPos))
        }
        .cache

    IndexedRecordPositions(
      reads,
      bounds(reads)
    )
  }
}
Example 24
Source File: BlocksAndIndexedRecords.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check.indexed

import hammerlab.path._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bam.check.Blocks
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.Metadata
import org.hammerlab.kryo.Registrar

import scala.collection.immutable.SortedSet
import scala.reflect.ClassTag

case class BlocksAndIndexedRecords(blocks: RDD[Metadata], records: RDD[SortedSet[Pos]])

object BlocksAndIndexedRecords extends Registrar {

  def apply[U: ClassTag]()(
      implicit
      path: Path,
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]],
      blockArgs: Blocks.Args,
      recordArgs: IndexedRecordPositions.Args
  ): BlocksAndIndexedRecords = {

    val Blocks(blocks, bounds) = Blocks()

    val posBounds =
      bounds
        .copy(
          partitions =
            bounds
              .partitions
              .map {
                _.map {
                  case (start, endOpt) ⇒
                    (
                      Pos(start, 0),
                      endOpt.map(Pos(_, 0))
                    )
                }
              }
        )

    val indexedRecords = IndexedRecordPositions(recordArgs.path)

    val repartitionedRecords = indexedRecords.toSets(posBounds)

    BlocksAndIndexedRecords(
      blocks,
      repartitionedRecords
    )
  }

  register(
    Blocks
  )
}
Example 25
Source File: PosMetadata.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check

import hammerlab.show._
import htsjdk.samtools.{ BAMRecord, SAMFileHeader, SAMRecord, ValidationStringency }
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.full.error.Flags
import org.hammerlab.bam.header.{ ContigLengths, Header }
import org.hammerlab.bam.iterator.RecordStream
import org.hammerlab.bam.spark.FindRecordStart
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.SeekableUncompressedBytes

case class PosMetadata(pos: Pos, recordOpt: Option[NextRecord], flags: Flags)

object PosMetadata {

  implicit def defaultShow(implicit showRecord: Show[SAMRecord]): Show[PosMetadata] =
    Show {
      case PosMetadata(pos, recordOpt, flags) ⇒
        show"$pos:\t$recordOpt. Failing checks: $flags"
    }

  implicit def showNextRecordOpt(implicit showNextRecord: Show[NextRecord]): Show[Option[NextRecord]] =
    Show {
      case Some(nextRecord) ⇒ nextRecord.show
      case None ⇒ "no next record"
    }

  def recordPos(record: SAMRecord)(implicit contigLengths: ContigLengths): String =
    s"${contigLengths(record.getReferenceIndex)._1}:${record.getStart}"

  implicit def showRecord(implicit contigLengths: ContigLengths): Show[SAMRecord] =
    Show {
      record ⇒
        record
          .toString
          .dropRight(1) +  // remove trailing period
          (
            // Append info about mapped/placed location
            if (
              record.getReadUnmappedFlag &&
              record.getStart >= 0 &&
              record.getReferenceIndex >= 0 &&
              record.getReferenceIndex < contigLengths.size
            )
              s" (placed at ${recordPos(record)})"
            else if (!record.getReadUnmappedFlag)
              s" @ ${recordPos(record)}"
            else
              ""
          )
    }

  def apply(pos: Pos, flags: Flags)(
      implicit
      uncompressedBytes: SeekableUncompressedBytes,
      header: Broadcast[Header],
      readsToCheck: ReadsToCheck,
      maxReadSize: MaxReadSize
  ): PosMetadata = {
    implicit val contigLengths = header.value.contigLengths
    PosMetadata(
      pos,
      {
        FindRecordStart
          .withDelta(pos)
          .map {
            case (nextRecordPos, delta) ⇒
              uncompressedBytes.seek(nextRecordPos)
              NextRecord(
                RecordStream(
                  uncompressedBytes,
                  header.value
                )
                .next()
                ._2,
                delta
              )
          }
      },
      flags
    )
  }

  import org.hammerlab.kryo._
  import org.hammerlab.bam.kryo.registerSAMFileHeader

  implicit val alsoRegister: AlsoRegister[PosMetadata] =
    AlsoRegister(
      cls[NextRecord],
      cls[BAMRecord],
      cls[ValidationStringency],
      cls[SAMFileHeader]
    )
}
Example 26
Source File: Predictor.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.common.MathImplicits._
import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext}
import com.tencent.angel.sona.data.LocalMemoryDataBlock
import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SPKSQLUtils}

import scala.collection.mutable.ListBuffer

class Predictor(bcValue: Broadcast[ExecutorContext],
                featIdx: Int,
                predictionCol: String,
                probabilityCol: String,
                bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  @transient private lazy implicit val dim: Long = {
    executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE)
  }

  @transient private lazy val appendedSchema: StructType = if (probabilityCol.nonEmpty) {
    new StructType(Array[StructField](StructField(probabilityCol, DoubleType),
      StructField(predictionCol, DoubleType)))
  } else {
    new StructType(Array[StructField](StructField(predictionCol, DoubleType)))
  }

  def predictRDD(data: Iterator[Row]): Iterator[Row] = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val batchSize = 1024
    val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024)

    var count = 0
    val cachedRows: Array[Row] = new Array[Row](batchSize)
    val result: ListBuffer[Row] = ListBuffer[Row]()
    data.foreach {
      case row if count != 0 && count % batchSize == 0 =>
        predictInternal(localModel, storage, cachedRows, result)

        storage.clean()
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
      case row =>
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
    }

    predictInternal(localModel, storage, cachedRows, result)

    executorContext.returnModel(localModel)

    result.toIterator
  }

  private def predictInternal(model: AngelGraphModel,
                              storage: DataBlock[LabeledData],
                              cachedRows: Array[Row],
                              result: ListBuffer[Row]): Unit = {
    val predicted = model.predict(storage)

    if (appendedSchema.length == 1) {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred))
      }
    } else {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel))
      }
    }
  }

  def predictRaw(features: linalg.Vector): linalg.Vector = {
    val localModel = executorContext.borrowModel(bcConf.value)

    val res = localModel.predict(new LabeledData(features, 0.0))

    executorContext.returnModel(localModel)
    Vectors.dense(res.pred, -res.pred)
  }
}
Example 27
Source File: Trainer.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.LabeledData
import com.tencent.angel.sona.core.ExecutorContext
import com.tencent.angel.sona.util.ConfUtils
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.evaluation.training._
import org.apache.spark.broadcast.Broadcast

class Trainer(bcValue: Broadcast[ExecutorContext], epoch: Int, bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  def trainOneBatch(data: Array[LabeledData]): TrainingStat = {
    val localRunStat: TrainingStat = executorContext.conf.get(ConfUtils.ALGO_TYPE) match {
      case "class" =>
        // new ClassificationTrainingStat(executorContext.conf.getInt(MLCoreConf.ML_NUM_CLASS))
        new ClassificationTrainingStat(bcConf.value.getInt(MLCoreConf.ML_NUM_CLASS))
      case "regression" =>
        new RegressionTrainingStat()
      case "clustering" =>
        new ClusteringTrainingStat()
    }

    // This code runs inside a task on an executor.
    val localModel = executorContext.borrowModel(bcConf.value)
    val graph = localModel.graph

    graph.feedData(data)
    localRunStat.setNumSamples(data.length)

    // note: this step is synchronized
    val pullStart = System.currentTimeMillis()
    if (bcConf.value.getBoolean(MLCoreConf.ML_IS_DATA_SPARSE)) {
      localModel.pullParams(epoch, graph.placeHolder.getIndices)
    } else {
      localModel.pullParams(epoch)
    }
    val pullFinished = System.currentTimeMillis()
    localRunStat.setPullTime(pullFinished - pullStart)

    val forwardStart = System.currentTimeMillis()
    val avgLoss = graph.calForward()
    graph.predict().foreach { pres =>
      localRunStat.add(pres)
    }
    localRunStat.setAvgLoss(avgLoss)
    val forwardFinished = System.currentTimeMillis()
    localRunStat.setForwardTime(forwardFinished - forwardStart)

    val backwardStart = System.currentTimeMillis()
    graph.calBackward()
    val backwardFinished = System.currentTimeMillis()
    localRunStat.setBackwardTime(backwardFinished - backwardStart)

    // note: this step is asynchronous
    val pushStart = System.currentTimeMillis()
    localModel.pushGradient(0.1)
    val pushFinished = System.currentTimeMillis()
    localRunStat.setPushTime(pushFinished - pushStart)

    executorContext.returnModel(localModel)

    localRunStat
  }
}
Example 28
Source File: AngelSparkModel.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.client.AngelPSClient
import com.tencent.angel.mlcore.conf.SharedConf
import com.tencent.angel.sona.core.{AngelGraphModel, DriverContext, ExecutorContext, SparkMasterContext}
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.param.{AngelGraphParams, Params}
import org.apache.spark.broadcast.Broadcast

trait AngelSparkModel extends Params with AngelGraphParams {
  val angelModelName: String

  var numTask: Int = -1

  @transient var bcValue: Broadcast[ExecutorContext] = _
  @transient var bcConf: Broadcast[SharedConf] = _

  @transient implicit val psClient: AngelPSClient = synchronized {
    DriverContext.get().getAngelClient
  }

  @transient lazy val sparkEnvContext: SparkMasterContext = synchronized {
    DriverContext.get().sparkMasterContext
  }

  @transient implicit lazy val dim: Long = getNumFeature

  @transient lazy val angelModel: AngelGraphModel = {
    require(numTask == -1 || numTask > 0, "Please set numTask before use angelModel")
    new AngelGraphModel(sharedConf, numTask)
  }

  @transient private var trainingSummary: Option[TrainingStat] = None

  def setSummary(summary: Option[TrainingStat]): this.type = {
    this.trainingSummary = summary
    this
  }

  def hasSummary: Boolean = trainingSummary.isDefined

  def summary: TrainingStat = trainingSummary.getOrElse {
    throw new Exception("No training summary available for this AngelClassifierModel")
  }

  def setNumTask(numTask: Int): this.type = {
    this.numTask = numTask
    psClient.setTaskNum(numTask)

    this
  }

  def setBCValue(bcValue: Broadcast[ExecutorContext]): this.type = {
    this.bcValue = bcValue

    this
  }
}
Example 29
Source File: Features.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD._

import scalaz.Scalaz._

case class Features(categoriesLikelihood: Double, tagsLikelihood: Double, languageLikelihood: Double,
                    authorLikelihood: Double, titleLengthMeanError: Double, blogLikelihood: Double,
                    averageLikesPerPost: Double)

case object Features {
  def blogIdToPriorBlogLikelihoodBV(statsUserRDD: RDD[StatsUser]): Map[Long, Double] =
    (statsUserRDD.map {
      case StatsUser(_, numLikes: Long, likeBlogs: Map[Long, Long]) => (likeBlogs, numLikes)
    }.reduce(_ |+| _) match {
      case (likeBlogs, numLikes) => likeBlogs.mapValues(_.toDouble / numLikes).map(identity)
    }).withDefaultValue(0.0)

  def meanBlogLikesPerPost(statsBlogRDD: RDD[StatsBlog]): Double =
    statsBlogRDD.map {
      case StatsBlog(_, numLikes: Long, numPosts: Long) => (numLikes, numPosts)
    }.reduce(_ |+| _) match {
      case (numLikes, numPosts) => numLikes.toDouble / numPosts
    }

  def userIdToOtherLikelihoodMaps(trainPostsRDD: RDD[(BlogPost, Set[Long])]):
  RDD[(Long, (Map[String, Int], Map[String, Int], Map[String, Int], Map[Long, Int], Map[Int, Int]))] =
    (for {
      (blogPost, users) <- trainPostsRDD
      userId <- users
    } yield userId -> (blogPost.categories.map(_ -> 1).toMap,
        blogPost.tags.map(_ -> 1).toMap,
        Map(blogPost.language -> 1),
        Map(blogPost.authorId -> 1),
        Map(blogPost.title.map(_.split("[^\\w']+").size).getOrElse(0) -> 1)))
      .reduceByKey(_ |+| _)
      .mapValues {
        case (categoriesLikelihoodMap, tagsLikelihoodMap, languageLikelihoodMap, authorLikelihoodMap,
        titleLengthLikelihoodMap) =>
          (categoriesLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            tagsLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            languageLikelihoodMap,
            authorLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            titleLengthLikelihoodMap)
      }

  def likelihoodSet(map: Map[String, Int], labels: Set[String]): Double =
    labels.flatMap(map.get).sum.toDouble / map.values.sum

  def likelihoodInt[K](map: Map[K, Int], label: K): Double =
    map.getOrElse(label, 0).toDouble / map.values.sum

  def likelihoodDouble[K](map: Map[K, Double], label: K): Double =
    map.getOrElse(label, 0.0) / map.values.sum

  def features(blogPostsAndUsers: RDD[(BlogPost, Set[Long])],
               userIdToOtherLikelihoodMaps: Broadcast[Map[Long, (Map[String, Int], Map[String, Int], Map[String, Int], Map[Long, Int], Map[Int, Int])]],
               userIdToBlogLikelihood: Broadcast[Map[Long, Map[Long, Double]]],
               blogIdToPriorBlogLikelihoodBV: Broadcast[Map[Long, Double]],
               blogIdToAverageLikesPerPostBV: Broadcast[Map[Long, Double]],
               meanBlogLikesPerPost: Double) =
    for {
      (post, users) <- blogPostsAndUsers
      blogId = post.blogId
      postId = post.postId
      averageLikesPerPost = blogIdToAverageLikesPerPostBV.value.getOrElse(post.blogId, meanBlogLikesPerPost)
      userId <- users
      (categoriesLikelihoodMap, tagsLikelihoodMap, languageLikelihoodMap, authorLikelihoodMap,
      titleLengthLikelihoodMap) = userIdToOtherLikelihoodMaps.value(userId)
      titleLengthAverage = titleLengthLikelihoodMap.values.sum.toDouble / titleLengthLikelihoodMap.size
      blogLikelihoodMapOption = userIdToBlogLikelihood.value.get(userId)
      blogLikelihoodMap = blogLikelihoodMapOption.getOrElse(blogIdToPriorBlogLikelihoodBV.value)
    } yield (userId, post.postId) -> Features(
      categoriesLikelihood = likelihoodSet(categoriesLikelihoodMap, post.categories),
      tagsLikelihood = likelihoodSet(tagsLikelihoodMap, post.tags),
      languageLikelihood = likelihoodInt(languageLikelihoodMap, post.language),
      authorLikelihood = likelihoodInt(authorLikelihoodMap, post.authorId),
      titleLengthMeanError = math.abs(titleLengthAverage - post.title.map(_.split("[^\\w']+").size).getOrElse(0)),
      blogLikelihood = likelihoodDouble(blogLikelihoodMap, post.blogId),
      averageLikesPerPost = averageLikesPerPost
    )
}
Example 30
Source File: BroadcastSimple.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.test

import org.apache.spark.broadcast.Broadcast
import reforest.rf.RFCategoryInfo
import reforest.util.{GCInstrumented, GCInstrumentedEmpty}
import reforest.{TypeInfo, TypeInfoDouble, TypeInfoInt}
import test.RFResourceFactory

import scala.reflect.ClassTag

class BroadcastSimple[T: ClassTag](v: T) extends Broadcast[T](0) {
  override def value: T = v

  override def getValue(): T = v

  override def doDestroy(blocking: Boolean) = {}

  override def doUnpersist(blocking: Boolean) = {}
}

object BroadcastSimple {
  val typeInfoInt = new BroadcastSimple[TypeInfoInt](new TypeInfoInt(false, -100))
  val typeInfoDouble: Broadcast[TypeInfo[Double]] = new BroadcastSimple[TypeInfo[Double]](new TypeInfoDouble(false, -100))
  val gcInstrumentedEmpty: Broadcast[GCInstrumented] = new BroadcastSimple[GCInstrumented](new GCInstrumentedEmpty)
  val categoryInfoEmpty: Broadcast[RFCategoryInfo] = new BroadcastSimple[RFCategoryInfo](RFResourceFactory.getCategoricalInfo)
}
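BroadcastSimple wraps a plain value in the Broadcast interface so that code written against Broadcast[T] can be exercised in unit tests without a SparkContext. A minimal sketch of that use, placed next to the class above; the object and function names below are illustrative and not part of the reforest project:

import org.apache.spark.broadcast.Broadcast

object BroadcastSimpleSketch {
  // Any function that only reads bcOffset.value behaves the same with a real
  // broadcast produced by sc.broadcast(...) or with a BroadcastSimple stub.
  def shift(values: Seq[Double], bcOffset: Broadcast[Double]): Seq[Double] =
    values.map(_ + bcOffset.value)

  def main(args: Array[String]): Unit = {
    val bcOffset: Broadcast[Double] = new BroadcastSimple[Double](10.0)
    println(shift(Seq(1.0, 2.0, 3.0), bcOffset)) // List(11.0, 12.0, 13.0)
  }
}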
Example 31
Source File: Ledger.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.word.layer import breeze.linalg.DenseVector import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import com.github.nearbydelta.deepspark.word._ import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.reflect.{ClassTag, classTag} trait Ledger[OutInfo] extends InputLayer[Array[Int], OutInfo] { @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]] @transient var algorithm: LedgerAlgorithm = _ var bcModel: Broadcast[LedgerModel] = _ @transient var builder: LedgerBuilder = _ var dimension: Int = 0 @transient var model: LedgerModel = _ protected var padID = -1 def withModel(model: LedgerModel, builder: LedgerBuilder): this.type = { this.model = model this.builder = builder this.padID = model.padID this.dimension = model.dimension this.algorithm = builder.getUpdater(this.model.vectors) this } protected def pad = if (padID == -1) null else if (bcModel != null) vectorOf(bcModel.value.padID) else vectorOf(padID) protected def updateWord(word: Int, dx: DataVec): Unit = if (word != -1 && algorithm != null) { val vec = algorithm.delta.getOrElseUpdate(word, DenseVector.zeros[Double](dimension)) vec += dx } protected def vectorOf(str: Int) = if (bcModel != null) bcModel.value.vectorAt(str) else model.vectorAt(str) override def broadcast(sc: SparkContext): Unit = { bcModel = sc.broadcast(model) } override def loss: Double = algorithm.loss override def read(kryo: Kryo, input: Input): Unit = { builder = kryo.readClassAndObject(input).asInstanceOf[LedgerBuilder] val model = new LedgerModel model.read(kryo, input) require(model.size > 0, "Model is empty!") withModel(model, builder) super.read(kryo, input) } override def unbroadcast(): Unit = { bcModel.unpersist(blocking = false) } @deprecated override def withInput(in: Int): this.type = this @deprecated override def withOutput(out: Int): this.type = this override def write(kryo: Kryo, output: Output): Unit = { kryo.writeClassAndObject(output, builder) model.write(kryo, output) super.write(kryo, output) } }
Example 32
Source File: FixedLedger.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.word.layer import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import com.github.nearbydelta.deepspark.word._ import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.parallel.ParSeq import scala.reflect.{ClassTag, classTag} trait FixedLedger[OutInfo] extends InputLayer[Array[Int], OutInfo] { @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]] var bcModel: Broadcast[LedgerModel] = _ @transient var model: LedgerModel = _ protected var padID = -1 def withModel(model: LedgerModel): this.type = { this.model = model this.padID = model.padID this } protected def pad = if (padID == -1) null else if (bcModel != null) vectorOf(bcModel.value.padID) else vectorOf(padID) protected def vectorOf(str: Int) = if (bcModel != null) bcModel.value.vectorAt(str) else model.vectorAt(str) override def backprop(seq: ParSeq[((Array[Int], OutInfo), DataVec)]): (ParSeq[DataVec], ParSeq[() ⇒ Unit]) = (null, ParSeq()) override def broadcast(sc: SparkContext): Unit = { bcModel = sc.broadcast(model) } override def loss: Double = 0.0 override def read(kryo: Kryo, input: Input): Unit = { val model = new LedgerModel model.read(kryo, input) withModel(model) super.read(kryo, input) } override def unbroadcast(): Unit = { bcModel.unpersist(blocking = false) } @deprecated override def withInput(in: Int): this.type = this @deprecated override def withOutput(out: Int): this.type = this override def write(kryo: Kryo, output: Output): Unit = { model.write(kryo, output) super.write(kryo, output) } }
Example 33
Source File: 7_RecoverableNetworkWordCount.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {
    StreamingLogger.setLoggerLevel()
    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)
    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)
      val accumulator = DropWordCounter.getInstance(context.sparkContext)
      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })
  }
}

object WordBlackList {
  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }
}

object DropWordCounter {
  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
}
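The lazily initialized singletons above exist because broadcast variables and accumulators are not restored when a streaming job recovers from a checkpoint, so they have to be re-created on first use by the recovered driver. The listing ends after foreachRDD; a hedged sketch of how the entry point is usually completed for checkpoint recovery (the checkpoint path and getOrCreate wiring are assumptions, not taken from the wow-spark source):

val checkpointDir = "/tmp/recoverable-network-word-count" // hypothetical path
val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
  val conf = new SparkConf().setMaster("local[2]").setAppName("RecoverableNetworkWordCount")
  val context = new StreamingContext(conf, Seconds(1))
  context.checkpoint(checkpointDir) // enables driver recovery
  // ... build the same socket stream and foreachRDD pipeline here ...
  context
})
ssc.start()
ssc.awaitTermination()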
Example 34
Source File: ResultTask.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import java.io._ import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, internalAccumulators: Seq[Accumulator[Long]]) extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val deserializeStartTime = System.currentTimeMillis() val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime metrics = Some(context.taskMetrics) func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 35
Source File: ShuffleMapTask.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter def this(partitionId: Int) { this(0, 0, null, new Partition { override def index: Int = 0 }, null, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): MapStatus = { // Deserialize the RDD using the broadcast variable. val deserializeStartTime = System.currentTimeMillis() val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime metrics = Some(context.taskMetrics) var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) writer.stop(success = true).get } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 36
Source File: BroadcastHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation} case class BroadcastHashJoinNode( conf: SQLConf, streamedKeys: Seq[Expression], streamedNode: LocalNode, buildSide: BuildSide, buildOutput: Seq[Attribute], hashedRelation: Broadcast[HashedRelation]) extends UnaryLocalNode(conf) with HashJoinNode { override val child = streamedNode // Because we do not pass in the buildNode, we take the output of buildNode to // create the inputSet properly. override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput) override def output: Seq[Attribute] = buildSide match { case BuildRight => streamedNode.output ++ buildOutput case BuildLeft => buildOutput ++ streamedNode.output } protected override def doOpen(): Unit = { streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation.value) } override def close(): Unit = { streamedNode.close() } }
Example 37
Source File: ResultTask.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, serializedTaskMetrics: Array[Byte], jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, localProperties, serializedTaskMetrics, jobId, appId, appAttemptId) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 38
Source File: MapPartitionsRWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 39
Source File: TestBroadcastVariables.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

  def loadCSVFile(filename: String): Option[Map[String, String]] = {
    val countries = Map[String, String]()

    Try {
      val bufferedSource = Source.fromFile(filename)

      for (line <- bufferedSource.getLines) {
        val Array(country, capital) = line.split(",").map(_.trim)
        countries += country -> capital
      }

      bufferedSource.close()
      return Some(countries)
    }.toOption
  }
}
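The enclosing object and its main method were not captured in the excerpt above, so the following is only an assumed sketch of how the loaded map would typically be used: broadcast it once from the driver and look values up inside transformations. The SparkContext setup, CSV path, and sample countries are illustrative:

val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("TestBroadcastVariables"))

val countriesBV: Broadcast[Map[String, String]] =
  sc.broadcast(loadCSVFile("data/countries.csv").getOrElse(Map[String, String]()))

val capitals: RDD[String] = sc.parallelize(Seq("France", "Japan", "Kenya"))
  .map(country => countriesBV.value.getOrElse(country, "unknown")) // executor-side lookup

println(capitals.collect().mkString(", "))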
Example 40
Source File: RRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], numPartitions: Int, func: Array[Byte], deserializer: String, serializer: String, packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]]) extends RDD[U](parent) with Logging { override def getPartitions: Array[Partition] = parent.partitions override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val runner = new RRunner[U]( func, deserializer, serializer, packageNames, broadcastVars, numPartitions) // The parent may be also an RRDD, so we should launch it first. val parentIterator = firstParent[T].iterator(partition, context) runner.compute(parentIterator, partition.index) } } def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } }
Example 41
Source File: MapPartitionsRWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 42
Source File: DomainProcessor.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight.utilities import org.apache.spark.broadcast.Broadcast import scala.io.Source object DomainProcessor extends Serializable { val COUNTRY_CODES = Set("ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "bq", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr", "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "eh", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "krd", "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "", "sk", "sl", "sm", "sn", "so", "sr", "ss", "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zw") val TOP_LEVEL_DOMAIN_NAMES = Set("com", "org", "net", "int", "edu", "gov", "mil") val NO_DOMAIN = "None" def extractDomain(url: String): String = { val spliturl = url.split('.') val numParts = spliturl.length // First check if query is an IP address e.g.: 123.103.104.10.in-addr.arpa or a name. // Such URLs receive a domain of NO_DOMAIN if (numParts > 2 && spliturl(numParts - 1) == "arpa" && spliturl(numParts - 2) == "in-addr") { NO_DOMAIN // it's an address } else if (!COUNTRY_CODES.contains(spliturl.last) && !TOP_LEVEL_DOMAIN_NAMES.contains(spliturl.last)) { NO_DOMAIN // it does not have a valid top-level domain name } else { val strippedSplitURL = removeTopLevelDomainName(removeCountryCode(spliturl)) if (strippedSplitURL.length > 0) { strippedSplitURL.last } else { // invalid URL... nothing that is not TLD.countrycode NO_DOMAIN } } } def removeCountryCode(urlComponents: Array[String]): Array[String] = { if (COUNTRY_CODES.contains(urlComponents.last)) { urlComponents.dropRight(1) } else { urlComponents } } def removeTopLevelDomainName(urlComponents: Array[String]): Array[String] = { if (TOP_LEVEL_DOMAIN_NAMES.contains(urlComponents.last)) { urlComponents.dropRight(1) } else { urlComponents } } }
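A quick illustration of the three branches of extractDomain above, with return values worked out from the code (the URLs are illustrative):

DomainProcessor.extractDomain("www.example.com")              // "example"
DomainProcessor.extractDomain("10.104.103.123.in-addr.arpa")  // "None" (reverse-DNS address)
DomainProcessor.extractDomain("something.invalidtld")         // "None" (unrecognized top-level domain)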
Example 43
Source File: ProxyWordCreation.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight.proxy import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ import org.opennetworkinsight.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} object ProxyWordCreation { def udfWordCreation(topDomains : Broadcast[Set[String]], agentCounts : Broadcast[Map[String, Long]], timeCuts: Array[Double], entropyCuts: Array[Double], agentCuts: Array[Double]) = udf((host: String, time: String, reqMethod: String, uri: String, contentType: String, userAgent: String, responseCode: String) => ProxyWordCreation.proxyWord(host, time, reqMethod, uri, contentType, userAgent, responseCode, topDomains, agentCounts, timeCuts, entropyCuts, agentCuts)) def proxyWord(proxyHost: String, time: String, reqMethod: String, uri: String, contentType: String, userAgent: String, responseCode: String, topDomains: Broadcast[Set[String]], agentCounts: Broadcast[Map[String, Long]], timeCuts: Array[Double], entropyCuts: Array[Double], agentCuts: Array[Double]): String = { List(topDomain(proxyHost, topDomains.value).toString, Quantiles.bin(TimeUtilities.getTimeAsDouble(time), timeCuts).toString, reqMethod, Quantiles.bin(Entropy.stringEntropy(uri), entropyCuts), contentType.split('/')(0), // just the top level content type for now Quantiles.bin(agentCounts.value(userAgent), agentCuts), responseCode(0)).mkString("_") } def topDomain(proxyHost: String, topDomains: Set[String]): Int = { val domain = DomainProcessor.extractDomain(proxyHost) if (domainBelongsToSafeList(domain)) { 2 } else if (topDomains.contains(domain)) { 1 } else { 0 } } def domainBelongsToSafeList(domain: String) = domain == "intel" // TBD parameterize this! }
Example 44
Source File: AggregatedICPClassifier.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.ICPClassifierModel import org.apache.commons.lang.NotImplementedException import org.apache.spark.broadcast.Broadcast import org.apache.spark.SparkContext object AggregatedICPClassifier { def load(path: String, sc: SparkContext) = { val icps = sc.textFile(path) .map(ICPClassifierModel.deserialize(_, LibLinAlgDeserializer)) new AggregatedICPClassifier(icps) } } class AggregatedICPClassifier( private val icps: RDD[ICPClassifierModel[LibLinAlg]]) extends ICPClassifierModel[LibLinAlg] { val cachedICPs = icps.cache override def mondrianPv(features: Vector) = { cachedICPs .flatMap { icp => icp.mondrianPv(features) .zipWithIndex } .collect //we expect to aggregate up to 100 ICPs .groupBy(_._2) .toArray .sortBy(_._1) .map { case (index, seq) => val sortedSeq = seq.map(_._1).toArray.sorted val n = sortedSeq.length val median = if (n % 2 == 0) { (sortedSeq(n / 2 - 1) + sortedSeq(n / 2)) / 2 } else { sortedSeq(n / 2) } median } } def save(path: String, coalesce: Int = 0) = { var serialICPs = cachedICPs.map(_.toString) if (coalesce > 0) { serialICPs = serialICPs.coalesce(coalesce) } serialICPs.saveAsTextFile(path) } }
Example 45
Source File: Configuration.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo._

class Configuration(@transient var value: HadoopConfiguration)
  extends Serializable {

  private def writeObject(out: ObjectOutputStream): Unit = {
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new HadoopConfiguration(false)
    value.readFields(in)
  }
}

object Configuration extends Registrar {

  def apply(loadDefaults: Boolean = true): Configuration = new HadoopConfiguration(loadDefaults)

  def apply(conf: HadoopConfiguration): Configuration = new Configuration(conf)

  implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration = apply(conf)

  implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration = conf.value

  implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration = confBroadcast.value

  implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration = sc.hadoopConfiguration

  implicit class Ops(val conf: HadoopConfiguration) extends AnyVal {
    def serializable: Configuration = conf
  }

  register(
    cls[conf.Configuration] → new WritableSerializer[conf.Configuration],
    cls[Configuration] → serializeAs[Configuration, conf.Configuration]
  )
}
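Hadoop's Configuration is not java.io.Serializable, which is why the wrapper above hand-rolls writeObject/readObject, and the companion's implicits let the wrapped and unwrapped forms be used interchangeably. A hedged usage sketch, assuming an existing SparkContext sc and an RDD[String] named paths (both illustrative); the conversions named in the comments are the ones defined above:

import org.hammerlab.hadoop.Configuration

val conf: Configuration = sc.hadoopConfiguration      // wrapped via wrapConfiguration
val confBroadcast = sc.broadcast(conf)

paths.map { _ =>
  val executorConf: Configuration = confBroadcast     // unwrapConfigurationBroadcast
  executorConf.get("fs.defaultFS")                    // unwrapConfiguration supplies Hadoop's get
}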
Example 46
Source File: MapPartitionsRWrapper.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 47
Source File: WordFrequencyEncoder.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import keystoneml.workflow.{Estimator, Transformer}

object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]] {

  private[this] def makeUnigrams(data: RDD[Seq[String]]) =
    NGramsCounts[String]().apply(NGramsFeaturizer[String](1 to 1).apply(data))

  // TODO: alternative approach: collectAsMap once, let driver do the work.
  def fit(data: RDD[Seq[String]]): WordFrequencyTransformer = {
    val unigramCounts = makeUnigrams(data)

    val wordIndex = unigramCounts
      .zipWithIndex() // indexes respect the sorted order
      .map { case ((unigram, count), index) =>
        // valid if # of word types in training data is less than Int.MaxValue
        (unigram.words(0), index.asInstanceOf[Int])
      }.collectAsMap()
    val wordIndexBroadcast = unigramCounts.sparkContext.broadcast(wordIndex)

    val unigrams = unigramCounts.map { case (unigram, count) =>
      (wordIndexBroadcast.value(unigram.words(0)), count)
    }.collectAsMap()

    new WordFrequencyTransformer(wordIndexBroadcast, unigrams)
  }

}

class WordFrequencyTransformer(
    wordIndexBroadcast: Broadcast[scala.collection.Map[String, Int]],
    val unigramCounts: scala.collection.Map[Int, Int])
  extends Transformer[Seq[String], Seq[Int]] {

  final val OOV_INDEX = -1

  override def apply(in: RDD[Seq[String]]): RDD[Seq[Int]] = {
    in.mapPartitions { case part =>
      val index = wordIndexBroadcast.value
      part.map(ngram => ngram.map(index.getOrElse(_, OOV_INDEX)))
    }
  }

  def apply(words: Seq[String]): Seq[Int] = {
    val index = wordIndexBroadcast.value
    words.map(index.getOrElse(_, OOV_INDEX))
  }

}
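fit builds the word index once on the driver, broadcasts it, and the returned WordFrequencyTransformer reuses that broadcast for every lookup, mapping unseen words to OOV_INDEX. A minimal usage sketch, assuming an existing SparkContext sc; the toy corpus is illustrative:

val corpus: RDD[Seq[String]] = sc.parallelize(Seq(
  Seq("spark", "broadcast", "spark"),
  Seq("broadcast", "join")))

val encoder = WordFrequencyEncoder.fit(corpus)   // collects counts, broadcasts the index
val asIndices: RDD[Seq[Int]] = encoder(corpus)   // executor-side lookups via the broadcast
val single: Seq[Int] = encoder(Seq("spark", "unseen-word"))  // unseen word becomes OOV_INDEX (-1)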
Example 48
Source File: KernelBlockLinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import scala.reflect.ClassTag import scala.collection.mutable.ListBuffer import breeze.linalg._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import keystoneml.nodes.stats.{StandardScalerModel, StandardScaler} import keystoneml.nodes.util.{VectorSplitter, Identity} import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.{Transformer, LabelEstimator} class KernelBlockLinearMapper[T: ClassTag]( val model: Seq[DenseMatrix[Double]], blockSize: Int, kernelTransformer: KernelTransformer[T], nTrain: Long, blocksBeforeCheckpoint: Int = 25) extends Transformer[T, DenseVector[Double]] { val numClasses = model(0).cols val numBlocks = model.size override def apply(in: RDD[T]): RDD[DenseVector[Double]] = { val testKernelMat = kernelTransformer(in) // Initially all predictions are 0 var predictions = in.mapPartitions { iter => if (iter.hasNext) { val out = DenseMatrix.zeros[Double](iter.size, numClasses) Iterator.single(out) } else { Iterator.empty } }.cache() val modelBCs = new ListBuffer[Broadcast[DenseMatrix[Double]]] (0 until numBlocks).foreach { block => val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize)) val testKernelBlock = testKernelMat(blockIdxs.toSeq) val modelBlockBC = in.context.broadcast(model(block)) modelBCs += modelBlockBC // Update predictions var predictionsNew = predictions.zip(testKernelBlock).map { case(pred, testKernelBB) => pred :+ (testKernelBB * modelBlockBC.value) } predictionsNew.cache() predictionsNew.count() predictions.unpersist(true) testKernelMat.unpersist(blockIdxs.toSeq) modelBlockBC.unpersist(true) // If we are checkpointing update our cache if (in.context.getCheckpointDir.isDefined && block % blocksBeforeCheckpoint == (blocksBeforeCheckpoint - 1)) { predictionsNew = MatrixUtils.truncateLineage(predictionsNew, false) } predictions = predictionsNew } predictions.flatMap(x => MatrixUtils.matrixToRowArray(x)) } def apply(in: T): DenseVector[Double] = { val testKernelRow = kernelTransformer(in) val predictions = DenseVector.zeros[Double](numClasses) (0 until numBlocks).foreach { block => val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize)) predictions += (testKernelRow(blockIdxs) * model(block)).toDenseVector } predictions } }
Example 49
Source File: ResultTask.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, metrics: TaskMetrics, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId, appId, appAttemptId) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 50
Source File: Dictionary.scala From spark-nkp with Apache License 2.0 | 5 votes |
package com.github.uosdmlab.nkp import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer} object Dictionary { // Words inside driver. This won't be modified in executor. private[nkp] var words = Seq.empty[String] private[nkp] def syncWords(bcWords: Broadcast[Seq[String]]): Unit = { EunjeonAnalyzer.resetUserDict() EunjeonAnalyzer.setUserDict(bcWords.value.iterator) } def reset(): this.type = chain { words = Seq.empty[String] } private var isDictionaryUsed = false private[nkp] def shouldSync = { isDictionaryUsed } def addWords(word: String, words: String*): this.type = addWords(word +: words) def addWords(words: Traversable[String]): this.type = chain { this.words = this.words ++ words isDictionaryUsed = true } def addWordsFromCSV(path: String, paths: String*): this.type = addWordsFromCSV(path +: paths) def addWordsFromCSV(paths: Traversable[String]): this.type = chain { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val schema = StructType(Array( StructField("word", StringType, nullable = false), StructField("cost", StringType, nullable = true))) val df = spark.read .option("sep", ",") .option("inferSchema", value = false) .option("header", value = false) .schema(schema) .csv(paths.toSeq: _*) val words = df.map { case Row(word: String, cost: String) => s"$word,$cost" case Row(word: String, null) => word }.collect() addWords(words) } private def chain(fn: => Any): this.type = { fn this } }
Example 51
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.wal.process import org.apache.s2graph.s2jobs.task.TaskConf import org.apache.s2graph.s2jobs.wal.WalLogAgg import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import play.api.libs.json.{JsObject, Json} object FilterTopFeaturesProcess { private var validFeatureHashKeys: Set[Long] = null def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = { if (validFeatureHashKeys == null) { validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet } validFeatureHashKeys } def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = { import ss.implicits._ val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value)) filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value"))) .select("featureHash") .distinct().as[Long].collect() } def filterTopKsPerDim(dict: DataFrame, maxRankPerDim: Broadcast[Map[String, Int]], defaultMaxRank: Int): DataFrame = { val filterUDF = udf((dim: String, rank: Long) => { rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank) }) dict.filter(filterUDF(col("dim"), col("rank"))) } def filterWalLogAgg(ss: SparkSession, walLogAgg: Dataset[WalLogAgg], transformers: Seq[Transformer], validFeatureHashKeysBCast: Broadcast[Array[Long]]) = { import ss.implicits._ walLogAgg.mapPartitions { iter => val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast) iter.map { walLogAgg => WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys) } } } } class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) { import FilterTopFeaturesProcess._ override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = { import ss.implicits._ val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s => Json.parse(s).as[JsObject].fields.map { case (k, jsValue) => k -> jsValue.as[Int] }.toMap } val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty)) val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt) val featureDict = inputMap(taskConf.options("featureDict")) val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg] val transformers = TaskConf.parseTransformers(taskConf) val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue)) val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict) val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys) filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF() } override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg") }
Example 52
Source File: ParameterOperations.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.parameters import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch} import org.apache.spark.rdd.RDD import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.optim.DistriOptimizer.Cache import com.intel.analytics.bigdl.optim.Metrics import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import org.apache.spark.broadcast.Broadcast import scala.collection.mutable private[bigdl] class L2NormClippingProcessor(l2NormThreshold: Double) extends ParameterProcessor { override def collectGlobalData[T](models: RDD[Cache[T]], parameters: AllReduceParameter[T], metrics: Metrics, state: Table)(implicit ev: TensorNumeric[T]) : Unit = { val numFinishedModel = state.get[Int]("numFinishedModel").get val parallelism = state.get[Int]("parallelism").get val isGradientUpdated = state.get[Boolean]("isGradientUpdated").get val sumSquare = models.mapPartitions(modelIter => { if (!isGradientUpdated) { val getG = System.nanoTime() parameters.aggregateGradientPartition(numFinishedModel) metrics.add("aggregrateGradientParition average executor", System.nanoTime() - getG) } val sum = Util.getSumsquareInParallel(parameters.gradientPartition, parallelism) Iterator.single(sum) }).reduce(_ + _) state("isGradientUpdated") = true state("l2Norm") = math.sqrt(sumSquare) } override def processParameters[T](parameters: AllReduceParameter[T], modelCache: Cache[T], state: Table)(implicit ev: TensorNumeric[T]): Unit = { val l2Norm = state.get[Double]("l2Norm").get if (l2Norm > l2NormThreshold) { val scale = ev.fromType[Double](l2Norm / l2NormThreshold) parameters.gradientPartition.div(scale) } } override def processParameters[T](model: Module[T], state: Table)(implicit ev: TensorNumeric[T]): Unit = { val parallelism = state.get[Int]("parallelism").get val gradients = model.getParameters()._2 val l2Norm = math.sqrt(Util.getSumsquareInParallel(gradients, parallelism)) if (l2Norm > l2NormThreshold) { val scale = ev.fromType[Double](l2Norm / l2NormThreshold) gradients.div(scale) } } }
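The processor above clips by global L2 norm: it sums squared gradients across partitions, takes the square root, and divides the gradient partition by norm / threshold when the norm exceeds the threshold, which is the same as multiplying by threshold / norm. A standalone sketch of that rule in plain Scala (illustrative, not BigDL API):

def clipByL2Norm(gradient: Array[Double], l2NormThreshold: Double): Array[Double] = {
  val l2Norm = math.sqrt(gradient.map(g => g * g).sum)
  if (l2Norm > l2NormThreshold) gradient.map(_ * (l2NormThreshold / l2Norm))
  else gradient
}

clipByL2Norm(Array(3.0, 4.0), 1.0)   // norm 5.0, rescaled to Array(0.6, 0.8)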
Example 53
Source File: BatchShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import java.util.Properties import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.BlockManagerId private[spark] class BatchShuffleMapTask( stageId: Int, stageAttemptId: Int, taskBinaries: Broadcast[Array[Byte]], partitions: Array[Partition], partitionId: Int, @transient private var locs: Seq[TaskLocation], internalAccumulatorsSer: Array[Byte], localProperties: Properties, isFutureTask: Boolean, nextStageLocs: Option[Seq[BlockManagerId]] = None, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with BatchTask with Logging { @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdds: Array[RDD[_]] = null var deps: Array[ShuffleDependency[_, _, _]] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val ser = SparkEnv.get.closureSerializer.newInstance() val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])]( ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader) rdds = rddI deps = depI } def getTasks(): Seq[Task[Any]] = { if (deps == null || rdds == null) { prepTask() } (0 until partitions.length).map { i => val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties, internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs) s.epoch = epoch s }.map(_.asInstanceOf[Task[Any]]) } override def runTask(context: TaskContext): Array[MapStatus] = { throw new RuntimeException("BatchShuffleMapTasks should not be run!") } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 54
Source File: BatchResultTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import java.util.Properties import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class BatchResultTask[T, U: ClassTag]( stageId: Int, stageAttemptId: Int, taskBinaries: Broadcast[Array[Byte]], val partitions: Array[Partition], partitionId: Int, @transient private val locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[Array[U]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with BatchTask with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdds: Array[RDD[T]] = null var funcs: Array[(TaskContext, Iterator[T]) => U] = null override def prepTask(): Unit = { // Deserialize the RDD and the func using the broadcast variables. val ser = SparkEnv.get.closureSerializer.newInstance() val (rddI, funcI) = ser.deserialize[(Array[RDD[T]], Array[(TaskContext, Iterator[T]) => U])]( ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader) rdds = rddI funcs = funcI } // Called on the executor side to get a smaller tasks out def getTasks(): Seq[Task[Any]] = { if (rdds == null) { prepTask() } (0 until partitions.length).map { i => val r = ResultTask(stageId, stageAttemptId, partitions(i), outputId, localProperties, internalAccumulatorsSer, isFutureTask, rdds(i), funcs(i)) r.epoch = epoch r }.map(_.asInstanceOf[Task[Any]]) } override def runTask(context: TaskContext): Array[U] = { throw new RuntimeException("BatchResultTasks should not be run!") } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "BatchResultTask(" + stageId + ", " + partitionId + ")" }
Example 55
Source File: ResultTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, serializedTaskMetrics: Array[Byte] = SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array(), isFutureTask: Boolean = false, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, serializedTaskMetrics, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with Serializable { var rdd: RDD[T] = null var func: (TaskContext, Iterator[T]) => U = null @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def prepTask(): Unit = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (_rdd, _func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) rdd = _rdd func = _func _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. if (func == null || rdd == null) { prepTask() } func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" } object ResultTask { def apply[T, U]( stageId: Int, stageAttemptId: Int, partition: Partition, outputId: Int, localProperties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): ResultTask[T, U] = { val rt = new ResultTask[T, U](stageId, stageAttemptId, null, partition, Seq.empty, outputId, localProperties, internalAccumulatorsSer, isFutureTask) rt.rdd = rdd rt.func = func rt } }
Example 56
Source File: ShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter import org.apache.spark.storage.BlockManagerId def this(partitionId: Int) { this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdd: RDD[_] = null var dep: ShuffleDependency[_, _, _] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) rdd = _rdd dep = _dep _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L } override def runTask(context: TaskContext): MapStatus = { if (dep == null || rdd == null) { prepTask() } var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) val status = writer.stop(success = true).get FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId, dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false) status } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) } object ShuffleMapTask { def apply( stageId: Int, stageAttemptId: Int, partition: Partition, properties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, rdd: RDD[_], dep: ShuffleDependency[_, _, _], nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = { val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null, properties, internalAccumulatorsSer, isFutureTask, nextStageLocs) smt.rdd = rdd smt.dep = dep smt } }
Example 57
Source File: RRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], numPartitions: Int, func: Array[Byte], deserializer: String, serializer: String, packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]]) extends RDD[U](parent) with Logging { override def getPartitions: Array[Partition] = parent.partitions override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val runner = new RRunner[U]( func, deserializer, serializer, packageNames, broadcastVars, numPartitions) // The parent may be also an RRDD, so we should launch it first. val parentIterator = firstParent[T].iterator(partition, context) runner.compute(parentIterator, partition.index) } } def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } }
Example 58
Source File: BroadcastSpatialJoin.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.join import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.{ItemBoundable, ItemDistance, STRtree} import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator.SpatialOperator object BroadcastSpatialJoin { def queryRtree(rtree: => Broadcast[STRtree], leftId: Long, geom: Geometry, predicate: SpatialOperator, radius: Double): Array[(Long, Long)] = { val queryEnv = geom.getEnvelopeInternal //queryEnv.expandBy(radius) lazy val candidates = rtree.value.query(queryEnv).toArray //.asInstanceOf[Array[(Long, Geometry)]] if (predicate == SpatialOperator.Within) { candidates.filter { case (id_, geom_) => geom.within(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Contains) { candidates.filter { case (id_, geom_) => geom.contains(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.WithinD) { candidates.filter { case (id_, geom_) => geom.isWithinDistance(geom_.asInstanceOf[Geometry], radius) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Intersects) { candidates.filter { case (id_, geom_) => geom.intersects(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Overlaps) { candidates.filter { case (id_, geom_) => geom.overlaps(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.NearestD) { //if (candidates.isEmpty) // return Array.empty[(Long, Long)] //val nearestItem = candidates.map { // case (id_, geom_) => (id_.asInstanceOf[Long], geom_.asInstanceOf[Geometry].distance(geom)) //}.reduce((a, b) => if (a._2 < b._2) a else b) class dist extends ItemDistance { override def distance(itemBoundable: ItemBoundable, itemBoundable1: ItemBoundable): Double = { val geom = itemBoundable.getItem.asInstanceOf[(Long, Geometry)]._2 val geom1 = itemBoundable1.getItem.asInstanceOf[(Long, Geometry)]._2 geom.distance(geom1) } } val nearestItem = rtree.value.nearestNeighbour(queryEnv, (0l, geom), new dist) .asInstanceOf[(Long, Geometry)] Array((leftId, nearestItem._1)) } else { Array.empty[(Long, Long)] } } def apply(sc: SparkContext, leftGeometryWithId: RDD[(Long, Geometry)], rightGeometryWithId: RDD[(Long, Geometry)], joinPredicate: SpatialOperator, radius: Double = 0): RDD[(Long, Long)] = { // create R-tree on right dataset val strtree = new STRtree() val rightGeometryWithIdLocal = rightGeometryWithId.collect() rightGeometryWithIdLocal.foreach(x => { val y = x._2.getEnvelopeInternal y.expandBy(radius) strtree.insert(y, x) }) val rtreeBroadcast = sc.broadcast(strtree) leftGeometryWithId.flatMap(x => queryRtree(rtreeBroadcast, x._1, x._2, joinPredicate, radius)) } }
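BroadcastSpatialJoin.apply collects the right-hand RDD on the driver, indexes it into an STRtree with envelopes expanded by radius, broadcasts the tree, and probes it once per left-hand record. A hedged usage sketch, assuming an existing SparkContext sc; the WKT geometries are illustrative:

import com.vividsolutions.jts.io.WKTReader
import spatialspark.operator.SpatialOperator

val reader = new WKTReader()
val points = sc.parallelize(Seq(
  1L -> reader.read("POINT (1 1)"),
  2L -> reader.read("POINT (5 5)")))
val polygons = sc.parallelize(Seq(
  10L -> reader.read("POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))")))

// (leftId, rightId) pairs where the point lies within the polygon
val matched = BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.Within)
matched.collect().foreach(println)  // expect (1,10) only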
Example 59
Source File: ReForeStLoader.scala From reforest with Apache License 2.0 | 5 votes |
package reforest import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.data.{RawDataLabeled, RawDataset, StaticData} import reforest.data.tree.ForestManager import reforest.rf.parameter.RFParameter import reforest.rf.split.RFSplitterManager import reforest.rf.{RFCategoryInfo, RFDataPrepare, RFStrategy} import reforest.util.{GCInstrumented, GCInstrumentedEmpty, MemoryUtil} class ReForeStLoader[T, U](@transient private val sc: SparkContext, parameter: Broadcast[RFParameter], strategyBC: Broadcast[RFStrategy[T, U]], val typeInfoBC: Broadcast[TypeInfo[T]], val typeInfoWorkingBC: Broadcast[TypeInfo[U]], val categoricalFeaturesInfoBC: Broadcast[RFCategoryInfo], rawDataset: RawDataset[T, U]) extends Serializable { val instrumented: Broadcast[GCInstrumented] = sc.broadcast(new GCInstrumentedEmpty) val dataPrepare = new RFDataPrepare[T, U](typeInfoBC, instrumented, strategyBC, false, 1) private var memoryUtil : Option[MemoryUtil] = Option.empty private var forestManager : Option[ForestManager[T, U]] = Option.empty private var workingData : Option[RDD[StaticData[U]]] = Option.empty private var previousWorkingData : Option[RDD[StaticData[U]]] = Option.empty private var splitterManager : Option[RFSplitterManager[T,U]] = Option.empty def testdatafreeze(): Unit = { rawDataset.testingData.persist(parameter.value.storageLevel) } def trainingdatafreeze(): Unit = { // rawDataset.trainingData.persist(property.storageLevel) rawDataset.trainingData.count() } def getRawDataset = rawDataset def getTestingData: RDD[RawDataLabeled[T, U]] = rawDataset.testingData def getMemoryUtil = memoryUtil def getForestManager = forestManager def getWorkingData(numTrees: Int = parameter.value.getMaxNumTrees, macroIteration: Int = 0, skipPreparation : Boolean =false) = { val timePreparationSTART = System.currentTimeMillis() if(skipPreparation) { forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), splitterManager.get)) previousWorkingData = workingData workingData = Some(dataPrepare.prepareData(rawDataset.trainingData, sc.broadcast(forestManager.get.splitterManager.getSplitter(macroIteration)), parameter.value.numFeatures, memoryUtil.get, numTrees, macroIteration)) // workingData = Some(workingData.get.mapPartitionsWithIndex{case (partitionIndex, elements) => // strategyBC.value.reGenerateBagging(numTrees, partitionIndex, elements)}) val dataSize = workingData.get.persist(parameter.value.storageLevel).count() if(previousWorkingData.isDefined) { previousWorkingData.get.unpersist() } val timePreparationEND = System.currentTimeMillis() println("TIME PREPARATION SKIPPED INIT ("+dataSize+"): " + (timePreparationEND - timePreparationSTART)) workingData.get } else { previousWorkingData = workingData val zzz = strategyBC.value.findSplits(rawDataset.trainingData, typeInfoBC, typeInfoWorkingBC, instrumented, categoricalFeaturesInfoBC) splitterManager = Some(zzz._1) forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), zzz._1)) memoryUtil = Some(zzz._2) val splitter = forestManager.get.splitterManager.getSplitter(macroIteration) // TODO the broadcast of the splitter must be unpersisted!!! 
workingData = Some(dataPrepare.prepareData(rawDataset.trainingData, sc.broadcast(splitter), parameter.value.numFeatures, memoryUtil.get, numTrees, macroIteration)) val dataSize = workingData.get.persist(parameter.value.storageLevel).count() if(previousWorkingData.isDefined) { previousWorkingData.get.unpersist() } val timePreparationEND = System.currentTimeMillis() println("TIME PREPARATION: " + (timePreparationEND - timePreparationSTART)) workingData.get } } }
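The TODO in the branch above flags a recurring issue with this loader: a new splitter broadcast is created on every call to getWorkingData, and each broadcast keeps executor memory until it is released. A generic sketch of the broadcast lifecycle follows (not tied to ReForeSt's types; a local master and toy data are assumed).

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastLifecycleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("bc-lifecycle").setMaster("local[*]"))
    val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))
    val total = sc.parallelize(Seq("a", "b", "a"))
      .map(key => lookup.value.getOrElse(key, 0))
      .sum()
    println(total) // 4.0
    // unpersist() drops the cached copies on the executors; the variable can still be
    // re-broadcast lazily if it is used again. destroy() also frees the driver-side
    // data and makes any further use of the broadcast an error.
    lookup.unpersist()
    lookup.destroy()
    sc.stop()
  }
}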
Example 60
Source File: CCUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.util import org.apache.commons.io.FilenameUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.{SparkConf, SparkContext} import reforest.TypeInfo import reforest.data.load.{ARFFUtil, DataLoad, LibSVMUtil} import reforest.rf.RFCategoryInfo import reforest.rf.parameter.RFParameter import scala.reflect.ClassTag def getDataLoader[T:ClassTag, U:ClassTag](property : RFParameter, typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]): DataLoad[T, U] = { val extension = FilenameUtils.getExtension(property.dataset).toUpperCase() property.fileType match { case "LIBSVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "SVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "ARFF" => new ARFFUtil(typeInfo, instrumented, categoryInfo) case _ => new LibSVMUtil(typeInfo, instrumented, categoryInfo) } } }
Example 61
Source File: RFRotationMatrix.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.rf.rotation import org.apache.spark.broadcast.Broadcast import reforest.TypeInfo import reforest.data.{RawData, RawDataDense, RawDataLabeled, RotationMatrix} import scala.reflect.ClassTag /** * To rotate the raw data * * @param n the size of the nxn matrix (typically n is the number of features in the dataset) * @param typeInfo the type information for the raw data * @param seed a random generator seed * @tparam T raw data type * @tparam U working data type */ class RFRotationMatrix[T: ClassTag, U: ClassTag](n: Int, typeInfo: TypeInfo[T], seed: Int) extends Serializable { private val matrix = new RotationMatrix(n, seed) /** * It rotates a raw data * * @param element the element to rotate * @return the rotated element */ def rotateRawData(element: RawData[T, U]) = { val dense = element.toDense val densedRotated = matrix.rotate(dense.values, typeInfo) new RawDataDense[T, U](densedRotated, dense.nan) } /** * It rotates a raw data labeled * * @param element the element to rotate * @return the rotated element */ def rotate(element: RawDataLabeled[T, U]) = { new RawDataLabeled[T, U](element.label, rotateRawData(element.features)) } }
Example 62
Source File: RFDataPrepare.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.rf import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawDataLabeled, StaticData} import reforest.data.tree.ForestManager import reforest.rf.split.{RFSplitter, RFSplitterManager} import reforest.util.{GCInstrumented, MemoryUtil} class RFDataPrepare[T, U](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], strategy: Broadcast[RFStrategy[T, U]], permitSparseWorkingData: Boolean, poissonMean: Double) extends Serializable { def prepareData(dataIndex: RDD[RawDataLabeled[T, U]], splitter : Broadcast[RFSplitter[T, U]], featureNumber: Int, memoryUtil: MemoryUtil, numTrees: Int, macroIteration : Int): RDD[StaticData[U]] = { dataIndex.mapPartitionsWithIndex { (partitionIndex, instances) => strategy.value.prepareData(numTrees, macroIteration, splitter, partitionIndex, instances, instrumented.value, memoryUtil) } } }
Example 63
Source File: SLCTreeGeneration.scala From reforest with Apache License 2.0 | 5 votes |
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package reforest.rf.slc import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data._ import reforest.data.tree.ForestManager import reforest.rf.feature.RFFeatureManager import reforest.rf.parameter.RFParameter import reforest.rf.{RFSkip, RFStrategy, RFTreeGeneration} import reforest.util._ class SLCTreeGeneration[T, U](@transient private val sc: SparkContext, property: Broadcast[RFParameter], typeInfo: Broadcast[TypeInfo[T]], typeInfoWorking: Broadcast[TypeInfo[U]], sampleSize: Long) extends Serializable { var fcsExecutor : Option[SLCExecutor[T, U]] = Option.empty def findBestCutSLC(dataIndex: RDD[StaticData[U]], forestManager: ForestManager[T, U], featureManager: RFFeatureManager, depthToStop : Int, instrumented: Broadcast[GCInstrumented], skip : RFSkip): ForestManager[T, U] = { if (featureManager.getActiveNodesNum <= 0) { forestManager } else { var toReturn = forestManager val splitterManagerBC = sc.broadcast(forestManager.splitterManager) if(fcsExecutor.isEmpty) { fcsExecutor = Some(SLCExecutor.build(sc, typeInfo, typeInfoWorking, property, splitterManagerBC, sampleSize)) } toReturn = fcsExecutor.get.executeSLC(toReturn, featureManager, dataIndex, depthToStop, skip) splitterManagerBC.unpersist() toReturn } } }
Example 64
Source File: LibSVMUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Forked from Apache Spark MLlib
  * Load data in LibSVM format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class LibSVMUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                           instrumented: Broadcast[GCInstrumented],
                                           categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {

  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseLibSVMFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, indices, values) =>
        RawDataLabeled(label, RawData.sparse[T, U](numFeatures, indices, values, typeInfo.value.NaN).compressed)
    }
  }

  private def parseLibSVMFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[Int], Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseLibSVMRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[T]) = {
    val items = line.split(' ')
    val label = Math.max(items.head.toDouble, 0)
    val (indices, values) = items.tail.filter(_.nonEmpty).flatMap { item =>
      try {
        val indexAndValue = item.split(':')
        val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based
        val value = typeInfo.value.fromString(indexAndValue(1))
        if (categoryInfo.value.isCategorical(index)) {
          Some((index, typeInfo.value.fromInt(categoryInfo.value.rawRemapping(typeInfo.value.toInt(value)))))
        } else {
          Some((index, value))
        }
      } catch {
        case e: NumberFormatException => {
          println("Malformed input. Details: \n" + e.getMessage)
          System.exit(1)
          None
        }
        case e: Exception => {
          e.printStackTrace()
          System.exit(1)
          None
        }
      }
    }.unzip

    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, s"indices should be one-based and in ascending order;" +
        s" found current=$current, previous=$previous; line=\"$line\"")
      previous = current
      i += 1
    }
    (label, indices, values)
  }
}
Example 65
Source File: ARFFUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Load data in ARFF format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class ARFFUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                         instrumented: Broadcast[GCInstrumented],
                                         categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {

  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseARFFFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, values) =>
        RawDataLabeled(label, RawData.dense[T, U](values, typeInfo.value.NaN))
    }
  }

  private def parseARFFFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#") || line.startsWith("%") || line.startsWith("@")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseARFFRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseARFFRecord(line: String): (Double, Array[T]) = {
    val items = line.split(',')
    val label = Math.max(items.last.toDouble, 0)
    // Parse each value inside the try block so that malformed inputs are actually caught.
    val values = items.dropRight(1).filter(_.nonEmpty).map(item => {
      try {
        typeInfo.value.fromString(item)
      } catch {
        case e: NumberFormatException => {
          println("Malformed input. Details: \n" + e.getMessage)
          System.exit(1)
          null.asInstanceOf[T]
        }
        case e: Exception => {
          e.printStackTrace()
          System.exit(1)
          null.asInstanceOf[T]
        }
      }
    })

    (label, values)
  }
}
Example 66
Source File: ScalingVariable.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import scala.reflect.ClassTag /** * It scales the value of the raw data according to different methodologies * @tparam T raw data type * @tparam U working data type */ trait ScalingVariable[T, U] extends Serializable { /** * It scales the data passed as argument * @param data The value to be scaled * @return The scaled data */ def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] } /** * It scales the values according to the Basic Scaling of Blaser et al. "Random rotation ensembles". * Numeric values are scaled to [0, 1] using the min and max values. * @param sc The Spark Context * @param typeInfo The type information about the raw data * @param featureNumber The number of feature in the dataset * @param input The raw dataset * @tparam T raw data type * @tparam U working data type */ class ScalingBasic[T : ClassTag, U : ClassTag](@transient private val sc: SparkContext, typeInfo: Broadcast[TypeInfo[T]], featureNumber: Int, input: RDD[RawDataLabeled[T, U]]) extends ScalingVariable[T, U] { private val scaling: Broadcast[scala.collection.Map[Int, (T, T)]] = sc.broadcast(init()) private def scaleValue(index: Int, value: T): T = { val (min, max) = scaling.value(index) val doubleValue = typeInfo.value.toDouble(value) typeInfo.value.fromDouble(Math.min(1, Math.max(0, (doubleValue - typeInfo.value.toDouble(min)) / (typeInfo.value.toDouble(max) - typeInfo.value.toDouble(min))))) } override def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] = { val densed = data.features.toDense val values = new Array[T](densed.size) var count = 0 while (count < values.length) { values(count) = scaleValue(count, densed(count)) count += 1 } RawDataLabeled(data.label, new RawDataDense(values, densed.nan)) } private def init(): scala.collection.Map[Int, (T, T)] = { input.mapPartitions(it => { val min = Array.fill(featureNumber)(typeInfo.value.maxValue) val max = Array.fill(featureNumber)(typeInfo.value.minValue) def setMinMax(index: Int, value: T): Unit = { if (typeInfo.value.isMinOrEqual(value, min(index))) { min(index) = value } if (typeInfo.value.isMinOrEqual(max(index), value)) { max(index) = value } } it.foreach(t => { t.features.foreachActive(setMinMax) }) min.zip(max).zipWithIndex.map(_.swap).toIterator }).reduceByKey((a, b) => (typeInfo.value.min(a._1, b._1), typeInfo.value.max(a._2, b._2))).collectAsMap() } }
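ScalingBasic computes a per-feature (min, max) map in a single pass (mapPartitions plus reduceByKey) and ships it to the executors as a broadcast before rescaling each row. The sketch below keeps the same shape but strips it down to plain Array[Double] rows; the data and names are illustrative assumptions.

import org.apache.spark.{SparkConf, SparkContext}

object MinMaxScaleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("minmax").setMaster("local[*]"))
    val data = sc.parallelize(Seq(Array(1.0, 10.0), Array(3.0, 30.0), Array(2.0, 20.0)))
    // Per-feature (min, max), computed once and shipped to every task as a broadcast.
    val ranges = data
      .flatMap(_.zipWithIndex.map { case (v, i) => (i, (v, v)) })
      .reduceByKey { case ((lo1, hi1), (lo2, hi2)) => (math.min(lo1, lo2), math.max(hi1, hi2)) }
      .collectAsMap()
    val bcRanges = sc.broadcast(ranges)
    // Scale every value to [0, 1] using the broadcast ranges.
    val scaled = data.map(_.zipWithIndex.map { case (v, i) =>
      val (lo, hi) = bcRanges.value(i)
      if (hi == lo) 0.0 else (v - lo) / (hi - lo)
    })
    scaled.collect().foreach(row => println(row.mkString(",")))
    sc.stop()
  }
}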
Example 67
Source File: MapPartitionsRWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 68
Source File: LogisticRegression.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap import org.apache.spark.mllib.sparselr.Utils._ import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object LogisticRegression { def train(input: RDD[(Array[Double], Matrix)], optimizer: Optimizer ): (Array[Int], Array[Double]) = { val hdfsIndex2global = new Int2IntOpenHashMap() var index = 0 input.map { point => point._2 match { case x: CompressedSparseMatrix => println("x.length" + x.mappings.length) case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.count val global2hdfsIndex = input.map { point => point._2 match { case x: CompressedSparseMatrix => x.mappings case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.collect().flatMap(t => t).distinct global2hdfsIndex.foreach{value => hdfsIndex2global.put(value, index) index += 1 } val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global) val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache() val numTraining = examples.count() println(s"Training: $numTraining.") SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true) val examplesTest = examples.mapPartitions(_.flatMap { case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)}) val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size)) val newWeights = optimizer.optimize(examplesTest, weights) ((global2hdfsIndex, newWeights.toArray)) } //globalId to localId for mappings in Matrix def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap]) (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = { val hdfsIndex2global = bchdfsIndex2global.value partition._2 match { case x: CompressedSparseMatrix => val local2hdfsIndex = x.mappings for (i <- 0 until local2hdfsIndex.length) { local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i)) } case _ => throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.") } partition } }
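The interesting part of this example is the index compaction: the distinct feature ids that actually occur are collected, assigned dense 0-based positions, and the resulting table is broadcast so every partition can rewrite its local indices; the broadcast is then removed once training data is materialized. A simplified sketch of that pattern with a plain Scala Map in place of Int2IntOpenHashMap (toy ids assumed):

import org.apache.spark.{SparkConf, SparkContext}

object IndexRemapSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("remap").setMaster("local[*]"))
    // Sparse rows referring to a large, gappy global feature space.
    val rows = sc.parallelize(Seq(Array(7, 1000003, 42), Array(42, 7)))
    // Build a dense 0..n-1 id space from the ids that actually occur.
    val globalToDense = rows.flatMap(_.toSeq).distinct().collect().sorted.zipWithIndex.toMap
    val bcMapping = sc.broadcast(globalToDense)
    val remapped = rows.map(_.map(bcMapping.value))
    remapped.collect().foreach(r => println(r.mkString(" ")))
    // The conventional way to release a broadcast; the example above reaches into
    // SparkEnv's block manager directly instead.
    bcMapping.unpersist(blocking = true)
    sc.stop()
  }
}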
Example 69
Source File: RegressionMetricsSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.graphics.charts.Highcharts._
import org.apache.log4j.{Priority, Logger}
import org.apache.spark.Accumulator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart}

    histogram(residuals, numBins = 20)
    title("Histogram of Regression Residuals")
  }
}

object RegressionMetricsSpark {

  def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long)
  : (Double, Double, Double, Double) = {
    val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean")

    val err: DenseVector[Double] = scoresAndLabels.map((sc) => {
      val diff = sc._1 - sc._2
      mean += sc._2
      val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1), 2)
      DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog)
    }).reduce((a, b) => a + b)

    val SS_res = err(1)

    val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value / size.toDouble)

    val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum()

    val rmse = math.sqrt(SS_res / size.toDouble)
    val mae = err(0) / size.toDouble
    // A zero total sum of squares would make R-squared undefined; report 0.0 in that case.
    val rsq = if (SS_tot != 0.0) 1 - (SS_res / SS_tot) else 0.0
    val rmsle = err(2) / size.toDouble
    (mae, rmse, rsq, rmsle)
  }
}
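computeKPIs folds the absolute, squared, and squared-log errors into one reduce over breeze vectors, then broadcasts the label mean for the SS_tot pass. The sketch below keeps the same two-pass structure with plain tuples and no accumulators; the scores and labels are toy values chosen for illustration.

import org.apache.spark.{SparkConf, SparkContext}

object RegressionKpiSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("kpi").setMaster("local[*]"))
    val scoresAndLabels = sc.parallelize(Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))
    val n = scoresAndLabels.count().toDouble
    // First pass: absolute error, squared error, and label sum in a single reduce.
    val (absErr, sqErr, labelSum) = scoresAndLabels
      .map { case (s, y) => (math.abs(s - y), (s - y) * (s - y), y) }
      .reduce { case ((a1, b1, c1), (a2, b2, c2)) => (a1 + a2, b1 + b2, c1 + c2) }
    // Second pass: total sum of squares around the broadcast label mean.
    val mu = sc.broadcast(labelSum / n)
    val ssTot = scoresAndLabels.map { case (_, y) => math.pow(y - mu.value, 2) }.sum()
    val mae = absErr / n
    val rmse = math.sqrt(sqErr / n)
    val r2 = if (ssTot != 0.0) 1.0 - sqErr / ssTot else 0.0
    println(s"MAE=$mae RMSE=$rmse R2=$r2")
    sc.stop()
  }
}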
Example 70
Source File: implicits.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkio import com.leobenkel.zparkio.Services.SparkModule import com.leobenkel.zparkio.Services.SparkModule.SparkModule import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import zio.{BootstrapRuntime, ZIO} import scala.reflect.ClassTag import scala.reflect.runtime.universe._ // scalastyle:off object.name object implicits { type ZDS_R[R, A] = ZIO[R with SparkModule, Throwable, Dataset[A]] type ZDS[A] = ZDS_R[Any, A] type ZRDD_R[R, A] = ZIO[R, Throwable, RDD[A]] type ZRDD[A] = ZRDD_R[Any, A] type ZBC_R[R, A] = ZIO[R with SparkModule, Throwable, Broadcast[A]] type ZBC[A] = ZBC_R[Any, A] object ZDS { def map[A](f: SparkSession => Dataset[A]): ZDS[A] = SparkModule().map(spark => f(spark)) def flatMap[A](f: SparkSession => ZDS[A]): ZDS[A] = SparkModule().flatMap(spark => f(spark)) def flatMapR[R, A](f: SparkSession => ZDS_R[R, A]): ZDS_R[R, A] = SparkModule().flatMap(spark => f(spark)) def apply[A](f: SparkSession => Dataset[A]): ZDS[A] = ZDS.map(f) def make[A <: Product: TypeTag: ClassTag, B <: Product: TypeTag: ClassTag]( input: Dataset[A] )( f: Dataset[A] => Encoder[B] => Dataset[B] ): ZDS[B] = { ZDS { spark => f(input)(spark.implicits.newProductEncoder[B]) } } def apply[A <: Product: TypeTag: ClassTag](data: A*): ZDS[A] = { apply { spark => import spark.implicits._ data.toDS() } } def apply[A: Encoder](data: Seq[A]): ZDS[A] = { apply { spark => import spark.implicits._ data.toDS() } } def broadcast[A: ClassTag](f: SparkSession => A): ZBC[A] = { SparkModule().map(spark => spark.sparkContext.broadcast(f(spark))) } } implicit class DatasetZ[R, A](zds: => ZIO[R, Throwable, Dataset[A]]) extends Serializable { def zMap[B <: Product: TypeTag: ClassTag](f: A => ZIO[Any, Throwable, B]): ZDS_R[R, B] = { ZDS.flatMapR[R, B] { spark => import spark.implicits._ zds.map { ds => ds.map { a => val zB = f(a) val runtime = new BootstrapRuntime {} runtime.unsafeRun(zB) } } } } } } // scalastyle:on
Example 71
Source File: ResultTask.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  // TODO Run the task logic
  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // TODO Get the serializer
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // TODO Deserialize the task; this rdd is the first RDD of the stage, together with the function applied to it
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    // TODO Invoke the function on the RDD, pulling records from the iterator one at a time
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 72
Source File: ShuffleMapTask.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter def this(partitionId: Int) { this(0, null, new Partition { override def index = 0 }, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): MapStatus = { // Deserialize the RDD using the broadcast variable. val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) metrics = Some(context.taskMetrics) var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) return writer.stop(success = true).get } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 73
Source File: BaseTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.Row

abstract class BaseTimeSeriesGenerator
    extends Transformer
    with HasInputCol
    with HasOutputCol
    with HasTimeCol
    with DefaultParamsWritable
    with HasLabelCol
    with HasFeaturesCol {

  def convertRowToFloat(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toFloat(value)
    }
    // Expand the converted values into the Row, mirroring convertRowToDouble below.
    Row(values: _*)
  }

  def convertRowToDouble(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toDouble(value)
    }
    Row(values: _*)
  }

  def convertColumnToDouble(toBeTransformed: Row, colIndex: Broadcast[Int]): Row = {
    val (prior, after) = toBeTransformed.toSeq.splitAt(colIndex.value)
    val converted = DataTransformer.toDouble(toBeTransformed.get(colIndex.value))
    val result = (prior :+ converted.toDouble) ++ after.tail
    Row(result: _*)
  }
}
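convertColumnToDouble rebuilds each Row around a broadcast column index by splitting the row's values at that position, coercing the selected value, and reassembling. A self-contained sketch of that splitAt/rebuild pattern on an RDD of Rows; the column layout and values here are assumptions for illustration only.

import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}

object RowColumnConvertSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("row-convert").setMaster("local[*]"))
    val colIndex = sc.broadcast(1) // which column to coerce to Double
    val rows = sc.parallelize(Seq(Row("a", "1.5", true), Row("b", "2.0", false)))
    val converted = rows.map { row =>
      val (prior, after) = row.toSeq.splitAt(colIndex.value)
      val coerced = after.head.toString.toDouble
      Row((prior :+ coerced) ++ after.tail: _*)
    }
    converted.collect().foreach(println) // e.g. [a,1.5,true]
    sc.stop()
  }
}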
Example 74
Source File: HoltWintersBestModelEvaluation.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import eleflow.uberdata.enums.SupportedAlgorithm import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import scala.reflect.ClassTag abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]]( implicit kt: ClassTag[L], ord: Ordering[L] = null ) extends BestModelFinder[L, M] with HoltWintersParams { protected def holtWintersEvaluation( row: Row, model: UberHoltWintersModel, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]], id: L ): (UberHoltWintersModel, ModelParamEvaluation[L]) = { val features = row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) log.warn( s"Evaluating forecast for id $id, with parameters " + s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}" ) val expectedResult = row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol) val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures))) model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray val toBeValidated = expectedResult.toArray.zip(forecastToBeValidated.toArray) val metric = broadcastEvaluator.value.evaluate(toBeValidated) val metricName = broadcastEvaluator.value.getMetricName val params = ParamMap().put( ParamPair(gamma, model.gamma), ParamPair(beta, model.beta), ParamPair(alpha, model.alpha) ) (model, new ModelParamEvaluation[L]( id, metric, params, Some(metricName), SupportedAlgorithm.HoltWinters )) } }
Example 75
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]] extends BestModelFinder[G, M] with HasGroupByCol { protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast { StructType( Seq( StructField($(groupByCol).get, FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT)))) } protected def xGBoostEvaluation(row: Row, model: Booster, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]], id: G, parameters: ParamMap): ModelParamEvaluation[G] = { val featuresArray = row .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME) .map { vec => val values = vec.toArray.map(DataTransformer.toFloat) LabeledPoint(values.head, null, values.tail) } val features = new DMatrix(featuresArray.toIterator) log.warn(s"Evaluating forecast for id $id, with xgboost") val prediction = model.predict(features).flatten val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length) val toBeValidated = featuresArray.zip(forecastToBeValidated) val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f => (f._1.label.toDouble, f._2.toDouble))) val metricName = broadcastEvaluator.value.getMetricName new ModelParamEvaluation[G]( id, metric, parameters, Some(metricName), SupportedAlgorithm.XGBoostAlgorithm) } }