org.apache.spark.broadcast.Broadcast Scala Examples
The following examples show how to use org.apache.spark.broadcast.Broadcast.
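Before the examples, here is a minimal, self-contained sketch of the typical Broadcast life cycle: the driver creates the broadcast with SparkContext.broadcast, executors read it through .value, and unpersist releases the cached copies. The object name, the local[*] master, and the toy lookup table are illustrative only and are not taken from any of the projects listed below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.broadcast.Broadcast

object BroadcastQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("broadcast-quick-start").setMaster("local[*]"))
    // Ship a read-only lookup table to every executor once, instead of re-sending it with every task.
    val lookup: Broadcast[Map[String, Int]] = sc.broadcast(Map("a" -> 1, "b" -> 2))
    val total = sc.parallelize(Seq("a", "b", "c"))
      .map(key => lookup.value.getOrElse(key, 0)) // executors read the broadcast value; they never mutate it
      .reduce(_ + _)
    println(s"total = $total") // 3
    lookup.unpersist() // release executor-side copies once the value is no longer needed
    sc.stop()
  }
}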
Example 1
Source File: ResultTask.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 2
Source File: HingeAggregator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.aggregator

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg._

  def add(instance: Instance): this.type = {
    instance match { case Instance(label, weight, features) =>
      require(numFeatures == features.size, s"Dimensions mismatch when adding new instance." +
        s" Expecting $numFeatures but got ${features.size}.")
      require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")

      if (weight == 0.0) return this
      val localFeaturesStd = bcFeaturesStd.value
      val localCoefficients = coefficientsArray
      val localGradientSumArray = gradientSumArray

      val dotProduct = {
        var sum = 0.0
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            sum += localCoefficients(index) * value / localFeaturesStd(index)
          }
        }
        if (fitIntercept) sum += localCoefficients(numFeaturesPlusIntercept - 1)
        sum
      }
      // Our loss function with {0, 1} labels is max(0, 1 - (2y - 1) (f_w(x)))
      // Therefore the gradient is -(2y - 1)*x
      val labelScaled = 2 * label - 1.0
      val loss = if (1.0 > labelScaled * dotProduct) {
        (1.0 - labelScaled * dotProduct) * weight
      } else {
        0.0
      }

      if (1.0 > labelScaled * dotProduct) {
        val gradientScale = -labelScaled * weight
        features.foreachActive { (index, value) =>
          if (localFeaturesStd(index) != 0.0 && value != 0.0) {
            localGradientSumArray(index) += value * gradientScale / localFeaturesStd(index)
          }
        }
        if (fitIntercept) {
          localGradientSumArray(localGradientSumArray.length - 1) += gradientScale
        }
      }

      lossSum += loss
      weightSum += weight
      this
    }
  }
}
Example 3
Source File: RDDLossFunction.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.loss

import scala.reflect.ClassTag

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.DiffFunction

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregator
import org.apache.spark.rdd.RDD

private[ml] class RDDLossFunction[
    T: ClassTag,
    Agg <: DifferentiableLossAggregator[T, Agg]: ClassTag](
    instances: RDD[T],
    getAggregator: (Broadcast[Vector] => Agg),
    regularization: Option[DifferentiableRegularization[Vector]],
    aggregationDepth: Int = 2)
  extends DiffFunction[BDV[Double]] {

  override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = {
    val bcCoefficients = instances.context.broadcast(Vectors.fromBreeze(coefficients))
    val thisAgg = getAggregator(bcCoefficients)
    val seqOp = (agg: Agg, x: T) => agg.add(x)
    val combOp = (agg1: Agg, agg2: Agg) => agg1.merge(agg2)
    val newAgg = instances.treeAggregate(thisAgg)(seqOp, combOp, aggregationDepth)
    val gradient = newAgg.gradient
    val regLoss = regularization.map { regFun =>
      val (regLoss, regGradient) = regFun.calculate(Vectors.fromBreeze(coefficients))
      BLAS.axpy(1.0, regGradient, gradient)
      regLoss
    }.getOrElse(0.0)
    bcCoefficients.destroy(blocking = false)
    (newAgg.loss + regLoss, gradient.asBreeze.toDenseVector)
  }
}
Example 4
Source File: CommunityBasedPartitioning.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.operators.partitioning

import ml.sparkling.graph.api.operators.algorithms.community.CommunityDetection.{CommunityDetectionAlgorithm, CommunityDetectionMethod, ComponentID}
import ml.sparkling.graph.operators.partitioning.PropagationBasedPartitioning.DefaultPartitionOperator
import org.apache.log4j.Logger
import org.apache.spark.{Partitioner, SparkContext}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, PartitionID, PartitionStrategy, VertexId}

import scala.reflect.ClassTag

object CommunityBasedPartitioning {
  @transient val logger = Logger.getLogger(CommunityBasedPartitioning.getClass())

  def partitionGraphBy[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionMethod[VD, ED], numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    val numberOfPartitions = if (numParts == -1) sc.defaultParallelism else numParts
    val communities: Graph[ComponentID, ED] = communityDetectionMethod(graph)
    val numberOfCommunities = communities.vertices.values.countApproxDistinct()
    val (coarsedVertexMap, coarsedNumberOfPartitions) = ParallelPartitioningUtils.coarsePartitions(numberOfPartitions, numberOfCommunities, communities.vertices)
    val strategy = ByComponentIdPartitionStrategy(coarsedVertexMap, coarsedNumberOfPartitions, DefaultPartitionOperator)
    logger.info(s"Partitioning graph using coarsed map with ${coarsedVertexMap.size} entries and ${coarsedNumberOfPartitions} partitions")
    val out = graph.partitionBy(strategy, numberOfCommunities.toInt).cache()
    out.edges.foreachPartition((_) => {})
    out.vertices.foreachPartition((_) => {})
    out
  }

  def partitionGraphUsing[VD: ClassTag, ED: ClassTag](graph: Graph[VD, ED], communityDetectionMethod: CommunityDetectionAlgorithm, numParts: Int = -1)(implicit sc: SparkContext): Graph[VD, ED] = {
    partitionGraphBy(graph, communityDetectionMethod.detectCommunities[VD, ED](_), numParts)
  }
}
Example 5
Source File: ShortestPathLengthsFromCSV.scala From sparkling-graph with BSD 2-Clause "Simplified" License
package ml.sparkling.graph.examples

import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes
import ml.sparkling.graph.api.operators.algorithms.shortestpaths.ShortestPathsTypes._
import ml.sparkling.graph.operators.algorithms.shortestpaths.ShortestPathsAlgorithm
import ml.sparkling.graph.operators.algorithms.shortestpaths.pathprocessors.fastutils.FastUtilWithDistance.DataMap
import ml.sparkling.graph.operators.predicates.AllPathPredicate
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Graph, VertexId}

import scala.collection.JavaConversions._

object ShortestPathLengthsFromCSV extends ExampleApp {
  def body() = {
    val shortestPaths =
      if (bucketSize == -1l)
        ShortestPathsAlgorithm.computeShortestPathsLengths(partitionedGraph, AllPathPredicate, treatAsUndirected)
      else
        ShortestPathsAlgorithm.computeShortestPathsLengthsIterative(partitionedGraph, (g: Graph[_, _]) => bucketSize, treatAsUndirected)
    val size: Broadcast[VertexId] = ctx.broadcast(partitionedGraph.numVertices)
    partitionedGraph.outerJoinVertices(shortestPaths.vertices)(Util.dataTransformFunction(size) _).vertices.values.saveAsTextFile(out)
    ctx.stop()
  }
}

private object Util {
  def dataTransformFunction(size: Broadcast[VertexId])(vId: VertexId, oldValue: String, pathsOption: Option[_ >: DataMap <: JMap[JLong, JDouble]]) = {
    pathsOption.flatMap((paths) => {
      var entries = paths.entrySet().toList.sortBy(_.getKey)
      val out = new StringBuilder()
      out ++= s"${oldValue},"
      var a = 0l
      while (a < size.value) {
        if (entries.size > 0 && a == entries.head.getKey) {
          out ++= s"${entries.head.getValue},"
          entries = entries.drop(1)
        } else {
          out ++= "0,"
        }
        a += 1l
      }
      out.setLength(out.length - 1)
      Option(out.toString())
    }).getOrElse(oldValue)
  }
}
Example 6
Source File: FeatureExtraction.scala From meetup-stream with Apache License 2.0
package transformations

import scala.io.Source

import core._
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object FeatureExtraction {

  val localDictionary = Source
    .fromURL(getClass.getResource("/wordsEn.txt"))
    .getLines
    .zipWithIndex
    .toMap

  def breakToWords(description: String) = {
    val wordSelector = """[^\<\>\/]\b([a-zA-Z\d]{4,})\b""".r
    (wordSelector findAllIn description).map { _.trim.toLowerCase() }
  }

  def eventToVector(dictionary: Map[String, Int], description: String): Option[Vector] = {

    def popularWords(words: Iterator[String]) = {
      val initialWordCounts = collection.mutable.Map[String, Int]()
      val wordCounts = words.
        foldLeft(initialWordCounts) {
          case (wordCounts, word) => wordCounts + Tuple2(word, wordCounts.getOrElse(word, 0) + 1)
        }
      val wordsIndexes = wordCounts
        .flatMap {
          case (word, count) => dictionary.get(word).map { index => (index, count.toDouble) }
        }
      val topWords = wordsIndexes.toSeq.sortBy(-1 * _._2).take(10)
      topWords
    }

    val wordsIterator = breakToWords(description)
    val topWords = popularWords(wordsIterator)
    if (topWords.size == 10) Some(Vectors.sparse(dictionary.size, topWords)) else None
  }
}
Example 7
Source File: ResultTask.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int,
    internalAccumulators: Seq[Accumulator[Long]])
  extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // Record the deserialization start time.
    val deserializeStartTime = System.currentTimeMillis()
    // Get a closure serializer instance.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // Call ser.deserialize() to recover the RDD and the function from taskBinary.
    // Thread.currentThread.getContextClassLoader returns the context class loader of the current thread.
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    // Compute the deserialization time (_executorDeserializeTime).
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    // Record the task's taskMetrics.
    metrics = Some(context.taskMetrics)
    // Run the task: apply func to the iterator over this RDD partition.
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 8
Source File: NearestNeighbors.scala From SparkSMOTE with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer

object NearestNeighbors {

  def runNearestNeighbors(data: RDD[Array[(LabeledPoint, Int, Int)]],
      kNN: Int,
      sampleData: Array[(LabeledPoint, Int, Int)]): Array[(String, Array[((Int, Int), Double)])] = {

    val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_, _, kNN, sampleData)).groupByKey().map(x => (x._1, x._2.toArray.sortBy(r => r._2).take(kNN))).collect()

    globalNearestNeighborsByIndex
  }

  private def localNearestNeighbors(partitionIndex: Long,
      iter: Iterator[Array[(LabeledPoint, Int, Int)]],
      kNN: Int,
      sampleData: Array[(LabeledPoint, Int, Int)]): Iterator[(String, ((Int, Int), Double))] = {

    var result = List[(String, ((Int, Int), Double))]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataSize = sampleData.size - 1

    val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize + 1)(null)
    for {
      i1 <- 0 to sampleDataSize
    } kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN))

    for (i <- 0 to nLocal) {
      val currentPoint = dataArr(i)
      val features = currentPoint._1.features
      val rowId = currentPoint._3.toInt
      for (j <- 0 to sampleDataSize) {
        val samplePartitionId = sampleData(j)._2
        val sampleRowId = sampleData(j)._3
        val sampleFeatures = sampleData(j)._1.features
        if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) {
          val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features)))
          if (distance < max(kLocalNeighbors(j).distanceVector)) {
            val indexToReplace = argmax(kLocalNeighbors(j).distanceVector)
            kLocalNeighbors(j).distanceVector(indexToReplace) = distance
            kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId
          }
        }
      }
    }

    for (m <- 0 to sampleDataSize) {
      for (l <- 0 to kNN - 1) {
        val key = kLocalNeighbors(m).partitionId.toString + "," + kLocalNeighbors(m).sampleRowId.toString
        val tup = (partitionIndex.toInt, kLocalNeighbors(m).neighborRowId(l))
        result.::=(key, (tup, kLocalNeighbors(m).distanceVector(l)))
      }
    }
    result.iterator
  }
}
Example 9
Source File: loadData.scala From SparkSMOTE with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

  def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint, Int, Int)] = {
    val data = sc.textFile(path).filter { x => x.split(delimiter)(0).toDouble == 1.0 }.repartition(numPartitions).mapPartitions { x => Iterator(x.toArray) }
    val formatData = data.mapPartitionsWithIndex { (partitionId, iter) =>
      var result = List[(LabeledPoint, Int, Int)]()
      val dataArray = iter.next
      val dataArraySize = dataArray.size - 1
      var rowCount = dataArraySize
      for (i <- 0 to dataArraySize) {
        val parts = dataArray(i).split(delimiter)
        result.::=((LabeledPoint(parts(0).toDouble, DenseVector(parts.slice(1, numFeatures + 1)).map(_.toDouble)), partitionId.toInt, rowCount))
        rowCount = rowCount - 1
      }
      result.iterator
    }

    formatData
  }
}
Example 10
Source File: SMOTE.scala From SparkSMOTE with MIT License
package SMOTE

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import com.github.fommil.netlib.BLAS
import scala.util.Random
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import scala.collection.mutable.ArrayBuffer
import utils._

object SMOTE {

  def runSMOTE(sc: SparkContext,
      inPath: String,
      outPath: String,
      numFeatures: Int,
      oversamplingPctg: Double,
      kNN: Int,
      delimiter: String,
      numPartitions: Int): Unit = {

    val rand = new Random()

    val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions)

    val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache()

    val numObs = dataArray.map(x => x.size).reduce(_ + _)

    println("Number of Filtered Observations " + numObs.toString)

    val roundPctg = oversamplingPctg
    val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2, r._3)) // without replacement
    println("Sample Data Count " + sampleData.size.toString)

    val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData)

    var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt, x._1.split(",")(1).toInt, x._2(rand.nextInt(kNN)))).sortBy(r => (r._1, r._2))

    var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1))

    val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_, _, sampleDataNearestNeighbors, delimiter)).persist()
    println("Synthetic Data Count " + syntheticData.count.toString)
    val newData = syntheticData.union(sc.textFile(inPath))
    println("New Line Count " + newData.count.toString)
    newData.saveAsTextFile(outPath)
  }

  private def createSyntheticData(partitionIndex: Long,
      iter: Iterator[Array[(LabeledPoint, Int, Int)]],
      sampleDataNN: Array[(Int, Int, Int, LabeledPoint)],
      delimiter: String): Iterator[String] = {

    var result = List[String]()
    val dataArr = iter.next
    val nLocal = dataArr.size - 1
    val sampleDataNNSize = sampleDataNN.size - 1
    val rand = new Random()

    for (j <- 0 to sampleDataNNSize) {
      val partitionId = sampleDataNN(j)._1
      val neighborId = sampleDataNN(j)._3
      val sampleFeatures = sampleDataNN(j)._4.features

      if (partitionId == partitionIndex.toInt) {
        val currentPoint = dataArr(neighborId)
        val features = currentPoint._1.features
        sampleFeatures += (sampleFeatures - features) * rand.nextDouble
        result.::=("1.0" + delimiter + sampleFeatures.toArray.mkString(delimiter))
      }
    }
    result.iterator
  }
}
Example 11
Source File: RDDLossFunctionSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.loss

import org.apache.spark.SparkFunSuite
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD

class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext {

  @transient var instances: RDD[Instance] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    instances = sc.parallelize(Seq(
      Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)),
      Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)),
      Instance(2.0, 0.3, Vectors.dense(4.0, 0.5))
    ))
  }

  test("regularization") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val regLossFun = new L2Regularization(0.1, (_: Int) => true, None)
    val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value)
    val lossNoReg = new RDDLossFunction(instances, getAgg, None)
    val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun))

    val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector)
    val (regLoss, regGrad) = regLossFun.calculate(coefficients)
    val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector)

    BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad)
    assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5)
    assert(loss1 + regLoss === loss2)
  }

  test("empty RDD") {
    val rdd = sc.parallelize(Seq.empty[Instance])
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(rdd, getAgg, None)
    withClue("cannot calculate cost for empty dataset") {
      intercept[IllegalArgumentException] {
        lossFun.calculate(coefficients.asBreeze.toDenseVector)
      }
    }
  }

  test("versus aggregating on an iterable") {
    val coefficients = Vectors.dense(0.5, -0.1)
    val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value)
    val lossFun = new RDDLossFunction(instances, getAgg, None)
    val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector)

    // just map the aggregator over the instances array
    val agg = new TestAggregator(2)(coefficients)
    instances.collect().foreach(agg.add)

    assert(loss === agg.loss)
    assert(Vectors.fromBreeze(grad) === agg.gradient)
  }
}
Example 12
Source File: ShuffleMapTask.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter

  def this(partitionId: Int) {
    this(0, null, new Partition { override def index: Int = 0 }, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext): MapStatus = {
    // Deserialize the RDD using the broadcast variable.
    val deserializeStartTime = System.currentTimeMillis()
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime

    metrics = Some(context.taskMetrics)
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      return writer.stop(success = true).get
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
Example 13
Source File: OTBroadcastHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BroadcastHashJoin, BuildSide, HashJoin, HashedRelation}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent._
import scala.concurrent.duration._

case class OTBroadcastHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    buildSide: BuildSide,
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute().map(_.copy()).collect()
        val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[HashedRelation]]
    }
  }(BroadcastHashJoin.broadcastHashJoinExecutionContext)

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamedIter =>
      hashJoin(streamedIter, broadcastRelation.value)
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBroadcastHashJoin(leftKeys, rightKeys, buildSide, left, right)(
      controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 14
Source File: MTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class MTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient
  private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  val watcher = controller.getWatcher

  @transient
  private lazy val broadcastFuture = future {
    // Note that we use .execute().collect() because we don't want to convert data to Scala types
    val input: Array[Row] = buildPlan.execute()
      .mapPartitions(HashedSet(_, keyGenerator())).collect()
    prevBatch match {
      case None =>
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        // TODO: fix this integrity error by supporting join whose both branches may grow
        val hashed = HashedSet(input.iterator)
        val previous = controller.broadcasts((opId, bId)).value.asInstanceOf[JHashSet[Row]]
        if (!previous.containsAll(hashed)) {
          watcher += -1
          logError(s"Integrity Error in MTBLeftSemiHashJoin(Op $opId, Batch $currentBatch)")
        }
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = MTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 15
Source File: OTBLeftSemiHashJoin.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.online.joins

import java.util.{HashSet => JHashSet}

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.catalyst.expressions.{Expression, MutableProjection, Row}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnspecifiedDistribution}
import org.apache.spark.sql.execution.joins.{BuildRight, HashJoin}
import org.apache.spark.sql.execution.{BinaryNode, SparkPlan}
import org.apache.spark.sql.hive.online.{OTStateful, OnlineDataFrame, OpId}

import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent._
import scala.concurrent.duration._

case class OTBLeftSemiHashJoin(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression],
    left: SparkPlan,
    right: SparkPlan)(
    @transient val controller: OnlineDataFrame,
    @transient val trace: List[Int] = -1 :: Nil,
    opId: OpId = OpId.newOpId)
  extends BinaryNode with HashJoin with OTStateful {

  override val buildSide = BuildRight

  override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning

  override def requiredChildDistribution =
    UnspecifiedDistribution :: UnspecifiedDistribution :: Nil

  override def output = left.output

  @transient
  private[this] lazy val keyGenerator: () => MutableProjection =
    newMutableProjection(buildKeys, buildPlan.output)

  val timeout = {
    val timeoutValue = sqlContext.conf.broadcastTimeout
    if (timeoutValue < 0) {
      Duration.Inf
    } else {
      timeoutValue.seconds
    }
  }

  @transient
  private lazy val broadcastFuture = future {
    prevBatch match {
      case None =>
        // Note that we use .execute().collect() because we don't want to convert data to Scala types
        val input: Array[Row] = buildPlan.execute()
          .mapPartitions(HashedSet(_, keyGenerator())).collect()
        val hashed = HashedSet(input.iterator)
        val broadcast = sparkContext.broadcast(hashed)
        controller.broadcasts((opId, currentBatch)) = broadcast
        broadcast
      case Some(bId) =>
        controller.broadcasts((opId, bId)).asInstanceOf[Broadcast[JHashSet[Row]]]
    }
  }

  override def doExecute() = {
    val broadcastRelation: Broadcast[JHashSet[Row]] = Await.result(broadcastFuture, timeout)

    streamedPlan.execute().mapPartitions { streamIter =>
      val hashSet = broadcastRelation.value
      val joinKeys = streamSideKeyGenerator()
      streamIter.filter(current => {
        !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue)
      })
    }
  }

  override protected final def otherCopyArgs = controller :: trace :: opId :: Nil

  override def simpleString = s"${super.simpleString} $opId"

  override def newBatch(newTrace: List[Int]): SparkPlan = {
    val join = OTBLeftSemiHashJoin(leftKeys, rightKeys, left, right)(controller, newTrace, opId)
    join.broadcastFuture
    join
  }
}
Example 16
Source File: ResultTask.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    stageAttemptId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    locs: Seq[TaskLocation],
    val outputId: Int,
    localProperties: Properties,
    metrics: TaskMetrics,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId,
    appId, appAttemptId)
  with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  override def runTask(context: TaskContext, user: String): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get(user).closureSerializer.newInstance()
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L

    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 17
Source File: RRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)

    runner.compute(parentIterator, partition.index)
  }
}

  def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
    PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
  }
}
Example 18
Source File: MapPartitionsRWrapper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.r

import org.apache.spark.api.r._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.api.r.SQLUtils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

case class MapPartitionsRWrapper(
    func: Array[Byte],
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]],
    inputSchema: StructType,
    outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) {

  def apply(iter: Iterator[Any]): Iterator[Any] = {
    // If the content of current DataFrame is serialized R data?
    val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false

    val (newIter, deserializer, colNames) =
      if (!isSerializedRData) {
        // Serialize each row into a byte array that can be deserialized in the R worker
        (iter.asInstanceOf[Iterator[Row]].map { row => rowToRBytes(row) },
          SerializationFormats.ROW, inputSchema.fieldNames)
      } else {
        (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null)
      }

    val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) {
      SerializationFormats.ROW
    } else {
      SerializationFormats.BYTE
    }

    val runner = new RRunner[Array[Byte]](
      func, deserializer, serializer, packageNames, broadcastVars,
      isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY)
    // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex.
    val outputIter = runner.compute(newIter, -1)

    if (serializer == SerializationFormats.ROW) {
      outputIter.map { bytes => bytesToRow(bytes, outputSchema) }
    } else {
      outputIter.map { bytes => Row.fromSeq(Seq(bytes)) }
    }
  }
}
Example 19
Source File: RepartitionedOrderedRDD2.scala From hail with MIT License
package is.hail.sparkextras

import is.hail.annotations._
import is.hail.rvd.{PartitionBoundOrdering, RVD, RVDContext, RVDPartitioner, RVDType}
import is.hail.utils._
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

class OrderedDependency[T](
    oldPartitionerBc: Broadcast[RVDPartitioner],
    newIntervalListBc: Broadcast[IndexedSeq[Interval]],
    rdd: RDD[T]
) extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): Seq[Int] =
    oldPartitionerBc.value.queryInterval(newIntervalListBc.value(partitionId))
}

object RepartitionedOrderedRDD2 {
  def apply(prev: RVD, newRangeBounds: IndexedSeq[Interval]): ContextRDD[Long] =
    ContextRDD(new RepartitionedOrderedRDD2(prev, newRangeBounds))
}

class RepartitionedOrderedRDD2 private (prev: RVD, newRangeBounds: IndexedSeq[Interval])
  extends RDD[ContextRDD.ElementType[Long]](prev.crdd.sparkContext, Nil) { // Nil since we implement getDependencies

  val prevCRDD: ContextRDD[Long] = prev.boundary.crdd
  val typ: RVDType = prev.typ
  val kOrd: ExtendedOrdering = PartitionBoundOrdering(typ.kType.virtualType)

  val oldPartitionerBc: Broadcast[RVDPartitioner] = prev.partitioner.broadcast(prevCRDD.sparkContext)
  val newRangeBoundsBc: Broadcast[IndexedSeq[Interval]] = prevCRDD.sparkContext.broadcast(newRangeBounds)

  require(newRangeBounds.forall { i =>
    typ.kType.virtualType.relaxedTypeCheck(i.start) && typ.kType.virtualType.relaxedTypeCheck(i.end)
  })

  def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](newRangeBoundsBc.value.length) { i =>
      RepartitionedOrderedRDD2Partition(
        i,
        dependency.getParents(i).toArray.map(prevCRDD.partitions),
        newRangeBoundsBc.value(i))
    }
  }

  override def compute(partition: Partition, context: TaskContext): Iterator[RVDContext => Iterator[Long]] = {
    val ordPartition = partition.asInstanceOf[RepartitionedOrderedRDD2Partition]
    val pord = kOrd.intervalEndpointOrdering
    val range = ordPartition.range
    val ur = new UnsafeRow(typ.rowType)
    val key = new SelectFieldsRow(ur, typ.kFieldIdx)

    Iterator.single { (ctx: RVDContext) =>
      ordPartition.parents.iterator
        .flatMap { parentPartition =>
          prevCRDD.iterator(parentPartition, context).flatMap(_(ctx))
        }.dropWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lt(key, range.left)
        }.takeWhile { ptr =>
          ur.set(ctx.r, ptr)
          pord.lteq(key, range.right)
        }
    }
  }

  val dependency = new OrderedDependency(oldPartitionerBc, newRangeBoundsBc, prevCRDD.rdd)

  override def getDependencies: Seq[Dependency[_]] = FastSeq(dependency)
}

case class RepartitionedOrderedRDD2Partition(
    index: Int,
    parents: Array[Partition],
    range: Interval
) extends Partition
Example 20
Source File: FalseLikes.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scala.util.Random

object FalseLikes {

  def numLikeUsersDistributionArray(trainPostsRDD: RDD[(BlogPost, Set[Long])]): Array[Int] = {
    val bins = (0 to 100).toArray.map(_.toDouble)
    bins.zip(trainPostsRDD.map(_._2.size).histogram(bins)).flatMap {
      case (bin, count) => Array.fill(count.toInt)(bin.toInt)
    }
  }

  def blogPostsWithNonLikeUsers(trainPostsRDD: RDD[(BlogPost, Set[Long])],
                                numLikeUsersDistributionArrayBV: Broadcast[Array[Int]],
                                userIds: Broadcast[Set[Long]]): RDD[(BlogPost, Set[Long])] =
    trainPostsRDD.map {
      case (blogPost, users) =>
        val sum = numLikeUsersDistributionArrayBV.value.groupBy(identity).mapValues(_.length).values.sum
        val randomNumber: Int = Random.nextInt(sum.toInt)
        val nUsers = numLikeUsersDistributionArrayBV.value(randomNumber)
        val nonLikeUsers: Array[Long] = (userIds.value -- users).toArray
        blogPost -> Array.fill(nUsers)(nonLikeUsers(Random.nextInt(nonLikeUsers.length))).toSet
    }
}
Example 21
Source File: QueryHamming.scala From cosine-lsh-join-spark with MIT License
package com.soundcloud.lsh

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRowMatrix, MatrixEntry}
import org.apache.spark.rdd.RDD

class QueryHamming(minCosineSimilarity: Double,
                   dimensions: Int,
                   resultSize: Int,
                   broadcastCatalog: Boolean = true)
  extends QueryJoiner with Serializable {

  override def join(queryMatrix: IndexedRowMatrix, catalogMatrix: IndexedRowMatrix): CoordinateMatrix = {
    val numFeatures = queryMatrix.numCols().toInt
    val randomMatrix = localRandomMatrix(dimensions, numFeatures)
    val querySignatures = matrixToBitSetSparse(queryMatrix, randomMatrix)
    val catalogSignatures = matrixToBitSetSparse(catalogMatrix, randomMatrix)

    var rddSignatures: RDD[SparseSignature] = null
    var broadcastSignatures: Broadcast[Array[SparseSignature]] = null

    if (broadcastCatalog) {
      rddSignatures = querySignatures
      broadcastSignatures = querySignatures.sparkContext.broadcast(catalogSignatures.collect)
    } else {
      rddSignatures = catalogSignatures
      broadcastSignatures = catalogSignatures.sparkContext.broadcast(querySignatures.collect)
    }

    val approximated = rddSignatures.mapPartitions { rddSignatureIterator =>
      val signaturesBC = broadcastSignatures.value
      rddSignatureIterator.flatMap { rddSignature =>
        signaturesBC.map { broadCastSignature =>
          val approximatedCosine = hammingToCosine(hamming(rddSignature.bitSet, broadCastSignature.bitSet), dimensions)
          if (broadcastCatalog)
            new MatrixEntry(rddSignature.index, broadCastSignature.index, approximatedCosine)
          else
            new MatrixEntry(broadCastSignature.index, rddSignature.index, approximatedCosine)
        }.filter(_.value >= minCosineSimilarity).sortBy(-_.value).take(resultSize)
      }
    }
    broadcastSignatures.unpersist(true)

    new CoordinateMatrix(approximated)
  }
}
Example 22
Source File: CompareTest.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.spark.compare

import hammerlab.bytes._
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.{ MaxReadSize, ReadsToCheck }
import org.hammerlab.bam.spark.Split
import org.hammerlab.bam.test.resources.bam1
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.BGZFBlocksToCheck
import org.hammerlab.hadoop.Configuration
import org.hammerlab.hadoop.splits.MaxSplitSize
import org.hammerlab.spark.test.suite.SparkSuite
import shapeless.LabelledGeneric

class CompareTest extends SparkSuite {

  val lg = LabelledGeneric[Result]

  def check(actual: Result, expected: Result): Unit = {
    actual.copy(hadoopBamMS = 0, sparkBamMS = 0) should be(expected)
  }

  implicit lazy val confBroadcast: Broadcast[Configuration] = sc.broadcast(ctx)

  test("230kb") {
    implicit val splitSize = MaxSplitSize(230.KB)

    val actual = Result(bam1)

    val expected =
      Result(
        3,
        3,
        Vector(
          Right(
            Split(
              Pos(239479, 311),
              Pos(471040, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479, 312),
              Pos(484396, 25)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )

    check(actual, expected)
  }

  test("115KB") {
    implicit val splitSize = MaxSplitSize(115.KB)

    check(
      Result(bam1),
      Result(
        5,
        5,
        Vector(
          Right(
            Split(
              Pos(239479, 311),
              Pos(353280, 65535)
            )
          ),
          Left(
            Split(
              Pos(239479, 312),
              Pos(361204, 42)
            )
          )
        ),
        1,
        1,
        0,  // dummy value, timing values not checked
        0   // dummy value, timing values not checked
      )
    )
  }
}
Example 23
Source File: IndexedRecordPositions.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check.indexed

import caseapp.{ ValueDescription, HelpMessage ⇒ M, Name ⇒ O }
import hammerlab.path._
import magic_rdds.ordered._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bgzf.Pos
import org.hammerlab.magic.rdd.ordered.SortedRDD
import org.hammerlab.magic.rdd.ordered.SortedRDD.{ Bounds, bounds }

import scala.collection.immutable.SortedSet

  def apply(path: Path)(
      implicit
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]]
  ): IndexedRecordPositions = {
    val reads =
      sc
        .textFile(path.toString)
        .map(
          line ⇒
            line.split(",") match {
              case Array(a, b) ⇒
                Pos(a.toLong, b.toInt)
              case _ ⇒
                throw new IllegalArgumentException(
                  s"Bad record-pos line: $line"
                )
            }
        )
        .filter {
          case Pos(blockPos, _) ⇒
            rangesBroadcast
              .value
              .forall(_.contains(blockPos))
        }
        .cache

    IndexedRecordPositions(
      reads,
      bounds(reads)
    )
  }
}
Example 24
Source File: BlocksAndIndexedRecords.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check.indexed

import hammerlab.path._
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.hammerlab.args.ByteRanges
import org.hammerlab.bam.check.Blocks
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.Metadata
import org.hammerlab.kryo.Registrar

import scala.collection.immutable.SortedSet
import scala.reflect.ClassTag

case class BlocksAndIndexedRecords(blocks: RDD[Metadata], records: RDD[SortedSet[Pos]])

object BlocksAndIndexedRecords extends Registrar {

  def apply[U: ClassTag]()(
      implicit
      path: Path,
      sc: SparkContext,
      rangesBroadcast: Broadcast[Option[ByteRanges]],
      blockArgs: Blocks.Args,
      recordArgs: IndexedRecordPositions.Args
  ): BlocksAndIndexedRecords = {

    val Blocks(blocks, bounds) = Blocks()

    val posBounds =
      bounds
        .copy(
          partitions =
            bounds
              .partitions
              .map {
                _.map {
                  case (start, endOpt) ⇒
                    (
                      Pos(start, 0),
                      endOpt.map(Pos(_, 0))
                    )
                }
              }
        )

    val indexedRecords = IndexedRecordPositions(recordArgs.path)

    val repartitionedRecords = indexedRecords.toSets(posBounds)

    BlocksAndIndexedRecords(
      blocks,
      repartitionedRecords
    )
  }

  register(
    Blocks
  )
}
Example 25
Source File: PosMetadata.scala From spark-bam with Apache License 2.0
package org.hammerlab.bam.check

import hammerlab.show._
import htsjdk.samtools.{ BAMRecord, SAMFileHeader, SAMRecord, ValidationStringency }
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.bam.check.full.error.Flags
import org.hammerlab.bam.header.{ ContigLengths, Header }
import org.hammerlab.bam.iterator.RecordStream
import org.hammerlab.bam.spark.FindRecordStart
import org.hammerlab.bgzf.Pos
import org.hammerlab.bgzf.block.SeekableUncompressedBytes

case class PosMetadata(pos: Pos, recordOpt: Option[NextRecord], flags: Flags)

object PosMetadata {

  implicit def defaultShow(implicit showRecord: Show[SAMRecord]): Show[PosMetadata] =
    Show {
      case PosMetadata(pos, recordOpt, flags) ⇒
        show"$pos:\t$recordOpt. Failing checks: $flags"
    }

  implicit def showNextRecordOpt(implicit showNextRecord: Show[NextRecord]): Show[Option[NextRecord]] =
    Show {
      case Some(nextRecord) ⇒ nextRecord.show
      case None ⇒ "no next record"
    }

  def recordPos(record: SAMRecord)(implicit contigLengths: ContigLengths): String =
    s"${contigLengths(record.getReferenceIndex)._1}:${record.getStart}"

  implicit def showRecord(implicit contigLengths: ContigLengths): Show[SAMRecord] =
    Show {
      record ⇒
        record
          .toString
          .dropRight(1) +  // remove trailing period
          (
            // Append info about mapped/placed location
            if (
              record.getReadUnmappedFlag &&
              record.getStart >= 0 &&
              record.getReferenceIndex >= 0 &&
              record.getReferenceIndex < contigLengths.size
            )
              s" (placed at ${recordPos(record)})"
            else if (!record.getReadUnmappedFlag)
              s" @ ${recordPos(record)}"
            else
              ""
          )
    }

  def apply(pos: Pos, flags: Flags)(
      implicit
      uncompressedBytes: SeekableUncompressedBytes,
      header: Broadcast[Header],
      readsToCheck: ReadsToCheck,
      maxReadSize: MaxReadSize
  ): PosMetadata = {
    implicit val contigLengths = header.value.contigLengths
    PosMetadata(
      pos,
      {
        FindRecordStart
          .withDelta(pos)
          .map {
            case (nextRecordPos, delta) ⇒
              uncompressedBytes.seek(nextRecordPos)
              NextRecord(
                RecordStream(
                  uncompressedBytes,
                  header.value
                )
                .next()
                ._2,
                delta
              )
          }
      },
      flags
    )
  }

  import org.hammerlab.kryo._
  import org.hammerlab.bam.kryo.registerSAMFileHeader

  implicit val alsoRegister: AlsoRegister[PosMetadata] =
    AlsoRegister(
      cls[NextRecord],
      cls[BAMRecord],
      cls[ValidationStringency],
      cls[SAMFileHeader]
    )
}
Example 26
Source File: Predictor.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.{DataBlock, LabeledData}
import org.apache.spark.broadcast.Broadcast
import com.tencent.angel.sona.ml.common.MathImplicits._
import com.tencent.angel.sona.core.{AngelGraphModel, ExecutorContext}
import com.tencent.angel.sona.data.LocalMemoryDataBlock
import org.apache.spark.linalg
import org.apache.spark.linalg.Vectors
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}
import org.apache.spark.sql.{Row, SPKSQLUtils}

import scala.collection.mutable.ListBuffer

class Predictor(bcValue: Broadcast[ExecutorContext],
                featIdx: Int,
                predictionCol: String,
                probabilityCol: String,
                bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  @transient private lazy implicit val dim: Long = {
    executorContext.conf.getLong(MLCoreConf.ML_FEATURE_INDEX_RANGE)
  }

  @transient private lazy val appendedSchema: StructType = if (probabilityCol.nonEmpty) {
    new StructType(Array[StructField](StructField(probabilityCol, DoubleType),
      StructField(predictionCol, DoubleType)))
  } else {
    new StructType(Array[StructField](StructField(predictionCol, DoubleType)))
  }

  def predictRDD(data: Iterator[Row]): Iterator[Row] = {
    val localModel = executorContext.borrowModel(bcConf.value)
    val batchSize = 1024
    val storage = new LocalMemoryDataBlock(batchSize, batchSize * 1024 * 1024)

    var count = 0
    val cachedRows: Array[Row] = new Array[Row](batchSize)
    val result: ListBuffer[Row] = ListBuffer[Row]()
    data.foreach {
      case row if count != 0 && count % batchSize == 0 =>
        predictInternal(localModel, storage, cachedRows, result)

        storage.clean()
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
      case row =>
        storage.put(new LabeledData(row.get(featIdx).asInstanceOf[linalg.Vector], 0.0))
        cachedRows(count % batchSize) = row
        count += 1
    }

    predictInternal(localModel, storage, cachedRows, result)

    executorContext.returnModel(localModel)

    result.toIterator
  }

  private def predictInternal(model: AngelGraphModel,
                              storage: DataBlock[LabeledData],
                              cachedRows: Array[Row],
                              result: ListBuffer[Row]): Unit = {
    val predicted = model.predict(storage)

    if (appendedSchema.length == 1) {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.pred))
      }
    } else {
      predicted.zipWithIndex.foreach {
        case (res, idx) =>
          result.append(SPKSQLUtils.append(cachedRows(idx), appendedSchema, res.proba, res.predLabel))
      }
    }
  }

  def predictRaw(features: linalg.Vector): linalg.Vector = {
    val localModel = executorContext.borrowModel(bcConf.value)

    val res = localModel.predict(new LabeledData(features, 0.0))

    executorContext.returnModel(localModel)
    Vectors.dense(res.pred, -res.pred)
  }
}
Example 27
Source File: Trainer.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.mlcore.conf.{MLCoreConf, SharedConf}
import com.tencent.angel.ml.math2.utils.LabeledData
import com.tencent.angel.sona.core.ExecutorContext
import com.tencent.angel.sona.util.ConfUtils
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.evaluation.training._
import org.apache.spark.broadcast.Broadcast

class Trainer(bcValue: Broadcast[ExecutorContext], epoch: Int, bcConf: Broadcast[SharedConf]) extends Serializable {

  @transient private lazy val executorContext: ExecutorContext = {
    bcValue.value
  }

  def trainOneBatch(data: Array[LabeledData]): TrainingStat = {
    val localRunStat: TrainingStat = executorContext.conf.get(ConfUtils.ALGO_TYPE) match {
      case "class" =>
        // new ClassificationTrainingStat(executorContext.conf.getInt(MLCoreConf.ML_NUM_CLASS))
        new ClassificationTrainingStat(bcConf.value.getInt(MLCoreConf.ML_NUM_CLASS))
      case "regression" =>
        new RegressionTrainingStat()
      case "clustering" =>
        new ClusteringTrainingStat()
    }

    // This code runs inside a task on an executor.
    val localModel = executorContext.borrowModel(bcConf.value)
    val graph = localModel.graph

    graph.feedData(data)
    localRunStat.setNumSamples(data.length)

    // note: this step is synchronized
    val pullStart = System.currentTimeMillis()
    if (bcConf.value.getBoolean(MLCoreConf.ML_IS_DATA_SPARSE)) {
      localModel.pullParams(epoch, graph.placeHolder.getIndices)
    } else {
      localModel.pullParams(epoch)
    }
    val pullFinished = System.currentTimeMillis()
    localRunStat.setPullTime(pullFinished - pullStart)

    val forwardStart = System.currentTimeMillis()
    val avgLoss = graph.calForward()
    graph.predict().foreach { pres =>
      localRunStat.add(pres)
    }
    localRunStat.setAvgLoss(avgLoss)
    val forwardFinished = System.currentTimeMillis()
    localRunStat.setForwardTime(forwardFinished - forwardStart)

    val backwardStart = System.currentTimeMillis()
    graph.calBackward()
    val backwardFinished = System.currentTimeMillis()
    localRunStat.setBackwardTime(backwardFinished - backwardStart)

    // note: this step is asynchronous
    val pushStart = System.currentTimeMillis()
    localModel.pushGradient(0.1)
    val pushFinished = System.currentTimeMillis()
    localRunStat.setPushTime(pushFinished - pushStart)

    executorContext.returnModel(localModel)

    localRunStat
  }
}
Example 28
Source File: AngelSparkModel.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.common

import com.tencent.angel.client.AngelPSClient
import com.tencent.angel.mlcore.conf.SharedConf
import com.tencent.angel.sona.core.{AngelGraphModel, DriverContext, ExecutorContext, SparkMasterContext}
import com.tencent.angel.sona.ml.evaluation.TrainingStat
import com.tencent.angel.sona.ml.param.{AngelGraphParams, Params}
import org.apache.spark.broadcast.Broadcast

trait AngelSparkModel extends Params with AngelGraphParams {
  val angelModelName: String

  var numTask: Int = -1

  @transient var bcValue: Broadcast[ExecutorContext] = _
  @transient var bcConf: Broadcast[SharedConf] = _

  @transient implicit val psClient: AngelPSClient = synchronized {
    DriverContext.get().getAngelClient
  }

  @transient lazy val sparkEnvContext: SparkMasterContext = synchronized {
    DriverContext.get().sparkMasterContext
  }

  @transient implicit lazy val dim: Long = getNumFeature

  @transient lazy val angelModel: AngelGraphModel = {
    require(numTask == -1 || numTask > 0, "Please set numTask before use angelModel")
    new AngelGraphModel(sharedConf, numTask)
  }

  @transient private var trainingSummary: Option[TrainingStat] = None

  def setSummary(summary: Option[TrainingStat]): this.type = {
    this.trainingSummary = summary
    this
  }

  def hasSummary: Boolean = trainingSummary.isDefined

  def summary: TrainingStat = trainingSummary.getOrElse {
    throw new Exception("No training summary available for this AngelClassifierModel")
  }

  def setNumTask(numTask: Int): this.type = {
    this.numTask = numTask
    psClient.setTaskNum(numTask)

    this
  }

  def setBCValue(bcValue: Broadcast[ExecutorContext]): this.type = {
    this.bcValue = bcValue

    this
  }
}
Example 29
Source File: Features.scala From wordpress-posts-recommender with Apache License 2.0
package wordpressworkshop

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD._

import scalaz.Scalaz._

case class Features(categoriesLikelihood: Double, tagsLikelihood: Double, languageLikelihood: Double,
                    authorLikelihood: Double, titleLengthMeanError: Double, blogLikelihood: Double,
                    averageLikesPerPost: Double)

case object Features {
  def blogIdToPriorBlogLikelihoodBV(statsUserRDD: RDD[StatsUser]): Map[Long, Double] =
    (statsUserRDD.map {
      case StatsUser(_, numLikes: Long, likeBlogs: Map[Long, Long]) => (likeBlogs, numLikes)
    }.reduce(_ |+| _) match {
      case (likeBlogs, numLikes) => likeBlogs.mapValues(_.toDouble / numLikes).map(identity)
    }).withDefaultValue(0.0)

  def meanBlogLikesPerPost(statsBlogRDD: RDD[StatsBlog]): Double =
    statsBlogRDD.map {
      case StatsBlog(_, numLikes: Long, numPosts: Long) => (numLikes, numPosts)
    }.reduce(_ |+| _) match {
      case (numLikes, numPosts) => numLikes.toDouble / numPosts
    }

  def userIdToOtherLikelihoodMaps(trainPostsRDD: RDD[(BlogPost, Set[Long])]):
  RDD[(Long, (Map[String, Int], Map[String, Int], Map[String, Int], Map[Long, Int], Map[Int, Int]))] =
    (for {
      (blogPost, users) <- trainPostsRDD
      userId <- users
    } yield userId -> (blogPost.categories.map(_ -> 1).toMap,
        blogPost.tags.map(_ -> 1).toMap,
        Map(blogPost.language -> 1),
        Map(blogPost.authorId -> 1),
        Map(blogPost.title.map(_.split("[^\\w']+").size).getOrElse(0) -> 1)))
      .reduceByKey(_ |+| _)
      .mapValues {
        case (categoriesLikelihoodMap, tagsLikelihoodMap, languageLikelihoodMap, authorLikelihoodMap,
        titleLengthLikelihoodMap) =>
          (categoriesLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            tagsLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            languageLikelihoodMap,
            authorLikelihoodMap.toList.sortBy(_._2).takeRight(100).toMap,
            titleLengthLikelihoodMap)
      }

  def likelihoodSet(map: Map[String, Int], labels: Set[String]): Double =
    labels.flatMap(map.get).sum.toDouble / map.values.sum

  def likelihoodInt[K](map: Map[K, Int], label: K): Double =
    map.getOrElse(label, 0).toDouble / map.values.sum

  def likelihoodDouble[K](map: Map[K, Double], label: K): Double =
    map.getOrElse(label, 0.0) / map.values.sum

  def features(blogPostsAndUsers: RDD[(BlogPost, Set[Long])],
               userIdToOtherLikelihoodMaps: Broadcast[Map[Long, (Map[String, Int], Map[String, Int], Map[String, Int], Map[Long, Int], Map[Int, Int])]],
               userIdToBlogLikelihood: Broadcast[Map[Long, Map[Long, Double]]],
               blogIdToPriorBlogLikelihoodBV: Broadcast[Map[Long, Double]],
               blogIdToAverageLikesPerPostBV: Broadcast[Map[Long, Double]],
               meanBlogLikesPerPost: Double) =
    for {
      (post, users) <- blogPostsAndUsers
      blogId = post.blogId
      postId = post.postId
      averageLikesPerPost = blogIdToAverageLikesPerPostBV.value.getOrElse(post.blogId, meanBlogLikesPerPost)
      userId <- users
      (categoriesLikelihoodMap, tagsLikelihoodMap, languageLikelihoodMap, authorLikelihoodMap,
      titleLengthLikelihoodMap) = userIdToOtherLikelihoodMaps.value(userId)
      titleLengthAverage = titleLengthLikelihoodMap.values.sum.toDouble / titleLengthLikelihoodMap.size
      blogLikelihoodMapOption = userIdToBlogLikelihood.value.get(userId)
      blogLikelihoodMap = blogLikelihoodMapOption.getOrElse(blogIdToPriorBlogLikelihoodBV.value)
    } yield (userId, post.postId) -> Features(
      categoriesLikelihood = likelihoodSet(categoriesLikelihoodMap, post.categories),
      tagsLikelihood = likelihoodSet(tagsLikelihoodMap, post.tags),
      languageLikelihood = likelihoodInt(languageLikelihoodMap, post.language),
      authorLikelihood = likelihoodInt(authorLikelihoodMap, post.authorId),
      titleLengthMeanError = math.abs(titleLengthAverage - post.title.map(_.split("[^\\w']+").size).getOrElse(0)),
      blogLikelihood = likelihoodDouble(blogLikelihoodMap, post.blogId),
      averageLikesPerPost = averageLikesPerPost
    )
}
Example 30
Source File: BroadcastSimple.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.test

import org.apache.spark.broadcast.Broadcast
import reforest.rf.RFCategoryInfo
import reforest.util.{GCInstrumented, GCInstrumentedEmpty}
import reforest.{TypeInfo, TypeInfoDouble, TypeInfoInt}
import test.RFResourceFactory

import scala.reflect.ClassTag

class BroadcastSimple[T: ClassTag](v: T) extends Broadcast[T](0) {
  override def value: T = v

  override def getValue(): T = v

  override def doDestroy(blocking: Boolean) = {}

  override def doUnpersist(blocking: Boolean) = {}
}

object BroadcastSimple {
  val typeInfoInt = new BroadcastSimple[TypeInfoInt](new TypeInfoInt(false, -100))
  val typeInfoDouble: Broadcast[TypeInfo[Double]] = new BroadcastSimple[TypeInfo[Double]](new TypeInfoDouble(false, -100))
  val gcInstrumentedEmpty: Broadcast[GCInstrumented] = new BroadcastSimple[GCInstrumented](new GCInstrumentedEmpty)
  val categoryInfoEmpty: Broadcast[RFCategoryInfo] = new BroadcastSimple[RFCategoryInfo](RFResourceFactory.getCategoricalInfo)
}
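BroadcastSimple wraps a plain value in the Broadcast interface so that code written against Broadcast[T] can be exercised in unit tests without a SparkContext. A minimal sketch of that use, placed next to the class above; the object and function names below are illustrative and not part of the reforest project:

import org.apache.spark.broadcast.Broadcast

object BroadcastSimpleSketch {
  // Any function that only reads bcOffset.value behaves the same with a real
  // broadcast produced by sc.broadcast(...) or with a BroadcastSimple stub.
  def shift(values: Seq[Double], bcOffset: Broadcast[Double]): Seq[Double] =
    values.map(_ + bcOffset.value)

  def main(args: Array[String]): Unit = {
    val bcOffset: Broadcast[Double] = new BroadcastSimple[Double](10.0)
    println(shift(Seq(1.0, 2.0, 3.0), bcOffset)) // List(11.0, 12.0, 13.0)
  }
}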
Example 31
Source File: Ledger.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.word.layer import breeze.linalg.DenseVector import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import com.github.nearbydelta.deepspark.word._ import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.reflect.{ClassTag, classTag} trait Ledger[OutInfo] extends InputLayer[Array[Int], OutInfo] { @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]] @transient var algorithm: LedgerAlgorithm = _ var bcModel: Broadcast[LedgerModel] = _ @transient var builder: LedgerBuilder = _ var dimension: Int = 0 @transient var model: LedgerModel = _ protected var padID = -1 def withModel(model: LedgerModel, builder: LedgerBuilder): this.type = { this.model = model this.builder = builder this.padID = model.padID this.dimension = model.dimension this.algorithm = builder.getUpdater(this.model.vectors) this } protected def pad = if (padID == -1) null else if (bcModel != null) vectorOf(bcModel.value.padID) else vectorOf(padID) protected def updateWord(word: Int, dx: DataVec): Unit = if (word != -1 && algorithm != null) { val vec = algorithm.delta.getOrElseUpdate(word, DenseVector.zeros[Double](dimension)) vec += dx } protected def vectorOf(str: Int) = if (bcModel != null) bcModel.value.vectorAt(str) else model.vectorAt(str) override def broadcast(sc: SparkContext): Unit = { bcModel = sc.broadcast(model) } override def loss: Double = algorithm.loss override def read(kryo: Kryo, input: Input): Unit = { builder = kryo.readClassAndObject(input).asInstanceOf[LedgerBuilder] val model = new LedgerModel model.read(kryo, input) require(model.size > 0, "Model is empty!") withModel(model, builder) super.read(kryo, input) } override def unbroadcast(): Unit = { bcModel.unpersist(blocking = false) } @deprecated override def withInput(in: Int): this.type = this @deprecated override def withOutput(out: Int): this.type = this override def write(kryo: Kryo, output: Output): Unit = { kryo.writeClassAndObject(output, builder) model.write(kryo, output) super.write(kryo, output) } }
Example 32
Source File: FixedLedger.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.word.layer import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import com.github.nearbydelta.deepspark.word._ import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.parallel.ParSeq import scala.reflect.{ClassTag, classTag} trait FixedLedger[OutInfo] extends InputLayer[Array[Int], OutInfo] { @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]] var bcModel: Broadcast[LedgerModel] = _ @transient var model: LedgerModel = _ protected var padID = -1 def withModel(model: LedgerModel): this.type = { this.model = model this.padID = model.padID this } protected def pad = if (padID == -1) null else if (bcModel != null) vectorOf(bcModel.value.padID) else vectorOf(padID) protected def vectorOf(str: Int) = if (bcModel != null) bcModel.value.vectorAt(str) else model.vectorAt(str) override def backprop(seq: ParSeq[((Array[Int], OutInfo), DataVec)]): (ParSeq[DataVec], ParSeq[() ⇒ Unit]) = (null, ParSeq()) override def broadcast(sc: SparkContext): Unit = { bcModel = sc.broadcast(model) } override def loss: Double = 0.0 override def read(kryo: Kryo, input: Input): Unit = { val model = new LedgerModel model.read(kryo, input) withModel(model) super.read(kryo, input) } override def unbroadcast(): Unit = { bcModel.unpersist(blocking = false) } @deprecated override def withInput(in: Int): this.type = this @deprecated override def withOutput(out: Int): this.type = this override def write(kryo: Kryo, output: Output): Unit = { model.write(kryo, output) super.write(kryo, output) } }
Example 33
Source File: 7_RecoverableNetworkWordCount.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {
    StreamingLogger.setLoggerLevel()
    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)
    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)
      val accumulator = DropWordCounter.getInstance(context.sparkContext)
      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })
  }
}

object WordBlackList {
  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }
}

object DropWordCounter {
  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
}
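The lazily initialized singletons above exist because broadcast variables and accumulators are not restored when a streaming job recovers from a checkpoint, so they have to be re-created on first use by the recovered driver. The listing ends after foreachRDD; a hedged sketch of how the entry point is usually completed for checkpoint recovery (the checkpoint path and getOrCreate wiring are assumptions, not taken from the wow-spark source):

val checkpointDir = "/tmp/recoverable-network-word-count" // hypothetical path
val ssc = StreamingContext.getOrCreate(checkpointDir, () => {
  val conf = new SparkConf().setMaster("local[2]").setAppName("RecoverableNetworkWordCount")
  val context = new StreamingContext(conf, Seconds(1))
  context.checkpoint(checkpointDir) // enables driver recovery
  // ... build the same socket stream and foreachRDD pipeline here ...
  context
})
ssc.start()
ssc.awaitTermination()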
Example 34
Source File: ResultTask.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import java.io._ import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, internalAccumulators: Seq[Accumulator[Long]]) extends Task[U](stageId, stageAttemptId, partition.index, internalAccumulators) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val deserializeStartTime = System.currentTimeMillis() val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime metrics = Some(context.taskMetrics) func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 35
Source File: ShuffleMapTask.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter def this(partitionId: Int) { this(0, 0, null, new Partition { override def index: Int = 0 }, null, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): MapStatus = { // Deserialize the RDD using the broadcast variable. val deserializeStartTime = System.currentTimeMillis() val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime metrics = Some(context.taskMetrics) var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) writer.stop(success = true).get } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 36
Source File: BroadcastHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation} case class BroadcastHashJoinNode( conf: SQLConf, streamedKeys: Seq[Expression], streamedNode: LocalNode, buildSide: BuildSide, buildOutput: Seq[Attribute], hashedRelation: Broadcast[HashedRelation]) extends UnaryLocalNode(conf) with HashJoinNode { override val child = streamedNode // Because we do not pass in the buildNode, we take the output of buildNode to // create the inputSet properly. override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput) override def output: Seq[Attribute] = buildSide match { case BuildRight => streamedNode.output ++ buildOutput case BuildLeft => buildOutput ++ streamedNode.output } protected override def doOpen(): Unit = { streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation.value) } override def close(): Unit = { streamedNode.close() } }
Example 37
Source File: ResultTask.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, serializedTaskMetrics: Array[Byte], jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, localProperties, serializedTaskMetrics, jobId, appId, appAttemptId) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 38
Source File: MapPartitionsRWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 39
Source File: TestBroadcastVariables.scala From spark-dev with GNU General Public License v3.0 | 5 votes |
package examples

import org.apache.spark.{ SparkContext, SparkConf }
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

import scala.io.Source
import scala.util.{ Try, Success, Failure }
import scala.collection.mutable.Map

  def loadCSVFile(filename: String): Option[Map[String, String]] = {
    val countries = Map[String, String]()

    Try {
      val bufferedSource = Source.fromFile(filename)

      for (line <- bufferedSource.getLines) {
        val Array(country, capital) = line.split(",").map(_.trim)
        countries += country -> capital
      }

      bufferedSource.close()
      return Some(countries)
    }.toOption
  }
}
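The enclosing object and its main method were not captured in the excerpt above, so the following is only an assumed sketch of how the loaded map would typically be used: broadcast it once from the driver and look values up inside transformations. The SparkContext setup, CSV path, and sample countries are illustrative:

val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("TestBroadcastVariables"))

val countriesBV: Broadcast[Map[String, String]] =
  sc.broadcast(loadCSVFile("data/countries.csv").getOrElse(Map[String, String]()))

val capitals: RDD[String] = sc.parallelize(Seq("France", "Japan", "Kenya"))
  .map(country => countriesBV.value.getOrElse(country, "unknown")) // executor-side lookup

println(capitals.collect().mkString(", "))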
Example 40
Source File: RRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], numPartitions: Int, func: Array[Byte], deserializer: String, serializer: String, packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]]) extends RDD[U](parent) with Logging { override def getPartitions: Array[Partition] = parent.partitions override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val runner = new RRunner[U]( func, deserializer, serializer, packageNames, broadcastVars, numPartitions) // The parent may be also an RRDD, so we should launch it first. val parentIterator = firstParent[T].iterator(partition, context) runner.compute(parentIterator, partition.index) } } def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } }
Example 41
Source File: MapPartitionsRWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 42
Source File: DomainProcessor.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight.utilities import org.apache.spark.broadcast.Broadcast import scala.io.Source object DomainProcessor extends Serializable { val COUNTRY_CODES = Set("ac", "ad", "ae", "af", "ag", "ai", "al", "am", "an", "ao", "aq", "ar", "as", "at", "au", "aw", "ax", "az", "ba", "bb", "bd", "be", "bf", "bg", "bh", "bi", "bj", "bm", "bn", "bo", "bq", "br", "bs", "bt", "bv", "bw", "by", "bz", "ca", "cc", "cd", "cf", "cg", "ch", "ci", "ck", "cl", "cm", "cn", "co", "cr", "cu", "cv", "cw", "cx", "cy", "cz", "de", "dj", "dk", "dm", "do", "dz", "ec", "ee", "eg", "eh", "er", "es", "et", "eu", "fi", "fj", "fk", "fm", "fo", "fr", "ga", "gb", "gd", "ge", "gf", "gg", "gh", "gi", "gl", "gm", "gn", "gp", "gq", "gr", "gs", "gt", "gu", "gw", "gy", "hk", "hm", "hn", "hr", "ht", "hu", "id", "ie", "il", "im", "in", "io", "iq", "ir", "is", "it", "je", "jm", "jo", "jp", "ke", "kg", "kh", "ki", "km", "kn", "kp", "kr", "krd", "kw", "ky", "kz", "la", "lb", "lc", "li", "lk", "lr", "ls", "lt", "lu", "lv", "ly", "ma", "mc", "md", "me", "mg", "mh", "mk", "ml", "mm", "mn", "mo", "mp", "mq", "mr", "ms", "mt", "mu", "mv", "mw", "mx", "my", "mz", "na", "nc", "ne", "nf", "ng", "ni", "nl", "no", "np", "nr", "nu", "nz", "om", "pa", "pe", "pf", "pg", "ph", "pk", "pl", "pm", "pn", "pr", "ps", "pt", "pw", "py", "qa", "re", "ro", "rs", "ru", "rw", "sa", "sb", "sc", "sd", "se", "sg", "sh", "si", "sj", "", "sk", "sl", "sm", "sn", "so", "sr", "ss", "st", "su", "sv", "sx", "sy", "sz", "tc", "td", "tf", "tg", "th", "tj", "tk", "tl", "tm", "tn", "to", "tp", "tr", "tt", "tv", "tw", "tz", "ua", "ug", "uk", "us", "uy", "uz", "va", "vc", "ve", "vg", "vi", "vn", "vu", "wf", "ws", "ye", "yt", "za", "zm", "zw") val TOP_LEVEL_DOMAIN_NAMES = Set("com", "org", "net", "int", "edu", "gov", "mil") val NO_DOMAIN = "None" def extractDomain(url: String): String = { val spliturl = url.split('.') val numParts = spliturl.length // First check if query is an IP address e.g.: 123.103.104.10.in-addr.arpa or a name. // Such URLs receive a domain of NO_DOMAIN if (numParts > 2 && spliturl(numParts - 1) == "arpa" && spliturl(numParts - 2) == "in-addr") { NO_DOMAIN // it's an address } else if (!COUNTRY_CODES.contains(spliturl.last) && !TOP_LEVEL_DOMAIN_NAMES.contains(spliturl.last)) { NO_DOMAIN // it does not have a valid top-level domain name } else { val strippedSplitURL = removeTopLevelDomainName(removeCountryCode(spliturl)) if (strippedSplitURL.length > 0) { strippedSplitURL.last } else { // invalid URL... nothing that is not TLD.countrycode NO_DOMAIN } } } def removeCountryCode(urlComponents: Array[String]): Array[String] = { if (COUNTRY_CODES.contains(urlComponents.last)) { urlComponents.dropRight(1) } else { urlComponents } } def removeTopLevelDomainName(urlComponents: Array[String]): Array[String] = { if (TOP_LEVEL_DOMAIN_NAMES.contains(urlComponents.last)) { urlComponents.dropRight(1) } else { urlComponents } } }
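A quick illustration of the three branches of extractDomain above, with return values worked out from the code (the URLs are illustrative):

DomainProcessor.extractDomain("www.example.com")              // "example"
DomainProcessor.extractDomain("10.104.103.123.in-addr.arpa")  // "None" (reverse-DNS address)
DomainProcessor.extractDomain("something.invalidtld")         // "None" (unrecognized top-level domain)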
Example 43
Source File: ProxyWordCreation.scala From oni-ml with Apache License 2.0 | 5 votes |
package org.opennetworkinsight.proxy import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions._ import org.opennetworkinsight.utilities.{Entropy, Quantiles, DomainProcessor, TimeUtilities} object ProxyWordCreation { def udfWordCreation(topDomains : Broadcast[Set[String]], agentCounts : Broadcast[Map[String, Long]], timeCuts: Array[Double], entropyCuts: Array[Double], agentCuts: Array[Double]) = udf((host: String, time: String, reqMethod: String, uri: String, contentType: String, userAgent: String, responseCode: String) => ProxyWordCreation.proxyWord(host, time, reqMethod, uri, contentType, userAgent, responseCode, topDomains, agentCounts, timeCuts, entropyCuts, agentCuts)) def proxyWord(proxyHost: String, time: String, reqMethod: String, uri: String, contentType: String, userAgent: String, responseCode: String, topDomains: Broadcast[Set[String]], agentCounts: Broadcast[Map[String, Long]], timeCuts: Array[Double], entropyCuts: Array[Double], agentCuts: Array[Double]): String = { List(topDomain(proxyHost, topDomains.value).toString, Quantiles.bin(TimeUtilities.getTimeAsDouble(time), timeCuts).toString, reqMethod, Quantiles.bin(Entropy.stringEntropy(uri), entropyCuts), contentType.split('/')(0), // just the top level content type for now Quantiles.bin(agentCounts.value(userAgent), agentCuts), responseCode(0)).mkString("_") } def topDomain(proxyHost: String, topDomains: Set[String]): Int = { val domain = DomainProcessor.extractDomain(proxyHost) if (domainBelongsToSafeList(domain)) { 2 } else if (topDomains.contains(domain)) { 1 } else { 0 } } def domainBelongsToSafeList(domain: String) = domain == "intel" // TBD parameterize this! }
Example 44
Source File: AggregatedICPClassifier.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.linalg.Vector import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.ICPClassifierModel import org.apache.commons.lang.NotImplementedException import org.apache.spark.broadcast.Broadcast import org.apache.spark.SparkContext object AggregatedICPClassifier { def load(path: String, sc: SparkContext) = { val icps = sc.textFile(path) .map(ICPClassifierModel.deserialize(_, LibLinAlgDeserializer)) new AggregatedICPClassifier(icps) } } class AggregatedICPClassifier( private val icps: RDD[ICPClassifierModel[LibLinAlg]]) extends ICPClassifierModel[LibLinAlg] { val cachedICPs = icps.cache override def mondrianPv(features: Vector) = { cachedICPs .flatMap { icp => icp.mondrianPv(features) .zipWithIndex } .collect //we expect to aggregate up to 100 ICPs .groupBy(_._2) .toArray .sortBy(_._1) .map { case (index, seq) => val sortedSeq = seq.map(_._1).toArray.sorted val n = sortedSeq.length val median = if (n % 2 == 0) { (sortedSeq(n / 2 - 1) + sortedSeq(n / 2)) / 2 } else { sortedSeq(n / 2) } median } } def save(path: String, coalesce: Int = 0) = { var serialICPs = cachedICPs.map(_.toString) if (coalesce > 0) { serialICPs = serialICPs.coalesce(coalesce) } serialICPs.saveAsTextFile(path) } }
Example 45
Source File: Configuration.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.hadoop

import java.io.{ ObjectInputStream, ObjectOutputStream }

import org.apache.hadoop.conf
import org.apache.hadoop.conf.{ Configuration ⇒ HadoopConfiguration }
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.hammerlab.hadoop.kryo.WritableSerializer
import org.hammerlab.kryo._

class Configuration(@transient var value: HadoopConfiguration)
  extends Serializable {

  private def writeObject(out: ObjectOutputStream): Unit = {
    value.write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = {
    value = new HadoopConfiguration(false)
    value.readFields(in)
  }
}

object Configuration extends Registrar {

  def apply(loadDefaults: Boolean = true): Configuration = new HadoopConfiguration(loadDefaults)

  def apply(conf: HadoopConfiguration): Configuration = new Configuration(conf)

  implicit def wrapConfiguration(conf: HadoopConfiguration): Configuration = apply(conf)

  implicit def unwrapConfiguration(conf: Configuration): HadoopConfiguration = conf.value

  implicit def unwrapConfigurationBroadcast(confBroadcast: Broadcast[Configuration]): Configuration = confBroadcast.value

  implicit def sparkContextToHadoopConfiguration(sc: SparkContext): Configuration = sc.hadoopConfiguration

  implicit class Ops(val conf: HadoopConfiguration) extends AnyVal {
    def serializable: Configuration = conf
  }

  register(
    cls[conf.Configuration] → new WritableSerializer[conf.Configuration],
    cls[Configuration] → serializeAs[Configuration, conf.Configuration]
  )
}
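Hadoop's Configuration is not java.io.Serializable, which is why the wrapper above hand-rolls writeObject/readObject, and the companion's implicits let the wrapped and unwrapped forms be used interchangeably. A hedged usage sketch, assuming an existing SparkContext sc and an RDD[String] named paths (both illustrative); the conversions named in the comments are the ones defined above:

import org.hammerlab.hadoop.Configuration

val conf: Configuration = sc.hadoopConfiguration      // wrapped via wrapConfiguration
val confBroadcast = sc.broadcast(conf)

paths.map { _ =>
  val executorConf: Configuration = confBroadcast     // unwrapConfigurationBroadcast
  executorConf.get("fs.defaultFS")                    // unwrapConfiguration supplies Hadoop's get
}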
Example 46
Source File: MapPartitionsRWrapper.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.Row import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = inputSchema == SERIALIZED_R_DATA_SCHEMA val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 47
Source File: WordFrequencyEncoder.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.nlp

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import keystoneml.workflow.{Estimator, Transformer}

object WordFrequencyEncoder extends Estimator[Seq[String], Seq[Int]] {

  private[this] def makeUnigrams(data: RDD[Seq[String]]) =
    NGramsCounts[String]().apply(NGramsFeaturizer[String](1 to 1).apply(data))

  // TODO: alternative approach: collectAsMap once, let driver do the work.
  def fit(data: RDD[Seq[String]]): WordFrequencyTransformer = {
    val unigramCounts = makeUnigrams(data)

    val wordIndex = unigramCounts
      .zipWithIndex() // indexes respect the sorted order
      .map { case ((unigram, count), index) =>
        // valid if # of word types in training data is less than Int.MaxValue
        (unigram.words(0), index.asInstanceOf[Int])
      }.collectAsMap()
    val wordIndexBroadcast = unigramCounts.sparkContext.broadcast(wordIndex)

    val unigrams = unigramCounts.map { case (unigram, count) =>
      (wordIndexBroadcast.value(unigram.words(0)), count)
    }.collectAsMap()

    new WordFrequencyTransformer(wordIndexBroadcast, unigrams)
  }

}

class WordFrequencyTransformer(
    wordIndexBroadcast: Broadcast[scala.collection.Map[String, Int]],
    val unigramCounts: scala.collection.Map[Int, Int])
  extends Transformer[Seq[String], Seq[Int]] {

  final val OOV_INDEX = -1

  override def apply(in: RDD[Seq[String]]): RDD[Seq[Int]] = {
    in.mapPartitions { case part =>
      val index = wordIndexBroadcast.value
      part.map(ngram => ngram.map(index.getOrElse(_, OOV_INDEX)))
    }
  }

  def apply(words: Seq[String]): Seq[Int] = {
    val index = wordIndexBroadcast.value
    words.map(index.getOrElse(_, OOV_INDEX))
  }

}
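fit builds the word index once on the driver, broadcasts it, and the returned WordFrequencyTransformer reuses that broadcast for every lookup, mapping unseen words to OOV_INDEX. A minimal usage sketch, assuming an existing SparkContext sc; the toy corpus is illustrative:

val corpus: RDD[Seq[String]] = sc.parallelize(Seq(
  Seq("spark", "broadcast", "spark"),
  Seq("broadcast", "join")))

val encoder = WordFrequencyEncoder.fit(corpus)   // collects counts, broadcasts the index
val asIndices: RDD[Seq[Int]] = encoder(corpus)   // executor-side lookups via the broadcast
val single: Seq[Int] = encoder(Seq("spark", "unseen-word"))  // unseen word becomes OOV_INDEX (-1)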
Example 48
Source File: KernelBlockLinearMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import scala.reflect.ClassTag import scala.collection.mutable.ListBuffer import breeze.linalg._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import keystoneml.nodes.stats.{StandardScalerModel, StandardScaler} import keystoneml.nodes.util.{VectorSplitter, Identity} import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.{Transformer, LabelEstimator} class KernelBlockLinearMapper[T: ClassTag]( val model: Seq[DenseMatrix[Double]], blockSize: Int, kernelTransformer: KernelTransformer[T], nTrain: Long, blocksBeforeCheckpoint: Int = 25) extends Transformer[T, DenseVector[Double]] { val numClasses = model(0).cols val numBlocks = model.size override def apply(in: RDD[T]): RDD[DenseVector[Double]] = { val testKernelMat = kernelTransformer(in) // Initially all predictions are 0 var predictions = in.mapPartitions { iter => if (iter.hasNext) { val out = DenseMatrix.zeros[Double](iter.size, numClasses) Iterator.single(out) } else { Iterator.empty } }.cache() val modelBCs = new ListBuffer[Broadcast[DenseMatrix[Double]]] (0 until numBlocks).foreach { block => val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize)) val testKernelBlock = testKernelMat(blockIdxs.toSeq) val modelBlockBC = in.context.broadcast(model(block)) modelBCs += modelBlockBC // Update predictions var predictionsNew = predictions.zip(testKernelBlock).map { case(pred, testKernelBB) => pred :+ (testKernelBB * modelBlockBC.value) } predictionsNew.cache() predictionsNew.count() predictions.unpersist(true) testKernelMat.unpersist(blockIdxs.toSeq) modelBlockBC.unpersist(true) // If we are checkpointing update our cache if (in.context.getCheckpointDir.isDefined && block % blocksBeforeCheckpoint == (blocksBeforeCheckpoint - 1)) { predictionsNew = MatrixUtils.truncateLineage(predictionsNew, false) } predictions = predictionsNew } predictions.flatMap(x => MatrixUtils.matrixToRowArray(x)) } def apply(in: T): DenseVector[Double] = { val testKernelRow = kernelTransformer(in) val predictions = DenseVector.zeros[Double](numClasses) (0 until numBlocks).foreach { block => val blockIdxs = (blockSize * block) until (math.min(nTrain.toInt, (block + 1) * blockSize)) predictions += (testKernelRow(blockIdxs) * model(block)).toDenseVector } predictions } }
Example 49
Source File: ResultTask.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, metrics: TaskMetrics, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, metrics, localProperties, jobId, appId, appAttemptId) with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" }
Example 50
Source File: Dictionary.scala From spark-nkp with Apache License 2.0 | 5 votes |
package com.github.uosdmlab.nkp import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql._ import org.apache.spark.sql.types._ import org.bitbucket.eunjeon.seunjeon.{Analyzer => EunjeonAnalyzer} object Dictionary { // Words inside driver. This won't be modified in executor. private[nkp] var words = Seq.empty[String] private[nkp] def syncWords(bcWords: Broadcast[Seq[String]]): Unit = { EunjeonAnalyzer.resetUserDict() EunjeonAnalyzer.setUserDict(bcWords.value.iterator) } def reset(): this.type = chain { words = Seq.empty[String] } private var isDictionaryUsed = false private[nkp] def shouldSync = { isDictionaryUsed } def addWords(word: String, words: String*): this.type = addWords(word +: words) def addWords(words: Traversable[String]): this.type = chain { this.words = this.words ++ words isDictionaryUsed = true } def addWordsFromCSV(path: String, paths: String*): this.type = addWordsFromCSV(path +: paths) def addWordsFromCSV(paths: Traversable[String]): this.type = chain { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ val schema = StructType(Array( StructField("word", StringType, nullable = false), StructField("cost", StringType, nullable = true))) val df = spark.read .option("sep", ",") .option("inferSchema", value = false) .option("header", value = false) .schema(schema) .csv(paths.toSeq: _*) val words = df.map { case Row(word: String, cost: String) => s"$word,$cost" case Row(word: String, null) => word }.collect() addWords(words) } private def chain(fn: => Any): this.type = { fn this } }
Example 51
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.wal.process import org.apache.s2graph.s2jobs.task.TaskConf import org.apache.s2graph.s2jobs.wal.WalLogAgg import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer} import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} import play.api.libs.json.{JsObject, Json} object FilterTopFeaturesProcess { private var validFeatureHashKeys: Set[Long] = null def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = { if (validFeatureHashKeys == null) { validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet } validFeatureHashKeys } def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = { import ss.implicits._ val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value)) filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value"))) .select("featureHash") .distinct().as[Long].collect() } def filterTopKsPerDim(dict: DataFrame, maxRankPerDim: Broadcast[Map[String, Int]], defaultMaxRank: Int): DataFrame = { val filterUDF = udf((dim: String, rank: Long) => { rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank) }) dict.filter(filterUDF(col("dim"), col("rank"))) } def filterWalLogAgg(ss: SparkSession, walLogAgg: Dataset[WalLogAgg], transformers: Seq[Transformer], validFeatureHashKeysBCast: Broadcast[Array[Long]]) = { import ss.implicits._ walLogAgg.mapPartitions { iter => val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast) iter.map { walLogAgg => WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys) } } } } class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) { import FilterTopFeaturesProcess._ override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = { import ss.implicits._ val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s => Json.parse(s).as[JsObject].fields.map { case (k, jsValue) => k -> jsValue.as[Int] }.toMap } val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty)) val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt) val featureDict = inputMap(taskConf.options("featureDict")) val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg] val transformers = TaskConf.parseTransformers(taskConf) val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue)) val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict) val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys) filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF() } override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg") }
Example 52
Source File: ParameterOperations.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.parameters import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.{DistributedDataSet, MiniBatch} import org.apache.spark.rdd.RDD import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.optim.DistriOptimizer.Cache import com.intel.analytics.bigdl.optim.Metrics import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.utils.Table import org.apache.spark.broadcast.Broadcast import scala.collection.mutable private[bigdl] class L2NormClippingProcessor(l2NormThreshold: Double) extends ParameterProcessor { override def collectGlobalData[T](models: RDD[Cache[T]], parameters: AllReduceParameter[T], metrics: Metrics, state: Table)(implicit ev: TensorNumeric[T]) : Unit = { val numFinishedModel = state.get[Int]("numFinishedModel").get val parallelism = state.get[Int]("parallelism").get val isGradientUpdated = state.get[Boolean]("isGradientUpdated").get val sumSquare = models.mapPartitions(modelIter => { if (!isGradientUpdated) { val getG = System.nanoTime() parameters.aggregateGradientPartition(numFinishedModel) metrics.add("aggregrateGradientParition average executor", System.nanoTime() - getG) } val sum = Util.getSumsquareInParallel(parameters.gradientPartition, parallelism) Iterator.single(sum) }).reduce(_ + _) state("isGradientUpdated") = true state("l2Norm") = math.sqrt(sumSquare) } override def processParameters[T](parameters: AllReduceParameter[T], modelCache: Cache[T], state: Table)(implicit ev: TensorNumeric[T]): Unit = { val l2Norm = state.get[Double]("l2Norm").get if (l2Norm > l2NormThreshold) { val scale = ev.fromType[Double](l2Norm / l2NormThreshold) parameters.gradientPartition.div(scale) } } override def processParameters[T](model: Module[T], state: Table)(implicit ev: TensorNumeric[T]): Unit = { val parallelism = state.get[Int]("parallelism").get val gradients = model.getParameters()._2 val l2Norm = math.sqrt(Util.getSumsquareInParallel(gradients, parallelism)) if (l2Norm > l2NormThreshold) { val scale = ev.fromType[Double](l2Norm / l2NormThreshold) gradients.div(scale) } } }
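The processor above clips by global L2 norm: it sums squared gradients across partitions, takes the square root, and divides the gradient partition by norm / threshold when the norm exceeds the threshold, which is the same as multiplying by threshold / norm. A standalone sketch of that rule in plain Scala (illustrative, not BigDL API):

def clipByL2Norm(gradient: Array[Double], l2NormThreshold: Double): Array[Double] = {
  val l2Norm = math.sqrt(gradient.map(g => g * g).sum)
  if (l2Norm > l2NormThreshold) gradient.map(_ * (l2NormThreshold / l2Norm))
  else gradient
}

clipByL2Norm(Array(3.0, 4.0), 1.0)   // norm 5.0, rescaled to Array(0.6, 0.8)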
Example 53
Source File: BatchShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import java.util.Properties import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.BlockManagerId private[spark] class BatchShuffleMapTask( stageId: Int, stageAttemptId: Int, taskBinaries: Broadcast[Array[Byte]], partitions: Array[Partition], partitionId: Int, @transient private var locs: Seq[TaskLocation], internalAccumulatorsSer: Array[Byte], localProperties: Properties, isFutureTask: Boolean, nextStageLocs: Option[Seq[BlockManagerId]] = None, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with BatchTask with Logging { @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdds: Array[RDD[_]] = null var deps: Array[ShuffleDependency[_, _, _]] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val ser = SparkEnv.get.closureSerializer.newInstance() val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])]( ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader) rdds = rddI deps = depI } def getTasks(): Seq[Task[Any]] = { if (deps == null || rdds == null) { prepTask() } (0 until partitions.length).map { i => val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties, internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs) s.epoch = epoch s }.map(_.asInstanceOf[Task[Any]]) } override def runTask(context: TaskContext): Array[MapStatus] = { throw new RuntimeException("BatchShuffleMapTasks should not be run!") } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 54
Source File: BatchResultTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import java.util.Properties import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD private[spark] class BatchResultTask[T, U: ClassTag]( stageId: Int, stageAttemptId: Int, taskBinaries: Broadcast[Array[Byte]], val partitions: Array[Partition], partitionId: Int, @transient private val locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[Array[U]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with BatchTask with Serializable { @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdds: Array[RDD[T]] = null var funcs: Array[(TaskContext, Iterator[T]) => U] = null override def prepTask(): Unit = { // Deserialize the RDD and the func using the broadcast variables. val ser = SparkEnv.get.closureSerializer.newInstance() val (rddI, funcI) = ser.deserialize[(Array[RDD[T]], Array[(TaskContext, Iterator[T]) => U])]( ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader) rdds = rddI funcs = funcI } // Called on the executor side to get a smaller tasks out def getTasks(): Seq[Task[Any]] = { if (rdds == null) { prepTask() } (0 until partitions.length).map { i => val r = ResultTask(stageId, stageAttemptId, partitions(i), outputId, localProperties, internalAccumulatorsSer, isFutureTask, rdds(i), funcs(i)) r.epoch = epoch r }.map(_.asInstanceOf[Task[Any]]) } override def runTask(context: TaskContext): Array[U] = { throw new RuntimeException("BatchResultTasks should not be run!") } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "BatchResultTask(" + stageId + ", " + partitionId + ")" }
Example 55
Source File: ResultTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.rdd.RDD private[spark] class ResultTask[T, U]( stageId: Int, stageAttemptId: Int, taskBinary: Broadcast[Array[Byte]], partition: Partition, locs: Seq[TaskLocation], val outputId: Int, localProperties: Properties, serializedTaskMetrics: Array[Byte] = SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array(), isFutureTask: Boolean = false, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[U](stageId, stageAttemptId, partition.index, serializedTaskMetrics, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with Serializable { var rdd: RDD[T] = null var func: (TaskContext, Iterator[T]) => U = null @transient private[this] val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def prepTask(): Unit = { // Deserialize the RDD and the func using the broadcast variables. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (_rdd, _func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) rdd = _rdd func = _func _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L } override def runTask(context: TaskContext): U = { // Deserialize the RDD and the func using the broadcast variables. if (func == null || rdd == null) { prepTask() } func(context, rdd.iterator(partition, context)) } // This is only callable on the driver side. override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ResultTask(" + stageId + ", " + partitionId + ")" } object ResultTask { def apply[T, U]( stageId: Int, stageAttemptId: Int, partition: Partition, outputId: Int, localProperties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, rdd: RDD[T], func: (TaskContext, Iterator[T]) => U): ResultTask[T, U] = { val rt = new ResultTask[T, U](stageId, stageAttemptId, null, partition, Seq.empty, outputId, localProperties, internalAccumulatorsSer, isFutureTask) rt.rdd = rdd rt.func = func rt } }
Example 56
Source File: ShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter import org.apache.spark.storage.BlockManagerId def this(partitionId: Int) { this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdd: RDD[_] = null var dep: ShuffleDependency[_, _, _] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) rdd = _rdd dep = _dep _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L } override def runTask(context: TaskContext): MapStatus = { if (dep == null || rdd == null) { prepTask() } var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) val status = writer.stop(success = true).get FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId, dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false) status } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) } object ShuffleMapTask { def apply( stageId: Int, stageAttemptId: Int, partition: Partition, properties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, rdd: RDD[_], dep: ShuffleDependency[_, _, _], nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = { val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null, properties, internalAccumulatorsSer, isFutureTask, nextStageLocs) smt.rdd = rdd smt.dep = dep smt } }
Example 57
Source File: RRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], numPartitions: Int, func: Array[Byte], deserializer: String, serializer: String, packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]]) extends RDD[U](parent) with Logging { override def getPartitions: Array[Partition] = parent.partitions override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val runner = new RRunner[U]( func, deserializer, serializer, packageNames, broadcastVars, numPartitions) // The parent may be also an RRDD, so we should launch it first. val parentIterator = firstParent[T].iterator(partition, context) runner.compute(parentIterator, partition.index) } } def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } }
Example 58
Source File: BroadcastSpatialJoin.scala From SpatialSpark with Apache License 2.0 | 5 votes |
package spatialspark.join import com.vividsolutions.jts.geom.Geometry import com.vividsolutions.jts.index.strtree.{ItemBoundable, ItemDistance, STRtree} import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import spatialspark.operator.SpatialOperator import spatialspark.operator.SpatialOperator.SpatialOperator object BroadcastSpatialJoin { def queryRtree(rtree: => Broadcast[STRtree], leftId: Long, geom: Geometry, predicate: SpatialOperator, radius: Double): Array[(Long, Long)] = { val queryEnv = geom.getEnvelopeInternal //queryEnv.expandBy(radius) lazy val candidates = rtree.value.query(queryEnv).toArray //.asInstanceOf[Array[(Long, Geometry)]] if (predicate == SpatialOperator.Within) { candidates.filter { case (id_, geom_) => geom.within(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Contains) { candidates.filter { case (id_, geom_) => geom.contains(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.WithinD) { candidates.filter { case (id_, geom_) => geom.isWithinDistance(geom_.asInstanceOf[Geometry], radius) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Intersects) { candidates.filter { case (id_, geom_) => geom.intersects(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.Overlaps) { candidates.filter { case (id_, geom_) => geom.overlaps(geom_.asInstanceOf[Geometry]) } .map { case (id_, geom_) => (leftId, id_.asInstanceOf[Long]) } } else if (predicate == SpatialOperator.NearestD) { //if (candidates.isEmpty) // return Array.empty[(Long, Long)] //val nearestItem = candidates.map { // case (id_, geom_) => (id_.asInstanceOf[Long], geom_.asInstanceOf[Geometry].distance(geom)) //}.reduce((a, b) => if (a._2 < b._2) a else b) class dist extends ItemDistance { override def distance(itemBoundable: ItemBoundable, itemBoundable1: ItemBoundable): Double = { val geom = itemBoundable.getItem.asInstanceOf[(Long, Geometry)]._2 val geom1 = itemBoundable1.getItem.asInstanceOf[(Long, Geometry)]._2 geom.distance(geom1) } } val nearestItem = rtree.value.nearestNeighbour(queryEnv, (0l, geom), new dist) .asInstanceOf[(Long, Geometry)] Array((leftId, nearestItem._1)) } else { Array.empty[(Long, Long)] } } def apply(sc: SparkContext, leftGeometryWithId: RDD[(Long, Geometry)], rightGeometryWithId: RDD[(Long, Geometry)], joinPredicate: SpatialOperator, radius: Double = 0): RDD[(Long, Long)] = { // create R-tree on right dataset val strtree = new STRtree() val rightGeometryWithIdLocal = rightGeometryWithId.collect() rightGeometryWithIdLocal.foreach(x => { val y = x._2.getEnvelopeInternal y.expandBy(radius) strtree.insert(y, x) }) val rtreeBroadcast = sc.broadcast(strtree) leftGeometryWithId.flatMap(x => queryRtree(rtreeBroadcast, x._1, x._2, joinPredicate, radius)) } }
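BroadcastSpatialJoin.apply collects the right-hand RDD on the driver, indexes it into an STRtree with envelopes expanded by radius, broadcasts the tree, and probes it once per left-hand record. A hedged usage sketch, assuming an existing SparkContext sc; the WKT geometries are illustrative:

import com.vividsolutions.jts.io.WKTReader
import spatialspark.operator.SpatialOperator

val reader = new WKTReader()
val points = sc.parallelize(Seq(
  1L -> reader.read("POINT (1 1)"),
  2L -> reader.read("POINT (5 5)")))
val polygons = sc.parallelize(Seq(
  10L -> reader.read("POLYGON ((0 0, 2 0, 2 2, 0 2, 0 0))")))

// (leftId, rightId) pairs where the point lies within the polygon
val matched = BroadcastSpatialJoin(sc, points, polygons, SpatialOperator.Within)
matched.collect().foreach(println)  // expect (1,10) only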
Example 59
Source File: ReForeStLoader.scala From reforest with Apache License 2.0 | 5 votes |
package reforest import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.data.{RawDataLabeled, RawDataset, StaticData} import reforest.data.tree.ForestManager import reforest.rf.parameter.RFParameter import reforest.rf.split.RFSplitterManager import reforest.rf.{RFCategoryInfo, RFDataPrepare, RFStrategy} import reforest.util.{GCInstrumented, GCInstrumentedEmpty, MemoryUtil} class ReForeStLoader[T, U](@transient private val sc: SparkContext, parameter: Broadcast[RFParameter], strategyBC: Broadcast[RFStrategy[T, U]], val typeInfoBC: Broadcast[TypeInfo[T]], val typeInfoWorkingBC: Broadcast[TypeInfo[U]], val categoricalFeaturesInfoBC: Broadcast[RFCategoryInfo], rawDataset: RawDataset[T, U]) extends Serializable { val instrumented: Broadcast[GCInstrumented] = sc.broadcast(new GCInstrumentedEmpty) val dataPrepare = new RFDataPrepare[T, U](typeInfoBC, instrumented, strategyBC, false, 1) private var memoryUtil : Option[MemoryUtil] = Option.empty private var forestManager : Option[ForestManager[T, U]] = Option.empty private var workingData : Option[RDD[StaticData[U]]] = Option.empty private var previousWorkingData : Option[RDD[StaticData[U]]] = Option.empty private var splitterManager : Option[RFSplitterManager[T,U]] = Option.empty def testdatafreeze(): Unit = { rawDataset.testingData.persist(parameter.value.storageLevel) } def trainingdatafreeze(): Unit = { // rawDataset.trainingData.persist(property.storageLevel) rawDataset.trainingData.count() } def getRawDataset = rawDataset def getTestingData: RDD[RawDataLabeled[T, U]] = rawDataset.testingData def getMemoryUtil = memoryUtil def getForestManager = forestManager def getWorkingData(numTrees: Int = parameter.value.getMaxNumTrees, macroIteration: Int = 0, skipPreparation : Boolean =false) = { val timePreparationSTART = System.currentTimeMillis() if(skipPreparation) { forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), splitterManager.get)) previousWorkingData = workingData workingData = Some(dataPrepare.prepareData(rawDataset.trainingData, sc.broadcast(forestManager.get.splitterManager.getSplitter(macroIteration)), parameter.value.numFeatures, memoryUtil.get, numTrees, macroIteration)) // workingData = Some(workingData.get.mapPartitionsWithIndex{case (partitionIndex, elements) => // strategyBC.value.reGenerateBagging(numTrees, partitionIndex, elements)}) val dataSize = workingData.get.persist(parameter.value.storageLevel).count() if(previousWorkingData.isDefined) { previousWorkingData.get.unpersist() } val timePreparationEND = System.currentTimeMillis() println("TIME PREPARATION SKIPPED INIT ("+dataSize+"): " + (timePreparationEND - timePreparationSTART)) workingData.get } else { previousWorkingData = workingData val zzz = strategyBC.value.findSplits(rawDataset.trainingData, typeInfoBC, typeInfoWorkingBC, instrumented, categoricalFeaturesInfoBC) splitterManager = Some(zzz._1) forestManager = Some(new ForestManager[T, U](parameter.value.applyNumTrees(numTrees), zzz._1)) memoryUtil = Some(zzz._2) val splitter = forestManager.get.splitterManager.getSplitter(macroIteration) // TODO the broadcast of the splitter must be unpersisted!!! 
workingData = Some(dataPrepare.prepareData(rawDataset.trainingData, sc.broadcast(splitter), parameter.value.numFeatures, memoryUtil.get, numTrees, macroIteration)) val dataSize = workingData.get.persist(parameter.value.storageLevel).count() if(previousWorkingData.isDefined) { previousWorkingData.get.unpersist() } val timePreparationEND = System.currentTimeMillis() println("TIME PREPARATION: " + (timePreparationEND - timePreparationSTART)) workingData.get } } }
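The TODO in the branch above flags a recurring issue with this loader: a new splitter broadcast is created on every call to getWorkingData, and each broadcast keeps executor memory until it is released. A generic sketch of the broadcast lifecycle follows (not tied to ReForeSt's types; a local master and toy data are assumed).

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastLifecycleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("bc-lifecycle").setMaster("local[*]"))
    val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))
    val total = sc.parallelize(Seq("a", "b", "a"))
      .map(key => lookup.value.getOrElse(key, 0))
      .sum()
    println(total) // 4.0
    // unpersist() drops the cached copies on the executors; the variable can still be
    // re-broadcast lazily if it is used again. destroy() also frees the driver-side
    // data and makes any further use of the broadcast an error.
    lookup.unpersist()
    lookup.destroy()
    sc.stop()
  }
}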
Example 60
Source File: CCUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.util import org.apache.commons.io.FilenameUtils import org.apache.spark.broadcast.Broadcast import org.apache.spark.{SparkConf, SparkContext} import reforest.TypeInfo import reforest.data.load.{ARFFUtil, DataLoad, LibSVMUtil} import reforest.rf.RFCategoryInfo import reforest.rf.parameter.RFParameter import scala.reflect.ClassTag def getDataLoader[T:ClassTag, U:ClassTag](property : RFParameter, typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], categoryInfo: Broadcast[RFCategoryInfo]): DataLoad[T, U] = { val extension = FilenameUtils.getExtension(property.dataset).toUpperCase() property.fileType match { case "LIBSVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "SVM" => new LibSVMUtil(typeInfo, instrumented, categoryInfo) case "ARFF" => new ARFFUtil(typeInfo, instrumented, categoryInfo) case _ => new LibSVMUtil(typeInfo, instrumented, categoryInfo) } } }
Example 61
Source File: RFRotationMatrix.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.rf.rotation import org.apache.spark.broadcast.Broadcast import reforest.TypeInfo import reforest.data.{RawData, RawDataDense, RawDataLabeled, RotationMatrix} import scala.reflect.ClassTag /** * To rotate the raw data * * @param n the size of the nxn matrix (typically n is the number of features in the dataset) * @param typeInfo the type information for the raw data * @param seed a random generator seed * @tparam T raw data type * @tparam U working data type */ class RFRotationMatrix[T: ClassTag, U: ClassTag](n: Int, typeInfo: TypeInfo[T], seed: Int) extends Serializable { private val matrix = new RotationMatrix(n, seed) /** * It rotates a raw data * * @param element the element to rotate * @return the rotated element */ def rotateRawData(element: RawData[T, U]) = { val dense = element.toDense val densedRotated = matrix.rotate(dense.values, typeInfo) new RawDataDense[T, U](densedRotated, dense.nan) } /** * It rotates a raw data labeled * * @param element the element to rotate * @return the rotated element */ def rotate(element: RawDataLabeled[T, U]) = { new RawDataLabeled[T, U](element.label, rotateRawData(element.features)) } }
Example 62
Source File: RFDataPrepare.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.rf import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data.{RawDataLabeled, StaticData} import reforest.data.tree.ForestManager import reforest.rf.split.{RFSplitter, RFSplitterManager} import reforest.util.{GCInstrumented, MemoryUtil} class RFDataPrepare[T, U](typeInfo: Broadcast[TypeInfo[T]], instrumented: Broadcast[GCInstrumented], strategy: Broadcast[RFStrategy[T, U]], permitSparseWorkingData: Boolean, poissonMean: Double) extends Serializable { def prepareData(dataIndex: RDD[RawDataLabeled[T, U]], splitter : Broadcast[RFSplitter[T, U]], featureNumber: Int, memoryUtil: MemoryUtil, numTrees: Int, macroIteration : Int): RDD[StaticData[U]] = { dataIndex.mapPartitionsWithIndex { (partitionIndex, instances) => strategy.value.prepareData(numTrees, macroIteration, splitter, partitionIndex, instances, instrumented.value, memoryUtil) } } }
Example 63
Source File: SLCTreeGeneration.scala From reforest with Apache License 2.0 | 5 votes |
/* * Licensed to the Apache Software Foundation (ASF) under one or more * contributor license agreements. See the NOTICE file distributed with * this work for additional information regarding copyright ownership. * The ASF licenses this file to You under the Apache License, Version 2.0 * (the "License"); you may not use this file except in compliance with * the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package reforest.rf.slc import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import reforest.data._ import reforest.data.tree.ForestManager import reforest.rf.feature.RFFeatureManager import reforest.rf.parameter.RFParameter import reforest.rf.{RFSkip, RFStrategy, RFTreeGeneration} import reforest.util._ class SLCTreeGeneration[T, U](@transient private val sc: SparkContext, property: Broadcast[RFParameter], typeInfo: Broadcast[TypeInfo[T]], typeInfoWorking: Broadcast[TypeInfo[U]], sampleSize: Long) extends Serializable { var fcsExecutor : Option[SLCExecutor[T, U]] = Option.empty def findBestCutSLC(dataIndex: RDD[StaticData[U]], forestManager: ForestManager[T, U], featureManager: RFFeatureManager, depthToStop : Int, instrumented: Broadcast[GCInstrumented], skip : RFSkip): ForestManager[T, U] = { if (featureManager.getActiveNodesNum <= 0) { forestManager } else { var toReturn = forestManager val splitterManagerBC = sc.broadcast(forestManager.splitterManager) if(fcsExecutor.isEmpty) { fcsExecutor = Some(SLCExecutor.build(sc, typeInfo, typeInfoWorking, property, splitterManagerBC, sampleSize)) } toReturn = fcsExecutor.get.executeSLC(toReturn, featureManager, dataIndex, depthToStop, skip) splitterManagerBC.unpersist() toReturn } } }
Example 64
Source File: LibSVMUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Forked from Apache Spark MLlib
  * Load data in LibSVM format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class LibSVMUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                           instrumented: Broadcast[GCInstrumented],
                                           categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {

  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseLibSVMFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, indices, values) =>
        RawDataLabeled(label, RawData.sparse[T, U](numFeatures, indices, values, typeInfo.value.NaN).compressed)
    }
  }

  private def parseLibSVMFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[Int], Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseLibSVMRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseLibSVMRecord(line: String): (Double, Array[Int], Array[T]) = {
    val items = line.split(' ')
    val label = Math.max(items.head.toDouble, 0)
    val (indices, values) = items.tail.filter(_.nonEmpty).flatMap { item =>
      try {
        val indexAndValue = item.split(':')
        val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based
        val value = typeInfo.value.fromString(indexAndValue(1))
        if (categoryInfo.value.isCategorical(index)) {
          Some((index, typeInfo.value.fromInt(categoryInfo.value.rawRemapping(typeInfo.value.toInt(value)))))
        } else {
          Some((index, value))
        }
      } catch {
        case e: NumberFormatException => {
          println("Malformed input. Details: \n" + e.getMessage)
          System.exit(1)
          None
        }
        case e: Exception => {
          e.printStackTrace()
          System.exit(1)
          None
        }
      }
    }.unzip

    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, s"indices should be one-based and in ascending order;" +
        s" found current=$current, previous=$previous; line=\"$line\"")
      previous = current
      i += 1
    }
    (label, indices, values)
  }
}
Example 65
Source File: ARFFUtil.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data.load

import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import reforest.TypeInfo
import reforest.data.{RawData, RawDataLabeled}
import reforest.rf.RFCategoryInfo
import reforest.util.GCInstrumented

import scala.reflect.ClassTag

/**
  * Load data in ARFF format
  *
  * @param typeInfo     the type information of the raw data
  * @param instrumented the instrumentation of the GC
  * @param categoryInfo the information for the categorical features
  * @tparam T raw data type
  * @tparam U working data type
  */
class ARFFUtil[T: ClassTag, U: ClassTag](typeInfo: Broadcast[TypeInfo[T]],
                                         instrumented: Broadcast[GCInstrumented],
                                         categoryInfo: Broadcast[RFCategoryInfo]) extends DataLoad[T, U] {

  override def loadFile(sc: SparkContext,
                        path: String,
                        numFeatures: Int,
                        minPartitions: Int): RDD[RawDataLabeled[T, U]] = {
    val parsed = parseARFFFile(sc, path, minPartitions)
    instrumented.value.gcALL

    parsed.map {
      case (label, values) =>
        RawDataLabeled(label, RawData.dense[T, U](values, typeInfo.value.NaN))
    }
  }

  private def parseARFFFile(sc: SparkContext, path: String, minPartitions: Int): RDD[(Double, Array[T])] = {
    sc.textFile(path, minPartitions)
      .map(_.trim)
      .filter(line => !(line.isEmpty || line.startsWith("#") || line.startsWith("%") || line.startsWith("@")))
      .mapPartitions(it => {
        val toReturn = it.map(u => parseARFFRecord(u))
        instrumented.value.gc()
        toReturn
      })
  }

  private[load] def parseARFFRecord(line: String): (Double, Array[T]) = {
    val items = line.split(',')
    val label = Math.max(items.last.toDouble, 0)
    // Parse each value inside the try block so that malformed inputs are actually caught.
    val values = items.dropRight(1).filter(_.nonEmpty).map(item => {
      try {
        typeInfo.value.fromString(item)
      } catch {
        case e: NumberFormatException => {
          println("Malformed input. Details: \n" + e.getMessage)
          System.exit(1)
          null.asInstanceOf[T]
        }
        case e: Exception => {
          e.printStackTrace()
          System.exit(1)
          null.asInstanceOf[T]
        }
      }
    })

    (label, values)
  }
}
Example 66
Source File: ScalingVariable.scala From reforest with Apache License 2.0 | 5 votes |
package reforest.data import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import reforest.TypeInfo import scala.reflect.ClassTag /** * It scales the value of the raw data according to different methodologies * @tparam T raw data type * @tparam U working data type */ trait ScalingVariable[T, U] extends Serializable { /** * It scales the data passed as argument * @param data The value to be scaled * @return The scaled data */ def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] } /** * It scales the values according to the Basic Scaling of Blaser et al. "Random rotation ensembles". * Numeric values are scaled to [0, 1] using the min and max values. * @param sc The Spark Context * @param typeInfo The type information about the raw data * @param featureNumber The number of feature in the dataset * @param input The raw dataset * @tparam T raw data type * @tparam U working data type */ class ScalingBasic[T : ClassTag, U : ClassTag](@transient private val sc: SparkContext, typeInfo: Broadcast[TypeInfo[T]], featureNumber: Int, input: RDD[RawDataLabeled[T, U]]) extends ScalingVariable[T, U] { private val scaling: Broadcast[scala.collection.Map[Int, (T, T)]] = sc.broadcast(init()) private def scaleValue(index: Int, value: T): T = { val (min, max) = scaling.value(index) val doubleValue = typeInfo.value.toDouble(value) typeInfo.value.fromDouble(Math.min(1, Math.max(0, (doubleValue - typeInfo.value.toDouble(min)) / (typeInfo.value.toDouble(max) - typeInfo.value.toDouble(min))))) } override def scale(data: RawDataLabeled[T, U]): RawDataLabeled[T, U] = { val densed = data.features.toDense val values = new Array[T](densed.size) var count = 0 while (count < values.length) { values(count) = scaleValue(count, densed(count)) count += 1 } RawDataLabeled(data.label, new RawDataDense(values, densed.nan)) } private def init(): scala.collection.Map[Int, (T, T)] = { input.mapPartitions(it => { val min = Array.fill(featureNumber)(typeInfo.value.maxValue) val max = Array.fill(featureNumber)(typeInfo.value.minValue) def setMinMax(index: Int, value: T): Unit = { if (typeInfo.value.isMinOrEqual(value, min(index))) { min(index) = value } if (typeInfo.value.isMinOrEqual(max(index), value)) { max(index) = value } } it.foreach(t => { t.features.foreachActive(setMinMax) }) min.zip(max).zipWithIndex.map(_.swap).toIterator }).reduceByKey((a, b) => (typeInfo.value.min(a._1, b._1), typeInfo.value.max(a._2, b._2))).collectAsMap() } }
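ScalingBasic computes a per-feature (min, max) map in a single pass (mapPartitions plus reduceByKey) and ships it to the executors as a broadcast before rescaling each row. The sketch below keeps the same shape but strips it down to plain Array[Double] rows; the data and names are illustrative assumptions.

import org.apache.spark.{SparkConf, SparkContext}

object MinMaxScaleSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("minmax").setMaster("local[*]"))
    val data = sc.parallelize(Seq(Array(1.0, 10.0), Array(3.0, 30.0), Array(2.0, 20.0)))
    // Per-feature (min, max), computed once and shipped to every task as a broadcast.
    val ranges = data
      .flatMap(_.zipWithIndex.map { case (v, i) => (i, (v, v)) })
      .reduceByKey { case ((lo1, hi1), (lo2, hi2)) => (math.min(lo1, lo2), math.max(hi1, hi2)) }
      .collectAsMap()
    val bcRanges = sc.broadcast(ranges)
    // Scale every value to [0, 1] using the broadcast ranges.
    val scaled = data.map(_.zipWithIndex.map { case (v, i) =>
      val (lo, hi) = bcRanges.value(i)
      if (hi == lo) 0.0 else (v - lo) / (hi - lo)
    })
    scaled.collect().foreach(row => println(row.mkString(",")))
    sc.stop()
  }
}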
Example 67
Source File: MapPartitionsRWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.r import org.apache.spark.api.r._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.api.r.SQLUtils._ import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType case class MapPartitionsRWrapper( func: Array[Byte], packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]], inputSchema: StructType, outputSchema: StructType) extends (Iterator[Any] => Iterator[Any]) { def apply(iter: Iterator[Any]): Iterator[Any] = { // If the content of current DataFrame is serialized R data? val isSerializedRData = if (inputSchema == SERIALIZED_R_DATA_SCHEMA) true else false val (newIter, deserializer, colNames) = if (!isSerializedRData) { // Serialize each row into a byte array that can be deserialized in the R worker (iter.asInstanceOf[Iterator[Row]].map {row => rowToRBytes(row)}, SerializationFormats.ROW, inputSchema.fieldNames) } else { (iter.asInstanceOf[Iterator[Row]].map { row => row(0) }, SerializationFormats.BYTE, null) } val serializer = if (outputSchema != SERIALIZED_R_DATA_SCHEMA) { SerializationFormats.ROW } else { SerializationFormats.BYTE } val runner = new RRunner[Array[Byte]]( func, deserializer, serializer, packageNames, broadcastVars, isDataFrame = true, colNames = colNames, mode = RRunnerModes.DATAFRAME_DAPPLY) // Partition index is ignored. Dataset has no support for mapPartitionsWithIndex. val outputIter = runner.compute(newIter, -1) if (serializer == SerializationFormats.ROW) { outputIter.map { bytes => bytesToRow(bytes, outputSchema) } } else { outputIter.map { bytes => Row.fromSeq(Seq(bytes)) } } } }
Example 68
Source File: LogisticRegression.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap import org.apache.spark.mllib.sparselr.Utils._ import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object LogisticRegression { def train(input: RDD[(Array[Double], Matrix)], optimizer: Optimizer ): (Array[Int], Array[Double]) = { val hdfsIndex2global = new Int2IntOpenHashMap() var index = 0 input.map { point => point._2 match { case x: CompressedSparseMatrix => println("x.length" + x.mappings.length) case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.count val global2hdfsIndex = input.map { point => point._2 match { case x: CompressedSparseMatrix => x.mappings case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.collect().flatMap(t => t).distinct global2hdfsIndex.foreach{value => hdfsIndex2global.put(value, index) index += 1 } val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global) val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache() val numTraining = examples.count() println(s"Training: $numTraining.") SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true) val examplesTest = examples.mapPartitions(_.flatMap { case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)}) val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size)) val newWeights = optimizer.optimize(examplesTest, weights) ((global2hdfsIndex, newWeights.toArray)) } //globalId to localId for mappings in Matrix def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap]) (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = { val hdfsIndex2global = bchdfsIndex2global.value partition._2 match { case x: CompressedSparseMatrix => val local2hdfsIndex = x.mappings for (i <- 0 until local2hdfsIndex.length) { local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i)) } case _ => throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.") } partition } }
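The interesting part of this example is the index compaction: the distinct feature ids that actually occur are collected, assigned dense 0-based positions, and the resulting table is broadcast so every partition can rewrite its local indices; the broadcast is then removed once training data is materialized. A simplified sketch of that pattern with a plain Scala Map in place of Int2IntOpenHashMap (toy ids assumed):

import org.apache.spark.{SparkConf, SparkContext}

object IndexRemapSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("remap").setMaster("local[*]"))
    // Sparse rows referring to a large, gappy global feature space.
    val rows = sc.parallelize(Seq(Array(7, 1000003, 42), Array(42, 7)))
    // Build a dense 0..n-1 id space from the ids that actually occur.
    val globalToDense = rows.flatMap(_.toSeq).distinct().collect().sorted.zipWithIndex.toMap
    val bcMapping = sc.broadcast(globalToDense)
    val remapped = rows.map(_.map(bcMapping.value))
    remapped.collect().foreach(r => println(r.mkString(" ")))
    // The conventional way to release a broadcast; the example above reaches into
    // SparkEnv's block manager directly instead.
    bcMapping.unpersist(blocking = true)
    sc.stop()
  }
}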
Example 69
Source File: RegressionMetricsSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.graphics.charts.Highcharts._
import org.apache.log4j.{Priority, Logger}
import org.apache.spark.Accumulator
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart}

    histogram(residuals, numBins = 20)
    title("Histogram of Regression Residuals")
  }
}

object RegressionMetricsSpark {

  def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long)
  : (Double, Double, Double, Double) = {
    val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean")

    val err: DenseVector[Double] = scoresAndLabels.map((sc) => {
      val diff = sc._1 - sc._2
      mean += sc._2
      val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1), 2)
      DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog)
    }).reduce((a, b) => a + b)

    val SS_res = err(1)

    val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value / size.toDouble)

    val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum()

    val rmse = math.sqrt(SS_res / size.toDouble)
    val mae = err(0) / size.toDouble
    // A zero total sum of squares would make R-squared undefined; report 0.0 in that case.
    val rsq = if (SS_tot != 0.0) 1 - (SS_res / SS_tot) else 0.0
    val rmsle = err(2) / size.toDouble
    (mae, rmse, rsq, rmsle)
  }
}
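computeKPIs folds the absolute, squared, and squared-log errors into one reduce over breeze vectors, then broadcasts the label mean for the SS_tot pass. The sketch below keeps the same two-pass structure with plain tuples and no accumulators; the scores and labels are toy values chosen for illustration.

import org.apache.spark.{SparkConf, SparkContext}

object RegressionKpiSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("kpi").setMaster("local[*]"))
    val scoresAndLabels = sc.parallelize(Seq((2.5, 3.0), (0.0, -0.5), (2.0, 2.0), (8.0, 7.0)))
    val n = scoresAndLabels.count().toDouble
    // First pass: absolute error, squared error, and label sum in a single reduce.
    val (absErr, sqErr, labelSum) = scoresAndLabels
      .map { case (s, y) => (math.abs(s - y), (s - y) * (s - y), y) }
      .reduce { case ((a1, b1, c1), (a2, b2, c2)) => (a1 + a2, b1 + b2, c1 + c2) }
    // Second pass: total sum of squares around the broadcast label mean.
    val mu = sc.broadcast(labelSum / n)
    val ssTot = scoresAndLabels.map { case (_, y) => math.pow(y - mu.value, 2) }.sum()
    val mae = absErr / n
    val rmse = math.sqrt(sqErr / n)
    val r2 = if (ssTot != 0.0) 1.0 - sqErr / ssTot else 0.0
    println(s"MAE=$mae RMSE=$rmse R2=$r2")
    sc.stop()
  }
}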
Example 70
Source File: implicits.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkio import com.leobenkel.zparkio.Services.SparkModule import com.leobenkel.zparkio.Services.SparkModule.SparkModule import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import zio.{BootstrapRuntime, ZIO} import scala.reflect.ClassTag import scala.reflect.runtime.universe._ // scalastyle:off object.name object implicits { type ZDS_R[R, A] = ZIO[R with SparkModule, Throwable, Dataset[A]] type ZDS[A] = ZDS_R[Any, A] type ZRDD_R[R, A] = ZIO[R, Throwable, RDD[A]] type ZRDD[A] = ZRDD_R[Any, A] type ZBC_R[R, A] = ZIO[R with SparkModule, Throwable, Broadcast[A]] type ZBC[A] = ZBC_R[Any, A] object ZDS { def map[A](f: SparkSession => Dataset[A]): ZDS[A] = SparkModule().map(spark => f(spark)) def flatMap[A](f: SparkSession => ZDS[A]): ZDS[A] = SparkModule().flatMap(spark => f(spark)) def flatMapR[R, A](f: SparkSession => ZDS_R[R, A]): ZDS_R[R, A] = SparkModule().flatMap(spark => f(spark)) def apply[A](f: SparkSession => Dataset[A]): ZDS[A] = ZDS.map(f) def make[A <: Product: TypeTag: ClassTag, B <: Product: TypeTag: ClassTag]( input: Dataset[A] )( f: Dataset[A] => Encoder[B] => Dataset[B] ): ZDS[B] = { ZDS { spark => f(input)(spark.implicits.newProductEncoder[B]) } } def apply[A <: Product: TypeTag: ClassTag](data: A*): ZDS[A] = { apply { spark => import spark.implicits._ data.toDS() } } def apply[A: Encoder](data: Seq[A]): ZDS[A] = { apply { spark => import spark.implicits._ data.toDS() } } def broadcast[A: ClassTag](f: SparkSession => A): ZBC[A] = { SparkModule().map(spark => spark.sparkContext.broadcast(f(spark))) } } implicit class DatasetZ[R, A](zds: => ZIO[R, Throwable, Dataset[A]]) extends Serializable { def zMap[B <: Product: TypeTag: ClassTag](f: A => ZIO[Any, Throwable, B]): ZDS_R[R, B] = { ZDS.flatMapR[R, B] { spark => import spark.implicits._ zds.map { ds => ds.map { a => val zB = f(a) val runtime = new BootstrapRuntime {} runtime.unsafeRun(zB) } } } } } } // scalastyle:on
Example 71
Source File: ResultTask.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.nio.ByteBuffer
import java.io._

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD

private[spark] class ResultTask[T, U](
    stageId: Int,
    taskBinary: Broadcast[Array[Byte]],
    partition: Partition,
    @transient locs: Seq[TaskLocation],
    val outputId: Int)
  extends Task[U](stageId, partition.index) with Serializable {

  @transient private[this] val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  // TODO Run the task logic
  override def runTask(context: TaskContext): U = {
    // Deserialize the RDD and the func using the broadcast variables.
    // TODO Get the serializer
    val ser = SparkEnv.get.closureSerializer.newInstance()
    // TODO Deserialize the task; this rdd is the first RDD of the stage, together with the function applied to it
    val (rdd, func) = ser.deserialize[(RDD[T], (TaskContext, Iterator[T]) => U)](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)

    metrics = Some(context.taskMetrics)
    // TODO Invoke the function on the RDD, pulling records from the iterator one at a time
    func(context, rdd.iterator(partition, context))
  }

  // This is only callable on the driver side.
  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString = "ResultTask(" + stageId + ", " + partitionId + ")"
}
Example 72
Source File: ShuffleMapTask.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter def this(partitionId: Int) { this(0, null, new Partition { override def index = 0 }, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } override def runTask(context: TaskContext): MapStatus = { // Deserialize the RDD using the broadcast variable. val ser = SparkEnv.get.closureSerializer.newInstance() val (rdd, dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) metrics = Some(context.taskMetrics) var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) return writer.stop(success = true).get } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 73
Source File: BaseTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.DefaultParamsWritable
import org.apache.spark.sql.Row

abstract class BaseTimeSeriesGenerator
    extends Transformer
    with HasInputCol
    with HasOutputCol
    with HasTimeCol
    with DefaultParamsWritable
    with HasLabelCol
    with HasFeaturesCol {

  def convertRowToFloat(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toFloat(value)
    }
    // Expand the converted values into the Row, mirroring convertRowToDouble below.
    Row(values: _*)
  }

  def convertRowToDouble(toBeConverted: Row): Row = {
    val values = (0 until toBeConverted.length).map { index =>
      val value = toBeConverted.get(index)
      DataTransformer.toDouble(value)
    }
    Row(values: _*)
  }

  def convertColumnToDouble(toBeTransformed: Row, colIndex: Broadcast[Int]): Row = {
    val (prior, after) = toBeTransformed.toSeq.splitAt(colIndex.value)
    val converted = DataTransformer.toDouble(toBeTransformed.get(colIndex.value))
    val result = (prior :+ converted.toDouble) ++ after.tail
    Row(result: _*)
  }
}
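convertColumnToDouble rebuilds each Row around a broadcast column index by splitting the row's values at that position, coercing the selected value, and reassembling. A self-contained sketch of that splitAt/rebuild pattern on an RDD of Rows; the column layout and values here are assumptions for illustration only.

import org.apache.spark.sql.Row
import org.apache.spark.{SparkConf, SparkContext}

object RowColumnConvertSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("row-convert").setMaster("local[*]"))
    val colIndex = sc.broadcast(1) // which column to coerce to Double
    val rows = sc.parallelize(Seq(Row("a", "1.5", true), Row("b", "2.0", false)))
    val converted = rows.map { row =>
      val (prior, after) = row.toSeq.splitAt(colIndex.value)
      val coerced = after.head.toString.toDouble
      Row((prior :+ coerced) ++ after.tail: _*)
    }
    converted.collect().foreach(println) // e.g. [a,1.5,true]
    sc.stop()
  }
}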
Example 74
Source File: HoltWintersBestModelEvaluation.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import eleflow.uberdata.enums.SupportedAlgorithm import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import scala.reflect.ClassTag abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]]( implicit kt: ClassTag[L], ord: Ordering[L] = null ) extends BestModelFinder[L, M] with HoltWintersParams { protected def holtWintersEvaluation( row: Row, model: UberHoltWintersModel, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]], id: L ): (UberHoltWintersModel, ModelParamEvaluation[L]) = { val features = row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) log.warn( s"Evaluating forecast for id $id, with parameters " + s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}" ) val expectedResult = row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol) val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures))) model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray val toBeValidated = expectedResult.toArray.zip(forecastToBeValidated.toArray) val metric = broadcastEvaluator.value.evaluate(toBeValidated) val metricName = broadcastEvaluator.value.getMetricName val params = ParamMap().put( ParamPair(gamma, model.gamma), ParamPair(beta, model.beta), ParamPair(alpha, model.alpha) ) (model, new ModelParamEvaluation[L]( id, metric, params, Some(metricName), SupportedAlgorithm.HoltWinters )) } }
Example 75
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]] extends BestModelFinder[G, M] with HasGroupByCol { protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast { StructType( Seq( StructField($(groupByCol).get, FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT)))) } protected def xGBoostEvaluation(row: Row, model: Booster, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]], id: G, parameters: ParamMap): ModelParamEvaluation[G] = { val featuresArray = row .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME) .map { vec => val values = vec.toArray.map(DataTransformer.toFloat) LabeledPoint(values.head, null, values.tail) } val features = new DMatrix(featuresArray.toIterator) log.warn(s"Evaluating forecast for id $id, with xgboost") val prediction = model.predict(features).flatten val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length) val toBeValidated = featuresArray.zip(forecastToBeValidated) val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f => (f._1.label.toDouble, f._2.toDouble))) val metricName = broadcastEvaluator.value.getMetricName new ModelParamEvaluation[G]( id, metric, parameters, Some(metricName), SupportedAlgorithm.XGBoostAlgorithm) } }