org.apache.spark.partial.PartialResult Scala Examples
The following examples show how to use org.apache.spark.partial.PartialResult.
Follow the link above each example to go to the original project or source file.
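PartialResult is the type returned by Spark's approximate actions such as RDD.countApprox and DoubleRDDFunctions.meanApprox: an initial, bounded estimate that can later be refined into a final value. Before the project examples, here is a minimal, self-contained sketch of that API; the object name, local SparkContext, and sample data are invented for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}

object PartialResultSketch { // hypothetical example object, not from any project above
  def main(args: Array[String]): Unit = {
    // Hypothetical local context and data, just for illustration.
    val sc = new SparkContext("local[*]", "partial-result-sketch")
    val numbers = sc.parallelize(1L to 1000000L)

    // countApprox returns a PartialResult[BoundedDouble]: an early estimate with
    // confidence bounds, produced within the given timeout (in milliseconds).
    val approx: PartialResult[BoundedDouble] = numbers.countApprox(timeout = 2000, confidence = 0.95)

    val estimate = approx.initialValue
    println(s"initial estimate: mean=${estimate.mean}, bounds=[${estimate.low}, ${estimate.high}]")

    // getFinalValue() blocks until the underlying job finishes and the result is final.
    val finalValue = approx.getFinalValue()
    println(s"final count: ${finalValue.mean}")

    sc.stop()
  }
}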
Example 1
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Utilities

import SparkER.BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import SparkER.DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks],
                            separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() > 0).map(x => x)
  }
}
Example 2
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks],
                            separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 3
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorID: Long = -1): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorID < 0) {
          BlockDirty(blockID, (profilesID, Set.empty))
        }
        else {
          BlockClean(blockID, profilesID.partition(_ <= separatorID))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 4
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource] class DatasourceRDD(
    @transient sqlContext: SQLContext,
    inputSentences: InputSentences,
    datasourceParams: Map[String, String]
  ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] =
    dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] =
    dataFrame.rdd.preferredLocations(thePart)
}
Example 5
Source File: RDDSamplers.scala From sparkplug with MIT License | 5 votes |
package springnz.sparkplug.testkit

import java.lang.Math._

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.partial.{ BoundedDouble, PartialResult }
import org.apache.spark.rdd.RDD
import springnz.sparkplug.util.SerializeUtils

import scala.reflect.ClassTag

object RDDSamplers extends LazyLogging {

  def identitySampler[A: ClassTag](rdd: RDD[A]): RDD[A] = rdd

  def shrinkingSampler[A: ClassTag](sampleParams: RDDShrinkingSamplerParams = sourceRDDParams)(rdd: RDD[A]): RDD[A] =
    shrinkingSample(rdd, sampleParams)

  def takeSampler[A: ClassTag](count: Int, partitions: Int = -1)(rdd: RDD[A]): RDD[A] = {
    val sc = rdd.sparkContext
    val parts = if (partitions > 0) partitions else sc.defaultParallelism
    sc.parallelize(rdd.take(count), parts)
  }

  val sourceRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.0001,
    scaleParam = 3000.0,
    scalePower = 0.30102999566398,
    minimum = 1000000.0,
    sequential = false)

  val derivedRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.1,
    scaleParam = 1.0,
    scalePower = 1.0,
    minimum = 1000000.0,
    sequential = false)

  private[sparkplug] def shrinkFactor(
    testerFraction: Double,
    scaleParam: Double,
    scalePower: Double,
    minimum: Double,
    testerLength: Double): Double = {
    val fullLength = testerLength / testerFraction
    val calcFrac = Math.pow(fullLength, scalePower) / fullLength * scaleParam
    // don't bother shrinking to less than the minimum, but cap at 1.0
    min(if (minimum > 0) max(calcFrac, minimum / fullLength) else calcFrac, 1.0)
  }

  private[sparkplug] def shrinkingSample[A: ClassTag](rdd: RDD[A], params: RDDShrinkingSamplerParams): RDD[A] = {

    def getSample(params: RDDShrinkingSamplerParams): RDD[A] = {
      val approxSize: PartialResult[BoundedDouble] = rdd.countApprox(60000, 0.95)
      val sampleLength = approxSize.initialValue.mean * params.testerFraction
      if (sampleLength < 50) {
        // take a bigger shrinkingSample
        val updatedTesterFraction = params.testerFraction * 50 / sampleLength
        getSample(params.copy(testerFraction = updatedTesterFraction))
      } else {
        val sample = rdd.take(sampleLength.toInt)
        val tester = SerializeUtils.serialize(sample)
        val sampleFraction = shrinkFactor(params.testerFraction, params.scaleParam, params.scalePower,
          params.minimum, tester.length)
        val fullCount = sample.length / params.testerFraction
        val reSampled = if (params.sequential)
          rdd.sparkContext.parallelize(rdd.take((fullCount * sampleFraction).toInt), 10)
        else
          rdd.sample(withReplacement = true, sampleFraction, 0)
        reSampled
      }
    }

    if (params.scaleParam == 1.0 && params.scalePower == 1.0) {
      logger.info("Not sampling RDD since scaleParam==scalePower==1.0")
      rdd
    } else {
      logger.info(s"Sampling RDD with $params ...")
      getSample(params)
    }
  }

  case class RDDShrinkingSamplerParams(
      testerFraction: Double,
      scaleParam: Double,
      scalePower: Double,
      minimum: Double,
      sequential: Boolean) {
    def withSequential(newSequential: Boolean): RDDShrinkingSamplerParams =
      RDDShrinkingSamplerParams(testerFraction, scaleParam, scalePower, minimum, newSequential)
  }
}
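In the sampler above, shrinkingSample only needs a rough idea of the RDD's size to pick a sample fraction, so it reads initialValue.mean from countApprox rather than paying for an exact count(). The stripped-down sketch below isolates that pattern; it is not part of sparkplug, and the object name, context setup, and data are invented for illustration (the 60-second timeout mirrors the value used above).

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.rdd.RDD

object ApproxSizeSketch { // hypothetical example object
  // Estimate an RDD's size without waiting for an exact count: ask for an
  // approximate count and read the first available estimate.
  def approxSize(rdd: RDD[_], timeoutMs: Long = 60000, confidence: Double = 0.95): Double = {
    val partial: PartialResult[BoundedDouble] = rdd.countApprox(timeoutMs, confidence)
    partial.initialValue.mean
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "approx-size-sketch")
    val rdd = sc.parallelize(1 to 100000)
    println(s"approximate size: ${approxSize(rdd)}")
    sc.stop()
  }
}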
Example 6
Source File: DoubleDCFunctions.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc

import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.util.StatCounter

class DoubleDCFunctions(self: DC[Double]) {

  def sum: DR[Double] = {
    self.mapToResult(_.sum)
  }

  def stats: DR[StatCounter] = {
    self.mapToResult(_.stats)
  }

  def mean: DR[Double] = {
    self.mapToResult(_.mean)
  }

  def variance: DR[Double] = {
    self.mapToResult(_.variance)
  }

  def stdev: DR[Double] = {
    self.mapToResult(_.stdev)
  }

  def sampleStdev: DR[Double] = {
    self.mapToResult(_.sampleStdev)
  }

  def sampleVariance: DR[Double] = {
    self.mapToResult(_.sampleVariance)
  }

  // Experimental
  def meanApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.meanApprox(timeout, confidence))
  }

  // Experimental
  def sumApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.sumApprox(timeout, confidence))
  }

  def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = {
    self.mapToResult(_.histogram(bucketCount))
  }

  def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = {
    self.mapToResult(_.histogram(buckets, evenBuckets))
  }
}
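The two Approx methods above appear to forward to Spark's DoubleRDDFunctions, which are reachable on any RDD[Double] through implicit conversions. The sketch below shows the same calls on a plain RDD[Double], including PartialResult's onComplete callback; the object name, context setup, and data are invented for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}

object ApproxMeanSketch { // hypothetical example object, not part of spark-flow
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "approx-mean-sketch")
    val doubles = sc.parallelize((1 to 100000).map(_.toDouble))

    // meanApprox comes from DoubleRDDFunctions, available implicitly on RDD[Double].
    val partial: PartialResult[BoundedDouble] = doubles.meanApprox(timeout = 1000, confidence = 0.9)

    // Instead of polling, register a handler that runs once the final value is ready
    // (it fires immediately if the job already finished within the timeout).
    partial.onComplete { bound =>
      println(s"final mean: ${bound.mean} (confidence ${bound.confidence})")
    }

    // The initial estimate is available right away, with error bounds.
    val estimate = partial.initialValue
    println(s"estimated mean in [${estimate.low}, ${estimate.high}]")

    sc.stop()
  }
}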