org.apache.spark.partial.PartialResult Scala Examples
The following examples show how to use org.apache.spark.partial.PartialResult.
Follow the link above each example to go to the original project or source file.
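PartialResult is the type returned by Spark's approximate actions such as RDD.countApprox and DoubleRDDFunctions.meanApprox: an initial, bounded estimate that can later be refined into a final value. Before the project examples, here is a minimal, self-contained sketch of that API; the object name, local SparkContext, and sample data are invented for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}

object PartialResultSketch { // hypothetical example object, not from any project above
  def main(args: Array[String]): Unit = {
    // Hypothetical local context and data, just for illustration.
    val sc = new SparkContext("local[*]", "partial-result-sketch")
    val numbers = sc.parallelize(1L to 1000000L)

    // countApprox returns a PartialResult[BoundedDouble]: an early estimate with
    // confidence bounds, produced within the given timeout (in milliseconds).
    val approx: PartialResult[BoundedDouble] = numbers.countApprox(timeout = 2000, confidence = 0.95)

    val estimate = approx.initialValue
    println(s"initial estimate: mean=${estimate.mean}, bounds=[${estimate.low}, ${estimate.high}]")

    // getFinalValue() blocks until the underlying job finishes and the result is final.
    val finalValue = approx.getFinalValue()
    println(s"final count: ${finalValue.mean}")

    sc.stop()
  }
}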
Example 1
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package SparkER.Utilities

import SparkER.BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import SparkER.DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks],
                            separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() > 0).map(x => x)
  }
}
Example 2
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import BlockBuildingMethods.TokenBlocking
import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks],
                            separatorIDs: Array[Long] = Array.emptyLongArray): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorIDs.isEmpty) {
          BlockDirty(blockID, Array(profilesID))
        }
        else {
          BlockClean(blockID, TokenBlocking.separateProfiles(profilesID, separatorIDs))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 3
Source File: Converters.scala From sparker with GNU General Public License v3.0 | 5 votes |
package Utilities

import org.apache.spark.rdd.RDD
import DataStructures._
import org.apache.spark.partial.PartialResult

object Converters { // enclosing object assumed from the source file name

  def profilesBlockToBlocks(profilesBlocks: RDD[ProfileBlocks], separatorID: Long = -1): RDD[BlockAbstract] = {

    val blockIDProfileID = profilesBlocks flatMap {
      profileWithBlocks =>
        val profileID = profileWithBlocks.profileID
        profileWithBlocks.blocks map {
          BlockWithSize =>
            (BlockWithSize.blockID, profileID)
        }
    }

    val blocks = blockIDProfileID.groupByKey().map {
      block =>
        val blockID = block._1
        val profilesID = block._2.toSet
        if (separatorID < 0) {
          BlockDirty(blockID, (profilesID, Set.empty))
        }
        else {
          BlockClean(blockID, profilesID.partition(_ <= separatorID))
        }
    }

    blocks.filter(_.getComparisonSize() >= 1).map(x => x)
  }
}
Example 4
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver

import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.streaming.datasource.config.ParametersUtils
import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator}
import org.apache.spark.{Logging, Partition, TaskContext}

private[datasource] class DatasourceRDD(
    @transient sqlContext: SQLContext,
    inputSentences: InputSentences,
    datasourceParams: Map[String, String]
  ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils {

  private var totalCalculated: Option[Long] = None

  private val InitTableName = "initTable"
  private val LimitedTableName = "limitedTable"
  private val TempInitQuery = s"select * from $InitTableName"

  val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset =>
    val parsedQuery = parseInitialQuery
    val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery)
    val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty)
    val limitSentence = inputSentences.extractLimitSentence

    sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence)
  }

  private def parseInitialQuery: String = {
    if (inputSentences.query.toUpperCase.contains("WHERE") ||
      inputSentences.query.toUpperCase.contains("ORDER") ||
      inputSentences.query.toUpperCase.contains("LIMIT")
    ) {
      sqlContext.sql(inputSentences.query).registerTempTable(InitTableName)
      TempInitQuery
    } else inputSentences.query
  }

  def progressInputSentences: InputSentences = {
    if (!dataFrame.rdd.isEmpty()) {
      inputSentences.offsetConditions.fold(inputSentences) { case offset =>

        val offsetValue = if (offset.limitRecords.isEmpty)
          dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        else {
          dataFrame.registerTempTable(LimitedTableName)
          val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " +
            s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1"

          sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name))
        }

        inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy(
          value = Option(offsetValue),
          operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator)))))
      }
    } else inputSentences
  }

  override def isEmpty(): Boolean = {
    totalCalculated.fold {
      withScope {
        partitions.length == 0 || take(1).length == 0
      }
    } { total => total == 0L }
  }

  override def getPartitions: Array[Partition] = dataFrame.rdd.partitions

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] =
    dataFrame.rdd.compute(thePart, context)

  override def getPreferredLocations(thePart: Partition): Seq[String] =
    dataFrame.rdd.preferredLocations(thePart)
}
Example 5
Source File: RDDSamplers.scala From sparkplug with MIT License | 5 votes |
package springnz.sparkplug.testkit

import java.lang.Math._

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.partial.{ BoundedDouble, PartialResult }
import org.apache.spark.rdd.RDD
import springnz.sparkplug.util.SerializeUtils

import scala.reflect.ClassTag

object RDDSamplers extends LazyLogging {

  def identitySampler[A: ClassTag](rdd: RDD[A]): RDD[A] = rdd

  def shrinkingSampler[A: ClassTag](sampleParams: RDDShrinkingSamplerParams = sourceRDDParams)(rdd: RDD[A]): RDD[A] =
    shrinkingSample(rdd, sampleParams)

  def takeSampler[A: ClassTag](count: Int, partitions: Int = -1)(rdd: RDD[A]): RDD[A] = {
    val sc = rdd.sparkContext
    val parts = if (partitions > 0) partitions else sc.defaultParallelism
    sc.parallelize(rdd.take(count), parts)
  }

  val sourceRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.0001,
    scaleParam = 3000.0,
    scalePower = 0.30102999566398,
    minimum = 1000000.0,
    sequential = false)

  val derivedRDDParams = RDDShrinkingSamplerParams(
    testerFraction = 0.1,
    scaleParam = 1.0,
    scalePower = 1.0,
    minimum = 1000000.0,
    sequential = false)

  private[sparkplug] def shrinkFactor(
    testerFraction: Double,
    scaleParam: Double,
    scalePower: Double,
    minimum: Double,
    testerLength: Double): Double = {
    val fullLength = testerLength / testerFraction
    val calcFrac = Math.pow(fullLength, scalePower) / fullLength * scaleParam
    // don't bother shrinking to less than the minimum, but cap at 1.0
    min(if (minimum > 0) max(calcFrac, minimum / fullLength) else calcFrac, 1.0)
  }

  private[sparkplug] def shrinkingSample[A: ClassTag](rdd: RDD[A], params: RDDShrinkingSamplerParams): RDD[A] = {

    def getSample(params: RDDShrinkingSamplerParams): RDD[A] = {
      val approxSize: PartialResult[BoundedDouble] = rdd.countApprox(60000, 0.95)
      val sampleLength = approxSize.initialValue.mean * params.testerFraction
      if (sampleLength < 50) {
        // take a bigger shrinkingSample
        val updatedTesterFraction = params.testerFraction * 50 / sampleLength
        getSample(params.copy(testerFraction = updatedTesterFraction))
      } else {
        val sample = rdd.take(sampleLength.toInt)
        val tester = SerializeUtils.serialize(sample)
        val sampleFraction = shrinkFactor(params.testerFraction, params.scaleParam, params.scalePower,
          params.minimum, tester.length)
        val fullCount = sample.length / params.testerFraction
        val reSampled = if (params.sequential)
          rdd.sparkContext.parallelize(rdd.take((fullCount * sampleFraction).toInt), 10)
        else
          rdd.sample(withReplacement = true, sampleFraction, 0)
        reSampled
      }
    }

    if (params.scaleParam == 1.0 && params.scalePower == 1.0) {
      logger.info("Not sampling RDD since scaleParam==scalePower==1.0")
      rdd
    } else {
      logger.info(s"Sampling RDD with $params ...")
      getSample(params)
    }
  }

  case class RDDShrinkingSamplerParams(
      testerFraction: Double,
      scaleParam: Double,
      scalePower: Double,
      minimum: Double,
      sequential: Boolean) {
    def withSequential(newSequential: Boolean): RDDShrinkingSamplerParams =
      RDDShrinkingSamplerParams(testerFraction, scaleParam, scalePower, minimum, newSequential)
  }
}
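In the sampler above, shrinkingSample only needs a rough idea of the RDD's size to pick a sample fraction, so it reads initialValue.mean from countApprox rather than paying for an exact count(). The stripped-down sketch below isolates that pattern; it is not part of sparkplug, and the object name, context setup, and data are invented for illustration (the 60-second timeout mirrors the value used above).

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.rdd.RDD

object ApproxSizeSketch { // hypothetical example object
  // Estimate an RDD's size without waiting for an exact count: ask for an
  // approximate count and read the first available estimate.
  def approxSize(rdd: RDD[_], timeoutMs: Long = 60000, confidence: Double = 0.95): Double = {
    val partial: PartialResult[BoundedDouble] = rdd.countApprox(timeoutMs, confidence)
    partial.initialValue.mean
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "approx-size-sketch")
    val rdd = sc.parallelize(1 to 100000)
    println(s"approximate size: ${approxSize(rdd)}")
    sc.stop()
  }
}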
Example 6
Source File: DoubleDCFunctions.scala From spark-flow with Apache License 2.0 | 5 votes |
package com.bloomberg.sparkflow.dc

import org.apache.spark.partial.{BoundedDouble, PartialResult}
import org.apache.spark.util.StatCounter

class DoubleDCFunctions(self: DC[Double]) {

  def sum: DR[Double] = {
    self.mapToResult(_.sum)
  }

  def stats: DR[StatCounter] = {
    self.mapToResult(_.stats)
  }

  def mean: DR[Double] = {
    self.mapToResult(_.mean)
  }

  def variance: DR[Double] = {
    self.mapToResult(_.variance)
  }

  def stdev: DR[Double] = {
    self.mapToResult(_.stdev)
  }

  def sampleStdev: DR[Double] = {
    self.mapToResult(_.sampleStdev)
  }

  def sampleVariance: DR[Double] = {
    self.mapToResult(_.sampleVariance)
  }

  // Experimental
  def meanApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.meanApprox(timeout, confidence))
  }

  // Experimental
  def sumApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = {
    self.mapToResult(_.sumApprox(timeout, confidence))
  }

  def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = {
    self.mapToResult(_.histogram(bucketCount))
  }

  def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = {
    self.mapToResult(_.histogram(buckets, evenBuckets))
  }
}
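The two Approx methods above appear to forward to Spark's DoubleRDDFunctions, which are reachable on any RDD[Double] through implicit conversions. The sketch below shows the same calls on a plain RDD[Double], including PartialResult's onComplete callback; the object name, context setup, and data are invented for illustration.

import org.apache.spark.SparkContext
import org.apache.spark.partial.{BoundedDouble, PartialResult}

object ApproxMeanSketch { // hypothetical example object, not part of spark-flow
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[*]", "approx-mean-sketch")
    val doubles = sc.parallelize((1 to 100000).map(_.toDouble))

    // meanApprox comes from DoubleRDDFunctions, available implicitly on RDD[Double].
    val partial: PartialResult[BoundedDouble] = doubles.meanApprox(timeout = 1000, confidence = 0.9)

    // Instead of polling, register a handler that runs once the final value is ready
    // (it fires immediately if the job already finished within the timeout).
    partial.onComplete { bound =>
      println(s"final mean: ${bound.mean} (confidence ${bound.confidence})")
    }

    // The initial estimate is available right away, with error bounds.
    val estimate = partial.initialValue
    println(s"estimated mean in [${estimate.low}, ${estimate.high}]")

    sc.stop()
  }
}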