org.apache.spark.SparkEnv Scala Examples
The following examples show how to use org.apache.spark.SparkEnv.
Each snippet is excerpted from an open-source project; the source file, originating project, and license are noted in the header above it.
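Most of the snippets follow the same access pattern: obtain the per-JVM environment with SparkEnv.get and pull a service off it. A minimal sketch (not taken from any of the projects below) of what that looks like:

import org.apache.spark.SparkEnv

// SparkEnv.get returns the environment of the JVM it is called in (driver or executor).
// It is only populated once a SparkContext / SparkSession has been created.
val env = SparkEnv.get

val conf       = env.conf            // the running application's SparkConf
val serializer = env.serializer      // default data serializer
val blockMgr   = env.blockManager    // block storage on this node
val shuffleMgr = env.shuffleManager  // the active shuffle implementation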
Example 1
Source File: RUtils.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.api.r

import java.io.File
import java.util.Arrays

import org.apache.spark.{SparkEnv, SparkException}

private[spark] object RUtils {
  // Local path where R binary packages built from R source code contained in the spark
  // packages specified with "--packages" or "--jars" command line option reside.
  var rPackages: Option[String] = None

  def isRInstalled: Boolean = {
    try {
      val builder = new ProcessBuilder(Arrays.asList("R", "--version"))
      builder.start().waitFor() == 0
    } catch {
      case e: Exception => false
    }
  }
}
Example 2
Source File: IndexShuffleBlockResolver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle import java.io._ import com.google.common.io.ByteStreams import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.storage._ import org.apache.spark.util.Utils import IndexShuffleBlockResolver.NOOP_REDUCE_ID def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long]): Unit = { val indexFile = getIndexFile(shuffleId, mapId) val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile))) Utils.tryWithSafeFinally { // We take in lengths of each block, need to convert it to offsets. var offset = 0L out.writeLong(offset) for (length <- lengths) { offset += length out.writeLong(offset) } } { out.close() } } override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { // The block is actually going to be a range of a single map output file for this map, so // find out the consolidated file, then the offset within that from our index val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) val in = new DataInputStream(new FileInputStream(indexFile)) try { ByteStreams.skipFully(in, blockId.reduceId * 8) val offset = in.readLong() val nextOffset = in.readLong() new FileSegmentManagedBuffer( transportConf, getDataFile(blockId.shuffleId, blockId.mapId), offset, nextOffset - offset) } finally { in.close() } } override def stop(): Unit = {} } private[spark] object IndexShuffleBlockResolver { // No-op reduce ID used in interactions with disk store and BlockObjectWriter. // The disk store currently expects puts to relate to a (map, reduce) pair, but in the sort // shuffle outputs for several reduces are glommed into a single file. // TODO: Avoid this entirely by having the DiskBlockObjectWriter not require a BlockId. val NOOP_REDUCE_ID = 0 }
Example 3
Source File: CachedKafkaProducer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010

import java.{util => ju}
import java.util.concurrent.{ConcurrentMap, ExecutionException, TimeUnit}

import com.google.common.cache._
import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException}
import org.apache.kafka.clients.producer.KafkaProducer

import scala.collection.JavaConverters._
import scala.util.control.NonFatal

import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging

private[kafka010] object CachedKafkaProducer extends Logging {

  private type Producer = KafkaProducer[Array[Byte], Array[Byte]]

  private lazy val cacheExpireTimeout: Long =
    SparkEnv.get.conf.getTimeAsMs("spark.kafka.producer.cache.timeout", "10m")

  private val cacheLoader = new CacheLoader[Seq[(String, Object)], Producer] {
    override def load(config: Seq[(String, Object)]): Producer = {
      val configMap = config.map(x => x._1 -> x._2).toMap.asJava
      createKafkaProducer(configMap)
    }
  }

  private val removalListener = new RemovalListener[Seq[(String, Object)], Producer]() {
    override def onRemoval(
        notification: RemovalNotification[Seq[(String, Object)], Producer]): Unit = {
      val paramsSeq: Seq[(String, Object)] = notification.getKey
      val producer: Producer = notification.getValue
      logDebug(
        s"Evicting kafka producer $producer params: $paramsSeq, due to ${notification.getCause}")
      close(paramsSeq, producer)
    }
  }

  private lazy val guavaCache: LoadingCache[Seq[(String, Object)], Producer] =
    CacheBuilder.newBuilder().expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS)
      .removalListener(removalListener)
      .build[Seq[(String, Object)], Producer](cacheLoader)

  private def createKafkaProducer(producerConfiguration: ju.Map[String, Object]): Producer = {
    val kafkaProducer: Producer = new Producer(producerConfiguration)
    logDebug(s"Created a new instance of KafkaProducer for $producerConfiguration.")
    kafkaProducer
  }

  private def close(paramsSeq: Seq[(String, Object)], producer: Producer): Unit = {
    try {
      logInfo(s"Closing the KafkaProducer with params: ${paramsSeq.mkString("\n")}.")
      producer.close()
    } catch {
      case NonFatal(e) => logWarning("Error while closing kafka producer.", e)
    }
  }

  private def clear(): Unit = {
    logInfo("Cleaning up guava cache.")
    guavaCache.invalidateAll()
  }

  // Intended for testing purpose only.
  private def getAsMap: ConcurrentMap[Seq[(String, Object)], Producer] = guavaCache.asMap()
}
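CachedKafkaProducer reads its cache timeout straight from SparkEnv.get.conf, which assumes the environment already exists; SparkEnv.get is null before a SparkContext is up. A hedged sketch of a more defensive variant of the same lookup (the fallback constant is illustrative; Example 32, CachedPulsarClient, uses this exact pattern):

import java.util.concurrent.TimeUnit
import org.apache.spark.SparkEnv

// Fall back to a fixed default when no SparkEnv is available, e.g. in plain unit tests.
private lazy val cacheExpireTimeout: Long =
  Option(SparkEnv.get)
    .map(_.conf.getTimeAsMs("spark.kafka.producer.cache.timeout", "10m"))
    .getOrElse(TimeUnit.MINUTES.toMillis(10))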
Example 4
Source File: LocalRDDCheckpointData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext}
import org.apache.spark.storage.{RDDBlockId, StorageLevel}
import org.apache.spark.util.Utils

// Excerpt: the enclosing LocalRDDCheckpointData declaration is elided in this listing.
def transformStorageLevel(level: StorageLevel): StorageLevel = {
  // If this RDD is to be cached off-heap, fail fast since we cannot provide any
  // correctness guarantees about subsequent computations after the first one
  if (level.useOffHeap) {
    throw new SparkException("Local checkpointing is not compatible with off-heap caching.")
  }

  StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication)
}
Example 5
Source File: BlockManagerSlaveEndpoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import scala.concurrent.{ ExecutionContext, Future }

import org.apache.spark.rpc.{ RpcEnv, RpcCallContext, RpcEndpoint }
import org.apache.spark.util.ThreadUtils
import org.apache.spark.{ Logging, MapOutputTracker, SparkEnv }
import org.apache.spark.storage.BlockManagerMessages._

private[storage] class BlockManagerSlaveEndpoint(
    override val rpcEnv: RpcEnv,
    blockManager: BlockManager, // used to exchange messages with the BlockManagerMaster
    mapOutputTracker: MapOutputTracker)
  extends RpcEndpoint with Logging {

  private val asyncThreadPool =
    ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool")
  private implicit val asyncExecutionContext =
    ExecutionContext.fromExecutorService(asyncThreadPool)

  // Operations that involve removing blocks may be slow and should be done asynchronously
  override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = {
    // Remove the block identified by blockId from this executor
    case RemoveBlock(blockId) =>
      doAsync[Boolean]("removing block " + blockId, context) {
        blockManager.removeBlock(blockId)
        true
      }

    // On RemoveRdd from the BlockManagerMasterEndpoint, remove every block on this
    // executor that belongs to the RDD with the given rddId
    case RemoveRdd(rddId) =>
      doAsync[Int]("removing RDD " + rddId, context) {
        blockManager.removeRdd(rddId)
      }

    // Remove every block on this executor that belongs to the shuffle with the given shuffleId
    case RemoveShuffle(shuffleId) =>
      doAsync[Boolean]("removing shuffle " + shuffleId, context) {
        if (mapOutputTracker != null) {
          mapOutputTracker.unregisterShuffle(shuffleId)
        }
        SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId)
      }

    // Remove every block on this executor associated with the given broadcast variable;
    // tellMaster controls whether the updated status is reported back to the master
    case RemoveBroadcast(broadcastId, _) =>
      doAsync[Int]("removing broadcast " + broadcastId, context) {
        blockManager.removeBroadcast(broadcastId, tellMaster = true)
      }

    // Reply with the status of the given block
    case GetBlockStatus(blockId, _) =>
      context.reply(blockManager.getStatus(blockId))

    // Reply with the block IDs matching the given filter
    case GetMatchingBlockIds(filter, _) =>
      context.reply(blockManager.getMatchingBlockIds(filter))
  }

  // Curried helper: runs the body asynchronously and replies through the RPC context
  private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) {
    val future = Future {
      logDebug(actionMessage)
      body
    }
    future.onSuccess { case response =>
      logDebug("Done " + actionMessage + ", response is " + response)
      context.reply(response)
      logDebug("Sent response: " + response + " to " + context.sender)
    }
    future.onFailure { case t: Throwable =>
      logError("Error in " + actionMessage, t)
      context.sendFailure(t)
    }
  }

  override def onStop(): Unit = {
    asyncThreadPool.shutdownNow()
  }
}
Example 6
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // host name or IP address of the machine running the driver
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
Example 7
Source File: TaskResult.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// Excerpt: the method below belongs to a TaskResult implementation whose declaration
// is elided in this listing.
def value(): T = {
  if (valueObjectDeserialized) {
    valueObject
  } else {
    // This should not run when holding a lock because it may cost dozens of seconds for a
    // large value.
    val resultSer = SparkEnv.get.serializer.newInstance()
    valueObject = resultSer.deserialize(valueBytes)
    valueObjectDeserialized = true
    valueObject
  }
}
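The deserialization above is the standard SparkEnv serializer pattern. A small, self-contained sketch of the same round trip outside TaskResult (requires a running SparkContext so SparkEnv.get is populated; the payload is illustrative):

import java.nio.ByteBuffer
import org.apache.spark.SparkEnv

// Serializer instances are cheap to create but not thread-safe, so make one per call site.
val ser = SparkEnv.get.serializer.newInstance()

val payload: Map[String, Long] = Map("records" -> 42L)
val bytes: ByteBuffer = ser.serialize(payload)
val restored: Map[String, Long] = ser.deserialize(bytes)
assert(restored == payload)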
Example 8
Source File: RUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r

import java.io.File

import scala.collection.JavaConversions._

import org.apache.spark.{SparkEnv, SparkException}

private[spark] object RUtils {
  // Local path where R binary packages built from R source code contained in the spark
  // packages specified with "--packages" or "--jars" command line option reside.
  var rPackages: Option[String] = None

  def isRInstalled: Boolean = {
    try {
      val builder = new ProcessBuilder(Seq("R", "--version"))
      builder.start().waitFor() == 0
    } catch {
      case e: Exception => false
    }
  }
}
Example 9
Source File: HashShuffleManagerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import java.io.{File, FileWriter} import scala.language.reflectiveCalls import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.serializer.JavaSerializer import org.apache.spark.shuffle.FileShuffleBlockResolver import org.apache.spark.storage.{ShuffleBlockId, FileSegment} class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext { private val testConf = new SparkConf(false) private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) { assert(buffer.isInstanceOf[FileSegmentManagedBuffer]) val segment = buffer.asInstanceOf[FileSegmentManagedBuffer] assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath) assert(expected.offset === segment.getOffset) assert(expected.length === segment.getLength) } test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") { val conf = new SparkConf(false) // reset after EACH object write. This is to ensure that there are bytes appended after // an object is written. So if the codepaths assume writeObject is end of data, this should // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc. conf.set("spark.serializer.objectStreamReset", "1") conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") sc = new SparkContext("local", "test", conf) val shuffleBlockResolver = SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver] val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf), new ShuffleWriteMetrics) for (writer <- shuffle1.writers) { writer.write("test1", "value") writer.write("test2", "value") } for (writer <- shuffle1.writers) { writer.commitAndClose() } val shuffle1Segment = shuffle1.writers(0).fileSegment() shuffle1.releaseWriters(success = true) val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf), new ShuffleWriteMetrics) for (writer <- shuffle2.writers) { writer.write("test3", "value") writer.write("test4", "vlue") } for (writer <- shuffle2.writers) { writer.commitAndClose() } val shuffle2Segment = shuffle2.writers(0).fileSegment() shuffle2.releaseWriters(success = true) // Now comes the test : // Write to shuffle 3; and close it, but before registering it, check if the file lengths for // previous task (forof shuffle1) is the same as 'segments'. Earlier, we were inferring length // of block based on remaining data in file : which could mess things up when there is // concurrent read and writes happening to the same shuffle group. val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf), new ShuffleWriteMetrics) for (writer <- shuffle3.writers) { writer.write("test3", "value") writer.write("test4", "value") } for (writer <- shuffle3.writers) { writer.commitAndClose() } // check before we register. checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0))) shuffle3.releaseWriters(success = true) checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0))) shuffleBlockResolver.removeShuffle(1) } def writeToFile(file: File, numBytes: Int) { val writer = new FileWriter(file, true) for (i <- 0 until numBytes) writer.write(i) writer.close() } }
Example 10
Source File: BlockManagerSlaveEndpoint.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.rpc.{RpcEnv, RpcCallContext, RpcEndpoint} import org.apache.spark.util.ThreadUtils import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.storage.BlockManagerMessages._ private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends RpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.sender) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 11
Source File: SimrSchedulerBackend.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 12
Source File: TaskResult.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.Map

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// Excerpt: the method below belongs to a TaskResult implementation whose declaration
// is elided in this listing.
def value(): T = {
  if (valueObjectDeserialized) {
    valueObject
  } else {
    // This should not run when holding a lock because it may cost dozens of seconds for a
    // large value.
    val resultSer = SparkEnv.get.serializer.newInstance()
    valueObject = resultSer.deserialize(valueBytes)
    valueObjectDeserialized = true
    valueObject
  }
}
Example 13
Source File: SortShuffleWriter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle} import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockResolver: IndexShuffleBlockResolver, handle: BaseShuffleHandle[K, V, C], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val blockManager = SparkEnv.get.blockManager private var sorter: ExternalSorter[K, V, _] = null // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private var mapStatus: MapStatus = null private val writeMetrics = new ShuffleWriteMetrics() context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics) override def stop(success: Boolean): Option[MapStatus] = { try { if (stopping) { return None } stopping = true if (success) { return Option(mapStatus) } else { // The map task failed, so delete our output data. shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId) return None } } finally { // Clean up our sorter, which may have its own intermediate files if (sorter != null) { val startTime = System.nanoTime() sorter.stop() context.taskMetrics.shuffleWriteMetrics.foreach( _.incShuffleWriteTime(System.nanoTime - startTime)) sorter = null } } } }
Example 14
Source File: ObjectAggregationMap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import java.{util => ju} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.internal.config import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter def dumpToExternalSorter( groupingAttributes: Seq[Attribute], aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = { val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes) val sorter = new UnsafeKVExternalSorter( StructType.fromAttributes(groupingAttributes), StructType.fromAttributes(aggBufferAttributes), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, TaskContext.get().taskMemoryManager().pageSizeBytes, SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD), null ) val mapIterator = iterator val unsafeAggBufferProjection = UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray) while (mapIterator.hasNext) { val entry = mapIterator.next() aggregateFunctions.foreach { case agg: TypedImperativeAggregate[_] => agg.serializeAggregateBufferInPlace(entry.aggregationBuffer) case _ => } sorter.insertKV( entry.groupingKey, unsafeAggBufferProjection(entry.aggregationBuffer) ) } hashMap.clear() sorter } def clear(): Unit = { hashMap.clear() } } // Stores the grouping key and aggregation buffer class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 15
Source File: OTShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.online.joins import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.joins.{BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.hive.online.ComposeRDDFunctions._ import org.apache.spark.sql.hive.online._ import org.apache.spark.storage.{OLABlockId, StorageLevel} case class OTShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan)( @transient val controller: OnlineDataFrame, @transient val trace: List[Int] = -1 :: Nil, opId: OpId = OpId.newOpId) extends BinaryNode with HashJoin with OTStateful { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil def retrieveState(): RDD[HashedRelation] = prevBatch match { case Some(bId) => val numParts = controller.olaBlocks(opId, bId) OLABlockRDD.create[HashedRelation](sparkContext, opId.id, Array((numParts, bId)), numParts) case None => sys.error(s"Unexpected prevBatch = $prevBatch") } override def doExecute() = { prevBatch match { case None => val buildRdd = buildPlan.execute() controller.olaBlocks((opId, currentBatch)) = buildRdd.partitions.length buildRdd.zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) SparkEnv.get.blockManager.putSingle( OLABlockId(opId.id, currentBatch, index), hashed, StorageLevel.MEMORY_AND_DISK) hashJoin(streamIter, hashed) } case Some(_) => retrieveState().zipPartitionsWithIndex(streamedPlan.execute()) { (index, buildIter, streamIter) => val hashed = buildIter.next() hashJoin(streamIter, hashed) } } } override protected final def otherCopyArgs = controller :: trace :: opId :: Nil override def simpleString = s"${super.simpleString} $opId" override def newBatch(newTrace: List[Int]): SparkPlan = OTShuffledHashJoin(leftKeys, rightKeys, buildSide, left, right)(controller, newTrace, opId) }
Example 16
Source File: MemoryTestingUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory

import java.util.Properties

import org.apache.spark.{SparkEnv, TaskContext, TaskContextImpl}

object MemoryTestingUtils {
  def fakeTaskContext(env: SparkEnv): TaskContext = {
    val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0)
    new TaskContextImpl(
      stageId = 0,
      partitionId = 0,
      taskAttemptId = 0,
      attemptNumber = 0,
      taskMemoryManager = taskMemoryManager,
      localProperties = new Properties,
      metricsSystem = env.metricsSystem)
  }
}
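A brief sketch of how a test might use this helper (the test body is illustrative, not from the project; note that TaskContext.setTaskContext and TaskContext.unset are package-private to org.apache.spark, so this only works from code in that package):

// Requires a running local SparkContext so that SparkEnv.get is populated.
val ctx = MemoryTestingUtils.fakeTaskContext(SparkEnv.get)
TaskContext.setTaskContext(ctx)
try {
  // exercise code here that calls TaskContext.get() or allocates task memory
} finally {
  TaskContext.unset()
}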
Example 17
Source File: BlockManagerSlaveEndpoint.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} import org.apache.spark.{MapOutputTracker, SparkEnv} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val user = Utils.getCurrentUserName private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get(user).shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 18
Source File: TaskResult.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// Excerpt: the method below belongs to a TaskResult implementation whose declaration is
// elided in this listing; note the multi-tenant fork's SparkEnv.get(user) accessor.
def value(resultSer: SerializerInstance = null): T = {
  if (valueObjectDeserialized) {
    valueObject
  } else {
    // This should not run when holding a lock because it may cost dozens of seconds for a
    // large value
    val ser = if (resultSer == null) SparkEnv.get(user).serializer.newInstance() else resultSer
    valueObject = ser.deserialize(valueBytes)
    valueObjectDeserialized = true
    valueObject
  }
}
Example 19
Source File: SparkHadoopMapRedUtil.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { private val user = UserGroupInformation.getCurrentUser.getShortUserName def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get(user).conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get(user).outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
Example 20
Source File: RUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r

import java.io.File
import java.util.Arrays

import org.apache.hadoop.security.UserGroupInformation

import org.apache.spark.{SparkEnv, SparkException}

private[spark] object RUtils {
  // Local path where R binary packages built from R source code contained in the spark
  // packages specified with "--packages" or "--jars" command line option reside.
  var rPackages: Option[String] = None

  def isRInstalled: Boolean = {
    try {
      val builder = new ProcessBuilder(Arrays.asList("R", "--version"))
      builder.start().waitFor() == 0
    } catch {
      case e: Exception => false
    }
  }
}
Example 21
Source File: GpuDSArrayMult.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package com.ibm.gpuenabler import org.apache.spark.SparkEnv import org.apache.spark.sql.functions.lit import com.ibm.gpuenabler.CUDADSImplicits._ object GpuDSArrayMult { case class jsonData(name : String, factor: Long, arr: Array[Long]) case class inputData(name : String, factor: Long, arr: Array[Long], result: Array[Long]) case class outputData(name: String, result: Array[Long]) def main(args : Array[String]): Unit = { val ss = org.apache.spark.sql.SparkSession.builder.master("local[*]").appName("test").getOrCreate() import ss.implicits._ if(args.length > 0) { println("Setting debug Mode" + args(0)) SparkEnv.get.conf.set("DebugMode", args(0)) } val ptxURL = "/GpuEnablerExamples.ptx" // 1. Sample Map Operation - multiple every element in the array by 2 val mulFunc = DSCUDAFunction("multiplyBy2", Seq("value"), Seq("value"), ptxURL) val N: Long = 100000 val dataPts = ss.range(1, N+1, 1, 10).cache val results = dataPts.mapExtFunc(_ * 2, mulFunc).collect() println("Count is " + results.length) assert(results.length == N) val expResults = (1 to N.toInt).map(_ * 2) assert(results.sameElements(expResults)) // 2. Sample Reduce Operation - Sum of all elements in the array val dimensions = (size: Long, stage: Int) => stage match { case 0 => (64, 256, 1, 1, 1, 1) case 1 => (1, 1, 1, 1, 1, 1) } val gpuParams = gpuParameters(dimensions) val sumFunc = DSCUDAFunction( "suml", Array("value"), Array("value"), ptxURL, Some((size: Long) => 2), Some(gpuParams), outputSize=Some(1)) val results2 = dataPts .mapExtFunc(_ * 2, mulFunc) .reduceExtFunc(_ + _, sumFunc) println("Output is "+ results2) println("Expected is " + (N * (N + 1))) assert(results2 == N * (N + 1)) // 3. Dataset - GPU Map - Dataset Operation. val ds = ss.read.json("src/main/resources/data.json").as[jsonData] val dds = ds.withColumn("result", lit(null: Array[Double] )).as[inputData] val dsFunc = DSCUDAFunction("arrayTest", Seq("factor", "arr"), Seq("result"), ptxURL) val mapDS = dds.mapExtFunc(x => outputData(x.name, x.result), dsFunc, Array((1 to 10).map(_ * 3).toArray, (1 to 35).map(_.toLong).toArray), outputArraySizes = Array(3)) mapDS.select($"name", $"result").show() } }
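The example toggles a debug flag at runtime by mutating the live configuration through SparkEnv. A minimal sketch of that pattern in isolation (the DebugMode key comes from the example; everything else is illustrative):

import org.apache.spark.SparkEnv
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("test").getOrCreate()

// Write: visible through SparkEnv.get.conf within this JVM (here, the driver),
// but not automatically re-sent to executors that are already running.
SparkEnv.get.conf.set("DebugMode", "true")

// Read back with a default, mirroring how such flags are usually consumed.
val debug = SparkEnv.get.conf.getBoolean("DebugMode", defaultValue = false)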
Example 22
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 23
Source File: LogisticRegression.scala From SparseML with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.sparselr import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap import org.apache.spark.mllib.sparselr.Utils._ import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object LogisticRegression { def train(input: RDD[(Array[Double], Matrix)], optimizer: Optimizer ): (Array[Int], Array[Double]) = { val hdfsIndex2global = new Int2IntOpenHashMap() var index = 0 input.map { point => point._2 match { case x: CompressedSparseMatrix => println("x.length" + x.mappings.length) case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.count val global2hdfsIndex = input.map { point => point._2 match { case x: CompressedSparseMatrix => x.mappings case _ => throw new IllegalArgumentException(s"dot doesn't support ${input.getClass}.") } }.collect().flatMap(t => t).distinct global2hdfsIndex.foreach{value => hdfsIndex2global.put(value, index) index += 1 } val bcHdfsIndex2global = input.context.broadcast(hdfsIndex2global) val examples = input.map(global2globalMapping(bcHdfsIndex2global)).cache() val numTraining = examples.count() println(s"Training: $numTraining.") SparkEnv.get.blockManager.removeBroadcast(bcHdfsIndex2global.id, true) val examplesTest = examples.mapPartitions(_.flatMap { case (y, part) => part.asInstanceOf[CompressedSparseMatrix].tupletIterator(y)}) val weights = Vectors.dense(new Array[Double](global2hdfsIndex.size)) val newWeights = optimizer.optimize(examplesTest, weights) ((global2hdfsIndex, newWeights.toArray)) } //globalId to localId for mappings in Matrix def global2globalMapping(bchdfsIndex2global: Broadcast[Int2IntOpenHashMap]) (partition: (Array[Double], Matrix)): (Array[Double], Matrix) = { val hdfsIndex2global = bchdfsIndex2global.value partition._2 match { case x: CompressedSparseMatrix => val local2hdfsIndex = x.mappings for (i <- 0 until local2hdfsIndex.length) { local2hdfsIndex(i) = hdfsIndex2global.get(local2hdfsIndex(i)) } case _ => throw new IllegalArgumentException(s"dot doesn't support ${partition.getClass}.") } partition } }
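LogisticRegression releases its broadcast by calling SparkEnv.get.blockManager.removeBroadcast directly. For reference, a hedged sketch of the public Broadcast API usually used for the same purpose (its semantics differ slightly from the low-level block-manager call):

import org.apache.spark.broadcast.Broadcast

def releaseBroadcast[T](bc: Broadcast[T]): Unit = {
  // Drop cached copies on the executors; the value can still be re-broadcast lazily.
  bc.unpersist(blocking = true)
  // Or drop every copy, including the driver's; the broadcast is unusable afterwards.
  // bc.destroy()
}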
Example 24
Source File: PrometheusSink.scala From spark-metrics with Apache License 2.0 | 5 votes |
package org.apache.spark.banzaicloud.metrics.sink import java.net.URL import java.util.Properties import com.banzaicloud.spark.metrics.sink.PrometheusSink.SinkConfig import com.codahale.metrics.MetricRegistry import io.prometheus.client.exporter.PushGateway import org.apache.spark.banzaicloud.metrics.sink.PrometheusSink.SinkConfigProxy import org.apache.spark.internal.config import org.apache.spark.metrics.sink.Sink import org.apache.spark.{SecurityManager, SparkConf, SparkEnv} object PrometheusSink { class SinkConfigProxy extends SinkConfig { // SparkEnv may become available only after metrics sink creation thus retrieving // SparkConf from spark env here and not during the creation/initialisation of PrometheusSink. @transient private lazy val sparkConfig = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf(true)) // Don't use sparkConf.getOption("spark.metrics.namespace") as the underlying string won't be substituted. def metricsNamespace: Option[String] = sparkConfig.get(config.METRICS_NAMESPACE) def sparkAppId: Option[String] = sparkConfig.getOption("spark.app.id") def sparkAppName: Option[String] = sparkConfig.getOption("spark.app.name") def executorId: Option[String] = sparkConfig.getOption("spark.executor.id") } } class PrometheusSink(property: Properties, registry: MetricRegistry, securityMgr: SecurityManager, sinkConfig: SinkConfig, pushGatewayBuilder: URL => PushGateway) extends com.banzaicloud.spark.metrics.sink.PrometheusSink(property, registry, sinkConfig, pushGatewayBuilder) with Sink { // Constructor required by MetricsSystem::registerSinks() def this(property: Properties, registry: MetricRegistry, securityMgr: SecurityManager) = { this( property, registry, securityMgr, new SinkConfigProxy, new PushGateway(_) ) } }
Example 25
Source File: TaskResult.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.Map
import scala.collection.mutable

import org.apache.spark.SparkEnv
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockId
import org.apache.spark.util.Utils

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// Excerpt: the method below belongs to a TaskResult implementation whose declaration
// is elided in this listing.
def value(): T = {
  if (valueObjectDeserialized) {
    valueObject
  } else {
    // This should not run when holding a lock because it may cost dozens of seconds for a
    // large value.
    val resultSer = SparkEnv.get.serializer.newInstance()
    valueObject = resultSer.deserialize(valueBytes)
    valueObjectDeserialized = true
    valueObject
  }
}
Example 26
Source File: MapDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object MapDPartition {
  def sortBasedShuffleOn: Boolean =
    SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply[T](origin: RDD[(Int, (T, InternalRow))],
               num_partitions: Int): RDD[(Int, (T, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, (row._2._1, row._2._2.copy())))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (T, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new MapDPartitioner(num_partitions)
    new ShuffledRDD[Int, (T, InternalRow), (T, InternalRow)](rdd, part)
  }
}

class MapDPartitioner(num_partitions: Int) extends Partitioner {
  def numPartitions: Int = num_partitions

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    require(k >= 0 && k < num_partitions)
    k
  }
}
Example 27
Source File: RangeDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner import org.apache.spark.util.CollectionsUtils import org.apache.spark.{Partitioner, SparkEnv} import org.apache.spark.rdd.{RDD, ShuffledRDD} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.util.MutablePair import scala.reflect.ClassTag object RangeDPartition { def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager] def apply[K: Ordering: ClassTag, T](origin: RDD[(K, (T, InternalRow))], range_bounds: Array[K]): RDD[(K, (T, InternalRow))] = { val rdd = if (sortBasedShuffleOn) { origin.mapPartitions {iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy())))} } else { origin.mapPartitions {iter => val mutablePair = new MutablePair[K, (T, InternalRow)]() iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy()))) } } val part = new RangeDPartitioner(range_bounds, ascending = true) new ShuffledRDD[K, (T, InternalRow), (T, InternalRow)](rdd, part) } } class RangeDPartitioner[K: Ordering: ClassTag](range_bounds: Array[K], ascending: Boolean) extends Partitioner { def numPartitions: Int = range_bounds.length + 1 private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K] def getPartition(key: Any): Int = { val k = key.asInstanceOf[K] var partition = 0 if (range_bounds.length < 128) { while (partition < range_bounds.length && Ordering[K].gt(k, range_bounds(partition))) partition += 1 } else { partition = binarySearch(range_bounds, k) if (partition < 0) partition = -partition - 1 if (partition > range_bounds.length) partition = range_bounds.length } if (ascending) partition else range_bounds.length - partition } }
Example 28
Source File: VoronoiPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object VoronoiPartition {
  def sortBasedShuffleOn: Boolean =
    SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Int, (Point, InternalRow))], pivot_to_group: Array[Int],
            num_group: Int): RDD[(Int, (Point, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, (row._2._1, row._2._2.copy())))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (Point, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new VoronoiPartitioner(pivot_to_group, num_group)
    new ShuffledRDD[Int, (Point, InternalRow), (Point, InternalRow)](rdd, part)
  }
}

class VoronoiPartitioner(pivot_to_group: Array[Int], num_group: Int) extends Partitioner {
  override def numPartitions: Int = num_group

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    pivot_to_group(k)
  }
}
Example 29
Source File: HashPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object HashPartition {
  def sortBasedShuffleOn: Boolean =
    SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Any, InternalRow)], num_partitions: Int): RDD[(Any, InternalRow)] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter => iter.map(row => (row._1, row._2.copy())) }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Any, InternalRow]()
        iter.map(row => mutablePair.update(row._1, row._2.copy()))
      }
    }

    val part = new HashPartitioner(num_partitions)
    new ShuffledRDD[Any, InternalRow, InternalRow](rdd, part)
  }
}

class HashPartitioner(num_partitions: Int) extends Partitioner {
  override def numPartitions: Int = num_partitions

  override def getPartition(key: Any): Int = {
    key.hashCode() % num_partitions
  }
}
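One subtlety in getPartition above: key.hashCode() can be negative, which makes the modulo negative as well. Spark's built-in org.apache.spark.HashPartitioner guards against this with a non-negative modulo; a minimal sketch of the same guard:

// Equivalent of org.apache.spark.util.Utils.nonNegativeMod, written out for clarity.
def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0)
}

// Drop-in replacement for the computation above:
// def getPartition(key: Any): Int = nonNegativeMod(key.hashCode(), num_partitions)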
Example 30
Source File: CrailBroadcast.scala From crail-spark-io with Apache License 2.0 | 5 votes |
package org.apache.spark.broadcast import java.io._ import org.apache.spark.storage._ import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable import scala.reflect.ClassTag import scala.util.control.NonFatal private[spark] class CrailBroadcast[T: ClassTag](obj: T, id: Long) extends Broadcast[T](id) with Serializable { @transient private lazy val _value: T = readBroadcastBlock() private val broadcastId = BroadcastBlockId(id) writeBlocks(obj) override protected def getValue() = { _value } override protected def doUnpersist(blocking: Boolean): Unit = { logWarning(" called doUnpersist on broadcastId: " + id + " (NYI)") } override protected def doDestroy(blocking: Boolean): Unit = { val obj = x.asInstanceOf[T] if(CrailBroadcast.useLocalCache) { CrailBroadcast.broadcastCache(id) = Some(x) } else { SparkEnv.get.blockManager.putSingle(broadcastId, obj, StorageLevel.MEMORY_ONLY, tellMaster = false) } obj case None => throw new SparkException(s"Failed to get broadcast " + broadcastId) } } } } } private object CrailBroadcast { //FIXME: (atr) I am not completely sure about if this gives us the best performance. val broadcastCache:mutable.HashMap[Long, Option[Any]] = new mutable.HashMap[Long, Option[Any]] private val useLocalCache = false def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = { this.synchronized { if(useLocalCache) { broadcastCache.remove(id) } else { SparkEnv.get.blockManager.master.removeBroadcast(id, removeFromDriver, blocking) SparkEnv.get.blockManager.removeBroadcast(id, false) } } } def cleanCache(): Unit = { this.synchronized { broadcastCache.clear() } } } object Utils { def tryOrIOException[T](block: => T): T = { try { block } catch { case e: IOException => throw e case NonFatal(e) => throw new IOException(e) } } }
Example 31
Source File: HBaseTestSource.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf

import org.apache.spark.SparkEnv
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types._

class HBaseTestSource extends RelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    DummyScan(
      parameters("cacheSize").toInt,
      parameters("batchNum").toInt,
      parameters("blockCacheingEnable").toBoolean,
      parameters("rowNum").toInt)(sqlContext)
  }
}

case class DummyScan(
    cacheSize: Int,
    batchNum: Int,
    blockCachingEnable: Boolean,
    rowNum: Int)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  private def sparkConf = SparkEnv.get.conf

  override def schema: StructType =
    StructType(StructField("i", IntegerType, nullable = false) :: Nil)

  override def buildScan(): RDD[Row] = sqlContext.sparkContext.parallelize(0 until rowNum)
    .map(Row(_))
    .map { x =>
      if (sparkConf.getInt(HBaseSparkConf.QUERY_BATCHSIZE, -1) != batchNum ||
          sparkConf.getInt(HBaseSparkConf.QUERY_CACHEDROWS, -1) != cacheSize ||
          sparkConf.getBoolean(HBaseSparkConf.QUERY_CACHEBLOCKS, false) != blockCachingEnable) {
        throw new Exception("HBase Spark configuration cannot be set properly")
      }
      x
    }
}
Example 32
Source File: CachedPulsarClient.scala From pulsar-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.pulsar import java.{util => ju} import java.util.concurrent.{ConcurrentMap, ExecutionException, TimeUnit} import scala.collection.JavaConverters._ import scala.util.control.NonFatal import com.google.common.cache._ import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging private[pulsar] object CachedPulsarClient extends Logging { private type Client = org.apache.pulsar.client.api.PulsarClient private val defaultCacheExpireTimeout = TimeUnit.MINUTES.toMillis(10) private lazy val cacheExpireTimeout: Long = Option(SparkEnv.get) .map(_.conf .getTimeAsMs("spark.pulsar.client.cache.timeout", s"${defaultCacheExpireTimeout}ms")) .getOrElse(defaultCacheExpireTimeout) private val cacheLoader = new CacheLoader[Seq[(String, Object)], Client] { override def load(config: Seq[(String, Object)]): Client = { val configMap = config.map(x => x._1 -> x._2).toMap.asJava createPulsarClient(configMap) } } private val removalListener = new RemovalListener[Seq[(String, Object)], Client]() { override def onRemoval( notification: RemovalNotification[Seq[(String, Object)], Client]): Unit = { val paramsSeq: Seq[(String, Object)] = notification.getKey val client: Client = notification.getValue logDebug( s"Evicting pulsar producer $client params: $paramsSeq, due to ${notification.getCause}") close(paramsSeq, client) } } private lazy val guavaCache: LoadingCache[Seq[(String, Object)], Client] = CacheBuilder .newBuilder() .expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS) .removalListener(removalListener) .build[Seq[(String, Object)], Client](cacheLoader) private def createPulsarClient(pulsarConf: ju.Map[String, Object]): Client = { val pulsarServiceUrl = pulsarConf.get(PulsarOptions.SERVICE_URL_OPTION_KEY).asInstanceOf[String] val clientConf = new PulsarConfigUpdater( "pulsarClientCache", pulsarConf.asScala.toMap, PulsarOptions.FILTERED_KEYS ).rebuild() logInfo(s"Client Conf = ${clientConf}") try { val pulsarClient: Client = org.apache.pulsar.client.api.PulsarClient .builder() .serviceUrl(pulsarServiceUrl) .loadConf(clientConf) .build(); logDebug( s"Created a new instance of PulsarClient for serviceUrl = $pulsarServiceUrl," + s" clientConf = $clientConf.") pulsarClient } catch { case e: Throwable => logError( s"Failed to create PulsarClient to serviceUrl ${pulsarServiceUrl}" + s" using client conf ${clientConf}", e) throw e } } private def close(paramsSeq: Seq[(String, Object)], client: Client): Unit = { try { logInfo(s"Closing the Pulsar Client with params: ${paramsSeq.mkString("\n")}.") client.close() } catch { case NonFatal(e) => logWarning("Error while closing pulsar producer.", e) } } private[pulsar] def clear(): Unit = { logInfo("Cleaning up guava cache.") guavaCache.invalidateAll() } // Intended for testing purpose only. private def getAsMap: ConcurrentMap[Seq[(String, Object)], Client] = guavaCache.asMap() }
Example 33
Source File: MemoryTestingUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.memory

import org.apache.spark.{SparkEnv, TaskContextImpl, TaskContext}

object MemoryTestingUtils {
  def fakeTaskContext(env: SparkEnv): TaskContext = {
    val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0)
    new TaskContextImpl(
      stageId = 0,
      partitionId = 0,
      taskAttemptId = 0,
      attemptNumber = 0,
      taskMemoryManager = taskMemoryManager,
      metricsSystem = env.metricsSystem,
      internalAccumulators = Seq.empty)
  }
}
Example 34
Source File: BlockManagerSlaveEndpoint.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
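The doAsync helper above is the interesting part: the slow block-removal work runs inside a Future on a dedicated thread pool, and the RPC reply is sent from the future's callbacks. A standalone sketch of that callback wiring follows, with println standing in for logging and context.reply; note that the Spark 2.3 copy of this endpoint (Example 43 below) uses future.foreach and future.failed.foreach instead of the onSuccess/onFailure callbacks used here, which were deprecated in Scala 2.12.

import java.util.concurrent.Executors
import scala.concurrent.{ExecutionContext, Future}

object AsyncReplySketch {
  private val pool = Executors.newCachedThreadPool()
  private implicit val ec: ExecutionContext = ExecutionContext.fromExecutorService(pool)

  // Run the (potentially slow) body off the caller's thread and report the outcome.
  def doAsync[T](actionMessage: String)(body: => T): Unit = {
    val future = Future {
      println(s"Starting: $actionMessage")
      body
    }
    future.foreach { response => println(s"Done $actionMessage, response is $response") }
    future.failed.foreach { t => println(s"Error in $actionMessage: ${t.getMessage}") }
  }

  def main(args: Array[String]): Unit = {
    doAsync("removing block rdd_0_0") { true }
    doAsync("removing shuffle 7") { throw new IllegalStateException("no such shuffle") }
    Thread.sleep(500) // give the callbacks time to fire before shutting the pool down
    pool.shutdown()
  }
}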
Example 35
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
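The start() method relies on a write-then-rename trick: the driver URL is written to a temporary path and then renamed onto the real path, so executors polling for the file never read a partially written one. Below is a local-filesystem sketch of the same idea using java.nio; the real code goes through Hadoop's FileSystem API, and the paths and contents here are made up for illustration.

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths, StandardCopyOption}

object AtomicPublishSketch {
  // Write to a side file first, then rename into place, so a reader polling for the
  // target path never observes a half-written file.
  def publish(targetPath: String, lines: Seq[String]): Unit = {
    val tmp = Paths.get(targetPath + "_tmp")
    val dst = Paths.get(targetPath)
    Files.write(tmp, lines.mkString("\n").getBytes(StandardCharsets.UTF_8))
    Files.move(tmp, dst, StandardCopyOption.ATOMIC_MOVE)
  }

  def main(args: Array[String]): Unit = {
    val target = Files.createTempDirectory("simr-demo").resolve("driverurl").toString
    publish(target, Seq("spark://driver:7077", "16", "http://driver:4040"))
    println(Files.readAllLines(Paths.get(target)))
  }
}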
Example 36
Source File: DistributedCountRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import java.util.concurrent.Executors import scala.collection.JavaConverters._ import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future} import scala.concurrent.duration.Duration import org.apache.hadoop.mapred.TaskAttemptID import org.apache.hadoop.mapreduce.{InputSplit, TaskType} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager} import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory} import org.apache.carbondata.spark.rdd.CarbonRDD class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat) extends CarbonRDD[(String, String)](ss, Nil) { @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD] .getName) override protected def getPreferredLocations(split: Partition): Seq[String] = { if (split.asInstanceOf[IndexRDDPartition].getLocations != null) { split.asInstanceOf[IndexRDDPartition].getLocations.toSeq } else { Seq() } } override def internalCompute(split: Partition, context: TaskContext): Iterator[(String, String)] = { val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId, id, TaskType.MAP, split.index, 0) val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId) val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning val service = Executors .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true)) implicit val ec: ExecutionContextExecutor = ExecutionContext .fromExecutor(service) if (indexInputFormat.ifAsyncCall()) { // to clear cache of invalid segments during pre-priming in index server IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable, indexInputFormat.getInvalidSegments) } val futures = if (inputSplits.length <= numOfThreads) { inputSplits.map { split => generateFuture(Seq(split)) } } else { DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map { splits => generateFuture(splits) } } // scalastyle:off awaitresult val results = Await.result(Future.sequence(futures), Duration.Inf).flatten // scalastyle:on awaitresult val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) { CacheProvider.getInstance().getCarbonCache.getCurrentSize } else { 0L } Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString)) } override protected def internalGetPartitions: Array[Partition] = { new DistributedPruneRDD(ss, indexInputFormat).partitions } private def generateFuture(split: Seq[InputSplit]) (implicit executionContext: ExecutionContext) = { Future { val segments = split.map { inputSplit => val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper] distributable.getDistributable.getSegment .setReadCommittedScope(indexInputFormat.getReadCommittedScope) distributable.getDistributable.getSegment } val defaultIndex = 
IndexStoreManager.getInstance .getIndex(indexInputFormat.getCarbonTable, split.head .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema) defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat .getPartitions).asScala } } }
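internalCompute fans the input splits out over a fixed-size thread pool: either one Future per split, or the splits grouped so the number of futures matches the pool size, after which Future.sequence plus Await.result collects everything. The sketch below reproduces only that scheduling skeleton, with plain integers in place of index splits and a summation in place of the pruning work.

import java.util.concurrent.Executors
import scala.concurrent.{Await, ExecutionContext, Future}
import scala.concurrent.duration.Duration

object GroupedFuturesSketch {
  def main(args: Array[String]): Unit = {
    val numThreads = 4
    val pool = Executors.newFixedThreadPool(numThreads)
    implicit val ec: ExecutionContext = ExecutionContext.fromExecutor(pool)

    val splits = (1 to 10).toSeq
    // One future per split if there are few of them, otherwise group the splits so the
    // number of futures matches the pool size, as the RDD above does with groupSplits.
    val futures =
      if (splits.length <= numThreads) splits.map(s => Future(Seq(s)))
      else splits.grouped((splits.length + numThreads - 1) / numThreads).toSeq.map(g => Future(g))

    val results = Await.result(Future.sequence(futures), Duration.Inf).flatten
    println(s"processed ${results.length} splits, sum = ${results.sum}")
    pool.shutdown()
  }
}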
Example 37
Source File: RUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 38
Source File: SparkSqlSerializer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import java.nio.ByteBuffer import java.util.{HashMap => JavaHashMap} import scala.reflect.ClassTag import com.esotericsoftware.kryo.io.{Input, Output} import com.esotericsoftware.kryo.{Kryo, Serializer} import com.twitter.chill.ResourcePool import org.apache.spark.serializer.{KryoSerializer, SerializerInstance} import org.apache.spark.sql.types.{Decimal, StructField, StructType} import org.apache.spark.util.MutablePair import org.apache.spark.{SparkConf, SparkEnv} //private[sql] class SparkSqlSerializer(conf: SparkConf) extends KryoSerializer(conf) { override def newKryo(): Kryo = { val kryo = super.newKryo() kryo.setRegistrationRequired(false) kryo.register(classOf[MutablePair[_, _]]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericMutableRow]) kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer) kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer) kryo.register(classOf[Decimal]) kryo.register(classOf[JavaHashMap[_, _]]) // APS kryo.register(classOf[StructType]) kryo.register(classOf[StructField]) kryo.setReferences(false) kryo } } private[execution] class KryoResourcePool(size: Int) extends ResourcePool[SerializerInstance](size) { val ser: SparkSqlSerializer = { val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) new SparkSqlSerializer(sparkConf) } def newInstance(): SerializerInstance = ser.newInstance() } //private[sql] object SparkSqlSerializer { @transient lazy val resourcePool = new KryoResourcePool(30) private[this] def acquireRelease[O](fn: SerializerInstance => O): O = { val kryo = resourcePool.borrow try { fn(kryo) } finally { resourcePool.release(kryo) } } def serialize[T: ClassTag](o: T): Array[Byte] = acquireRelease { k => k.serialize(o).array() } def deserialize[T: ClassTag](bytes: Array[Byte]): T = acquireRelease { k => k.deserialize[T](ByteBuffer.wrap(bytes)) } } private[sql] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] { def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) { // TODO: There are probably more efficient representations than strings... output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = { new java.math.BigDecimal(input.readString()) } } private[sql] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] { def write(kryo: Kryo, output: Output, bd: BigDecimal) { // TODO: There are probably more efficient representations than strings... output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = { new java.math.BigDecimal(input.readString()) } }
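KryoResourcePool and acquireRelease implement a small borrow/release pool so serializer instances are reused rather than rebuilt on every call. A generic version of that pattern, backed by an ArrayBlockingQueue and demonstrated with a non-thread-safe SimpleDateFormat, is sketched below; the class and method names are invented for the example.

import java.util.concurrent.ArrayBlockingQueue

// A stripped-down borrow/release pool in the spirit of KryoResourcePool above.
class SimpleResourcePool[T](size: Int)(create: () => T) {
  private val queue = new ArrayBlockingQueue[T](size)
  (1 to size).foreach(_ => queue.offer(create()))

  def withResource[O](fn: T => O): O = {
    val resource = queue.take() // blocks when every instance is currently borrowed
    try fn(resource) finally queue.put(resource)
  }
}

object SimpleResourcePoolSketch {
  def main(args: Array[String]): Unit = {
    val pool = new SimpleResourcePool[java.text.SimpleDateFormat](2)(
      () => new java.text.SimpleDateFormat("yyyy-MM-dd"))
    val formatted = pool.withResource(_.format(new java.util.Date()))
    println(formatted)
  }
}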
Example 39
Source File: Sort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} import org.apache.spark.sql.execution.metric.SQLMetrics case class Sort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends UnaryNode { override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil override private[sql] lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) protected override def doExecute(): RDD[InternalRow] = { val schema = child.schema val childOutput = child.output val dataSize = longMetric("dataSize") val spillSize = longMetric("spillSize") child.execute().mapPartitionsInternal { iter => val ordering = newOrdering(sortOrder, childOutput) // The comparator for comparing prefix val boundSortExpression = BindReferences.bindReference(sortOrder.head, childOutput) val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression) // The generator for prefix val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression))) val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = { prefixProjection.apply(row).getLong(0) } } val pageSize = SparkEnv.get.memoryManager.pageSizeBytes val sorter = new UnsafeExternalRowSorter( schema, ordering, prefixComparator, prefixComputer, pageSize) if (testSpillFrequency > 0) { sorter.setTestSpillFrequency(testSpillFrequency) } // Remember spill data size of this task before execute this operator so that we can // figure out how many bytes we spilled for this operator. val spillSizeBefore = TaskContext.get().taskMetrics().memoryBytesSpilled val sortedIterator = sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]]) dataSize += sorter.getPeakMemoryUsage spillSize += TaskContext.get().taskMetrics().memoryBytesSpilled - spillSizeBefore TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.getPeakMemoryUsage) sortedIterator } } }
Example 40
Source File: StarryClosureCleaner.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.util import org.apache.spark.internal.Logging import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable object StarryClosureCleaner extends Logging { val serializableMap: LRUCache[String, Boolean] = new LRUCache[String, Boolean](100000) // Check whether a class represents a Scala closure private def isClosure(cls: Class[_]): Boolean = { cls.getName.contains("$anonfun$") } def clean( closure: AnyRef, checkSerializable: Boolean = true, cleanTransitively: Boolean = true): Unit = { clean(closure, checkSerializable, cleanTransitively, mutable.Map.empty) } private def clean( func: AnyRef, checkSerializable: Boolean, cleanTransitively: Boolean, accessedFields: mutable.Map[Class[_], mutable.Set[String]]): Unit = { if (!isClosure(func.getClass)) { logWarning("Expected a closure; got " + func.getClass.getName) return } if (func == null) { return } if (checkSerializable) { ensureSerializable(func) } } private def ensureSerializable(func: AnyRef) { if (!serializableMap.containsKey(func.getClass.getCanonicalName)) { try { if (SparkEnv.get != null) { SparkEnv.get.closureSerializer.newInstance().serialize(func) serializableMap.put(func.getClass.getCanonicalName, true) } } catch { case ex: Exception => throw new SparkException("Task not serializable", ex) } } } case class LRUCache[K, V](cacheSize: Int) extends util.LinkedHashMap[K, V] { override def removeEldestEntry(eldest: util.Map.Entry[K, V]): Boolean = size > cacheSize } }
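The LRUCache at the bottom is a java.util.LinkedHashMap whose removeEldestEntry hook caps the size; with the default constructor used above it evicts in insertion order. The sketch below keeps the same hook but passes accessOrder = true to the three-argument constructor, which turns it into a genuine least-recently-used cache.

import java.util

// Same removeEldestEntry trick as StarryClosureCleaner.LRUCache, but evicting by
// least-recent access instead of insertion order.
class BoundedLruCache[K, V](cacheSize: Int)
  extends util.LinkedHashMap[K, V](16, 0.75f, true) {
  override def removeEldestEntry(eldest: util.Map.Entry[K, V]): Boolean = size > cacheSize
}

object BoundedLruCacheSketch {
  def main(args: Array[String]): Unit = {
    val cache = new BoundedLruCache[String, Boolean](2)
    cache.put("A", true)
    cache.put("B", true)
    cache.get("A")        // touch "A" so "B" becomes the eldest entry
    cache.put("C", true)  // evicts "B"
    println(cache.keySet()) // [A, C]
  }
}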
Example 41
Source File: FakeTask.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.Properties import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.executor.TaskMetrics class FakeTask( stageId: Int, partitionId: Int, prefLocs: Seq[TaskLocation] = Nil, serializedTaskMetrics: Array[Byte] = SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) extends Task[Int](stageId, 0, partitionId, new Properties, serializedTaskMetrics) { override def runTask(context: TaskContext): Int = 0 override def preferredLocations: Seq[TaskLocation] = prefLocs } object FakeTask { def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*) } def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*) } def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil) } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } def createShuffleMapTaskSet( numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new ShuffleMapTask(stageId, stageAttemptId, null, new Partition { override def index: Int = i }, prefLocs(i), new Properties, SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } }
Example 42
Source File: MemoryTestingUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.memory

import java.util.Properties

import org.apache.spark.{SparkEnv, TaskContext, TaskContextImpl}

object MemoryTestingUtils {
  def fakeTaskContext(env: SparkEnv): TaskContext = {
    val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0)
    new TaskContextImpl(
      stageId = 0,
      stageAttemptNumber = 0,
      partitionId = 0,
      taskAttemptId = 0,
      attemptNumber = 0,
      taskMemoryManager = taskMemoryManager,
      localProperties = new Properties,
      metricsSystem = env.metricsSystem)
  }
}
Example 43
Source File: BlockManagerSlaveEndpoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) case ReplicateBlock(blockId, replicas, maxReplicas) => context.reply(blockManager.replicateBlock(blockId, replicas.toSet, maxReplicas)) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.foreach { response => logDebug(s"Done $actionMessage, response is $response") context.reply(response) logDebug(s"Sent response: $response to ${context.senderAddress}") } future.failed.foreach { t => logError(s"Error in $actionMessage", t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 44
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
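value() deserializes valueBytes at most once and returns the cached object on every later call, which matters because the result can be large and the method may be invoked repeatedly. The standalone sketch below shows the same deserialize-once-and-cache shape using plain Java serialization rather than Spark's SerializerInstance.

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// Deserialize on first access, then serve the cached value.
class LazyValue[T](bytes: Array[Byte]) {
  private var deserialized = false
  private var cached: T = _

  def value: T = {
    if (!deserialized) {
      val in = new ObjectInputStream(new ByteArrayInputStream(bytes))
      try cached = in.readObject().asInstanceOf[T] finally in.close()
      deserialized = true
    }
    cached
  }
}

object LazyValueSketch {
  def main(args: Array[String]): Unit = {
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    out.writeObject(Vector(1, 2, 3))
    out.close()

    val result = new LazyValue[Vector[Int]](buffer.toByteArray)
    println(result.value) // deserialized here
    println(result.value) // served from the cached field
  }
}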
Example 45
Source File: SparkHadoopMapRedUtil.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val stageId = TaskContext.get().stageId() val canCommit = outputCommitCoordinator.canCommit(stageId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, stageId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
Example 46
Source File: RUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 47
Source File: HBasePartitioner.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.serializer.JavaSerializer import org.apache.spark.util.{CollectionsUtils, Utils} import org.apache.spark.{Partitioner, SparkEnv} object HBasePartitioner { implicit object HBaseRawOrdering extends Ordering[HBaseRawType] { def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b) } } class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner { import HBasePartitioner.HBaseRawOrdering type t = HBaseRawType lazy private val len = splitKeys.length // For pre-split table splitKeys(0) = bytes[0], to remove it, // otherwise partition 0 always be empty and // we will miss the last region's date when bulk load lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail def numPartitions = if (len == 0) 1 else len @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t] def getPartition(key: Any): Int = { val k = key.asInstanceOf[t] var partition = 0 if (len <= 128 && len > 0) { // If we have less than 128 partitions naive search val ordering = implicitly[Ordering[t]] while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) { partition += 1 } } else { // Determine which binary search method to use only once. partition = binarySearch(realSplitKeys, k) // binarySearch either returns the match location or -[insertion point]-1 if (partition < 0) { partition = -partition - 1 } if (partition > realSplitKeys.length) { partition = realSplitKeys.length } } partition } override def equals(other: Any): Boolean = other match { case r: HBasePartitioner => r.splitKeys.sameElements(splitKeys) case _ => false } override def hashCode(): Int = { val prime = 31 var result = 1 var i = 0 while (i < splitKeys.length) { result = prime * result + splitKeys(i).hashCode i += 1 } result = prime * result result } }
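getPartition switches between a linear scan and a binary search over the sorted split keys, and has to translate Java's binarySearch convention of returning -(insertion point) - 1 on a miss. The simplified range partitioner below shows that translation with Int keys instead of HBase row keys; the class name and values are invented for the example.

import java.util.Arrays

// A simplified range partitioner showing the binarySearch bookkeeping used above.
class IntRangePartitioner(splitKeys: Array[Int]) {
  def numPartitions: Int = math.max(splitKeys.length, 1)

  def getPartition(key: Int): Int = {
    var partition = Arrays.binarySearch(splitKeys, key)
    // binarySearch returns the match index, or -(insertion point) - 1 when the key is absent.
    if (partition < 0) partition = -partition - 1
    math.min(partition, splitKeys.length - 1)
  }
}

object IntRangePartitionerSketch {
  def main(args: Array[String]): Unit = {
    val partitioner = new IntRangePartitioner(Array(10, 20, 30))
    println(Seq(5, 10, 15, 35).map(partitioner.getPartition)) // List(0, 0, 1, 2)
  }
}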
Example 48
Source File: OapIndexCommitProtocol.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.OutputCommitter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.oap.OapConf @transient private var committer: OapIndexFileOutputCommitter = _ override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { val algorithmVersion = SparkEnv.get.conf.get(OapConf.OAP_INDEXFILEOUTPUTCOMMITTER_ALGORITHM_VERSION) val tempDirName = s"_temporary_$jobId" committer = new OapIndexFileOutputCommitter( new Path(path), context, tempDirName, algorithmVersion) logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { val filename = getFilename(taskContext, ext) val stagingDir = new Path(Option(committer.getWorkPath).map(_.toString).getOrElse(path)) dir.map { d => new Path(new Path(stagingDir, d), filename).toString }.getOrElse { new Path(stagingDir, filename).toString } } override protected def getFilename(taskContext: TaskAttemptContext, ext: String): String = { // The file name looks like part-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003-c000.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId f"part-$split%05d-$jobId$ext" } }
Example 49
Source File: ColumnarSortExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized._ import java.util.concurrent.TimeUnit._ import org.apache.spark.{SparkEnv, TaskContext, SparkContext} import org.apache.spark.executor.TaskMetrics import org.apache.spark.sql.execution._ import org.apache.spark.sql.catalyst.expressions.SortOrder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.rdd.RDD import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} class ColumnarSortExec( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends SortExec(sortOrder, global, child, testSpillFrequency) { override def supportsColumnar = true // Disable code generation override def supportCodegen: Boolean = false override lazy val metrics = Map( "totalSortTime" -> SQLMetrics .createTimingMetric(sparkContext, "time in sort + shuffle process"), "sortTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in sort process"), "shuffleTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in shuffle process"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches")) override def doExecuteColumnar(): RDD[ColumnarBatch] = { val elapse = longMetric("totalSortTime") val sortTime = longMetric("sortTime") val shuffleTime = longMetric("shuffleTime") val numOutputRows = longMetric("numOutputRows") val numOutputBatches = longMetric("numOutputBatches") child.executeColumnar().mapPartitions { iter => val hasInput = iter.hasNext val res = if (!hasInput) { Iterator.empty } else { val sorter = ColumnarSorter.create( sortOrder, true, child.output, sortTime, numOutputBatches, numOutputRows, shuffleTime, elapse) TaskContext .get() .addTaskCompletionListener[Unit](_ => { sorter.close() }) new CloseableColumnBatchIterator(sorter.createColumnarIterator(iter)) } res } } }
Example 50
Source File: SplashShuffleReader.scala From splash with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockId import org.apache.spark.{InterruptibleIterator, MapOutputTracker, SparkEnv, TaskContext} private[spark] class SplashShuffleReader[K, V]( resolver: SplashShuffleBlockResolver, handle: BaseShuffleHandle[K, _, V], startPartition: Int, endPartition: Int, context: TaskContext, mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) extends ShuffleReader[K, V] with Logging { private val dep = handle.dependency private type Pair = (Any, Any) private type KCPair = (K, V) private type KCIterator = Iterator[KCPair] override def read(): KCIterator = { val shuffleBlocks = mapOutputTracker.getMapSizesByExecutorId( handle.shuffleId, startPartition, endPartition) .flatMap(_._2) readShuffleBlocks(shuffleBlocks) } def readShuffleBlocks(shuffleBlocks: Seq[(BlockId, Long)]): KCIterator = readShuffleBlocks(shuffleBlocks.iterator) def readShuffleBlocks(shuffleBlocks: Iterator[(BlockId, Long)]): KCIterator = { val taskMetrics = context.taskMetrics() val serializer = SplashSerializer(dep) val nonEmptyBlocks = shuffleBlocks.filter(_._2 > 0).map(_._1) val fetcherIterator = SplashShuffleFetcherIterator(resolver, nonEmptyBlocks) def getAggregatedIterator(iterator: Iterator[Pair]): KCIterator = { dep.aggregator match { case Some(agg) => val aggregator = new SplashAggregator(agg) if (dep.mapSideCombine) { // We are reading values that are already combined val combinedKeyValuesIterator = iterator.asInstanceOf[Iterator[(K, V)]] aggregator.combineCombinersByKey(combinedKeyValuesIterator, context) } else { // We don't know the value type, but also don't care -- the dependency *should* // have made sure its compatible w/ this aggregator, which will convert the value // type to the combined type C val keyValuesIterator = iterator.asInstanceOf[Iterator[(K, Nothing)]] aggregator.combineValuesByKey(keyValuesIterator, context) } case None => require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!") iterator.asInstanceOf[KCIterator] } } def getSortedIterator(iterator: KCIterator): KCIterator = { // Sort the output if there is a sort ordering defined. dep.keyOrdering match { case Some(keyOrd: Ordering[K]) => // Create an ExternalSorter to sort the data. val sorter = new SplashSorter[K, V, V]( context, ordering = Some(keyOrd), serializer = serializer) sorter.insertAll(iterator) sorter.updateTaskMetrics() sorter.completionIterator case None => iterator } } val metricIter = fetcherIterator.flatMap( _.asMetricIterator(serializer, taskMetrics)) // An interruptible iterator must be used here in order to support task cancellation getSortedIterator( getAggregatedIterator( new InterruptibleIterator[Pair](context, metricIter))) } }
Example 51
Source File: SparkPlanSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkEnv import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext class SparkPlanSuite extends QueryTest with SharedSQLContext { test("SPARK-21619 execution of a canonicalized plan should fail") { val plan = spark.range(10).queryExecution.executedPlan.canonicalized intercept[IllegalStateException] { plan.execute() } intercept[IllegalStateException] { plan.executeCollect() } intercept[IllegalStateException] { plan.executeCollectPublic() } intercept[IllegalStateException] { plan.executeToIterator() } intercept[IllegalStateException] { plan.executeBroadcast() } intercept[IllegalStateException] { plan.executeTake(1) } } test("SPARK-23731 plans should be canonicalizable after being (de)serialized") { withTempPath { path => spark.range(1).write.parquet(path.getAbsolutePath) val df = spark.read.parquet(path.getAbsolutePath) val fileSourceScanExec = df.queryExecution.sparkPlan.collectFirst { case p: FileSourceScanExec => p }.get val serializer = SparkEnv.get.serializer.newInstance() val readback = serializer.deserialize[FileSourceScanExec](serializer.serialize(fileSourceScanExec)) try { readback.canonicalized } catch { case e: Throwable => fail("FileSourceScanExec was not canonicalizable", e) } } } test("SPARK-25357 SparkPlanInfo of FileScan contains nonEmpty metadata") { withTempPath { path => spark.range(5).write.parquet(path.getAbsolutePath) val f = spark.read.parquet(path.getAbsolutePath) assert(SparkPlanInfo.fromSparkPlan(f.queryExecution.sparkPlan).metadata.nonEmpty) } } }
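The SPARK-23731 test is essentially a round-trip check: serialize the plan node, deserialize it, and make sure the result still behaves (here, that it can still be canonicalized). A generic round-trip helper of the same shape, using Java serialization instead of SparkEnv's serializer, might look like this:

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

object RoundTripSketch {
  // Serialize a value to bytes and immediately read it back, as the test above does with
  // FileSourceScanExec through SparkEnv.get.serializer.
  def roundTrip[T <: java.io.Serializable](value: T): T = {
    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    out.writeObject(value)
    out.close()
    val in = new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray))
    try in.readObject().asInstanceOf[T] finally in.close()
  }

  def main(args: Array[String]): Unit = {
    val original = Vector("a", "b", "c")
    val readBack = roundTrip(original)
    assert(readBack == original, "value should survive a serialization round trip")
    println(readBack)
  }
}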
Example 52
Source File: ContinuousRecordEndpoint.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.SparkEnv import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset case class ContinuousRecordPartitionOffset(partitionId: Int, offset: Int) extends PartitionOffset case class GetRecord(offset: ContinuousRecordPartitionOffset) override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetRecord(ContinuousRecordPartitionOffset(partitionId, offset)) => lock.synchronized { val bufOffset = offset - startOffsets(partitionId) val buf = buckets(partitionId) val record = if (buf.size <= bufOffset) None else Some(buf(bufOffset)) context.reply(record.map(InternalRow(_))) } } }
Example 53
Source File: ContinuousShuffleReadRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import java.util.UUID import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.rpc.RpcAddress import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.NextIterator case class ContinuousShuffleReadPartition( index: Int, endpointName: String, queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long) extends Partition { // Initialized only on the executor, and only once even as we call compute() multiple times. lazy val (reader: ContinuousShuffleReader, endpoint) = { val env = SparkEnv.get.rpcEnv val receiver = new RPCContinuousShuffleReader( queueSize, numShuffleWriters, epochIntervalMs, env) val endpoint = env.setupEndpoint(endpointName, receiver) TaskContext.get().addTaskCompletionListener[Unit] { ctx => env.stop(endpoint) } (receiver, endpoint) } } class ContinuousShuffleReadRDD( sc: SparkContext, numPartitions: Int, queueSize: Int = 1024, numShuffleWriters: Int = 1, epochIntervalMs: Long = 1000, val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}")) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { (0 until numPartitions).map { partIndex => ContinuousShuffleReadPartition( partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { split.asInstanceOf[ContinuousShuffleReadPartition].reader.read() } }
Example 54
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.UUID import org.apache.spark.{HashPartitioner, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { override def output: Seq[Attribute] = child.output override def children: Seq[SparkPlan] = child :: Nil override def outputPartitioning: Partitioning = SinglePartition override def doExecute(): RDD[InternalRow] = { assert(numPartitions == 1) new ContinuousCoalesceRDD( sparkContext, numPartitions, conf.continuousStreamingExecutorQueueSize, sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, child.execute()) } }
Example 55
Source File: ContinuousWriteRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.util.Utils class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactory[InternalRow]) extends RDD[Unit](prev) { override val partitioner = prev.partitioner override def getPartitions: Array[Partition] = prev.partitions override def compute(split: Partition, context: TaskContext): Iterator[Unit] = { val epochCoordinator = EpochCoordinatorRef.get( context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) EpochTracker.initializeCurrentEpoch( context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong) while (!context.isInterrupted() && !context.isCompleted()) { var dataWriter: DataWriter[InternalRow] = null // write the data and commit this writer. Utils.tryWithSafeFinallyAndFailureCallbacks(block = { try { val dataIterator = prev.compute(split, context) dataWriter = writeTask.createDataWriter( context.partitionId(), context.taskAttemptId(), EpochTracker.getCurrentEpoch.get) while (dataIterator.hasNext) { dataWriter.write(dataIterator.next()) } logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.") val msg = dataWriter.commit() epochCoordinator.send( CommitPartitionEpoch( context.partitionId(), EpochTracker.getCurrentEpoch.get, msg) ) logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.") EpochTracker.incrementCurrentEpoch() } catch { case _: InterruptedException => // Continuous shutdown always involves an interrupt. Just finish the task. } })(catchBlock = { // If there is an error, abort this writer. We enter this callback in the middle of // rethrowing an exception, so compute() will stop executing at this point. logError(s"Writer for partition ${context.partitionId()} is aborting.") if (dataWriter != null) dataWriter.abort() logError(s"Writer for partition ${context.partitionId()} aborted.") }) } Iterator() } override def clearDependencies() { super.clearDependencies() prev = null } }
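Each pass of the loop above writes an epoch's worth of rows, commits the writer on success, and aborts it on failure so partial output is never published. The sketch below isolates that commit-or-abort control flow behind a hypothetical Writer trait standing in for Spark's DataWriter.

// A minimal sketch of the write-commit-abort flow, with a hypothetical Writer trait.
trait Writer {
  def write(row: String): Unit
  def commit(): Unit
  def abort(): Unit
}

object CommitOrAbortSketch {
  def process(rows: Iterator[String], writer: Writer): Unit = {
    try {
      rows.foreach(writer.write)
      writer.commit()
    } catch {
      case t: Throwable =>
        // On any failure, abort so partial output is never published, then rethrow.
        writer.abort()
        throw t
    }
  }

  def main(args: Array[String]): Unit = {
    val buffer = scala.collection.mutable.ArrayBuffer.empty[String]
    val writer = new Writer {
      def write(row: String): Unit = buffer += row
      def commit(): Unit = println(s"committed ${buffer.size} rows")
      def abort(): Unit = { buffer.clear(); println("aborted") }
    }
    process(Iterator("a", "b", "c"), writer)
  }
}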
Example 56
Source File: StateStoreCoordinator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import java.util.UUID import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.util.RpcUtils private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { private val instances = new mutable.HashMap[StateStoreProviderId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => logDebug(s"Reported state store $id is active at $executorId") instances.put(id, ExecutorCacheTaskLocation(host, executorId)) } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case VerifyIfInstanceActive(id, execId) => val response = instances.get(id) match { case Some(location) => location.executorId == execId case None => false } logDebug(s"Verified that state store $id is active: $response") context.reply(response) case GetLocation(id) => val executorId = instances.get(id).map(_.toString) logDebug(s"Got location of the state store $id: $executorId") context.reply(executorId) case DeactivateInstances(runId) => val storeIdsToRemove = instances.keys.filter(_.queryRunId == runId).toSeq instances --= storeIdsToRemove logDebug(s"Deactivating instances related to checkpoint location $runId: " + storeIdsToRemove.mkString(", ")) context.reply(true) case StopCoordinator => stop() // Stop before replying to ensure that endpoint name has been deregistered logInfo("StateStoreCoordinator stopped") context.reply(true) } }
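Behind the RPC plumbing, the coordinator is a map from state-store instance to the executor that last reported it, with handlers to verify liveness and to drop every instance of a finished run. The plain-Scala sketch below keeps only that bookkeeping; the (runId, storeId) key is a simplification of Spark's StateStoreProviderId.

import scala.collection.mutable

class InstanceRegistry {
  private val instances = mutable.HashMap.empty[(String, Int), String]

  // Remember which executor hosts this store instance.
  def reportActive(runId: String, storeId: Int, executorId: String): Unit =
    instances.put((runId, storeId), executorId)

  // Is the instance still registered on the given executor?
  def verifyActive(runId: String, storeId: Int, executorId: String): Boolean =
    instances.get((runId, storeId)).contains(executorId)

  // Drop every instance belonging to a finished run (materialize the keys before removing).
  def deactivate(runId: String): Unit =
    instances --= instances.keys.filter(_._1 == runId).toSeq
}

object InstanceRegistrySketch {
  def main(args: Array[String]): Unit = {
    val registry = new InstanceRegistry
    registry.reportActive("run-1", 0, "exec-7")
    println(registry.verifyActive("run-1", 0, "exec-7")) // true
    registry.deactivate("run-1")
    println(registry.verifyActive("run-1", 0, "exec-7")) // false
  }
}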
Example 57
Source File: EvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import java.io.File import scala.collection.mutable.ArrayBuffer import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.api.python.ChainedPythonFunctions import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.util.Utils abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends SparkPlan { def children: Seq[SparkPlan] = child :: Nil override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length)) private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) = { udf.children match { case Seq(u: PythonUDF) => val (chained, children) = collectFunctions(u) (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children) case children => // There should not be any other UDFs, or the children can't be evaluated directly. assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty)) (ChainedPythonFunctions(Seq(udf.func)), udf.children) } } protected def evaluate( funcs: Seq[ChainedPythonFunctions], argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] protected override def doExecute(): RDD[InternalRow] = { val inputRDD = child.execute().map(_.copy()) inputRDD.mapPartitions { iter => val context = TaskContext.get() // The queue used to buffer input rows so we can drain it to // combine input with output from Python. val queue = HybridRowQueue(context.taskMemoryManager(), new File(Utils.getLocalDir(SparkEnv.get.conf)), child.output.length) context.addTaskCompletionListener[Unit] { ctx => queue.close() } val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip // flatten all the arguments val allInputs = new ArrayBuffer[Expression] val dataTypes = new ArrayBuffer[DataType] val argOffsets = inputs.map { input => input.map { e => if (allInputs.exists(_.semanticEquals(e))) { allInputs.indexWhere(_.semanticEquals(e)) } else { allInputs += e dataTypes += e.dataType allInputs.length - 1 } }.toArray }.toArray val projection = newMutableProjection(allInputs, child.output) val schema = StructType(dataTypes.zipWithIndex.map { case (dt, i) => StructField(s"_$i", dt) }) // Add rows to queue to join later with the result. val projectedRowIter = iter.map { inputRow => queue.add(inputRow.asInstanceOf[UnsafeRow]) projection(inputRow) } val outputRowIterator = evaluate( pyFuncs, argOffsets, projectedRowIter, schema, context) val joined = new JoinedRow val resultProj = UnsafeProjection.create(output, output) outputRowIterator.map { outputRow => resultProj(joined(queue.remove(), outputRow)) } } } }
Example 58
Source File: ObjectAggregationMap.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import java.{util => ju} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.internal.config import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter def dumpToExternalSorter( groupingAttributes: Seq[Attribute], aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = { val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes) val sorter = new UnsafeKVExternalSorter( StructType.fromAttributes(groupingAttributes), StructType.fromAttributes(aggBufferAttributes), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, TaskContext.get().taskMemoryManager().pageSizeBytes, SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD), null ) val mapIterator = iterator val unsafeAggBufferProjection = UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray) while (mapIterator.hasNext) { val entry = mapIterator.next() aggregateFunctions.foreach { case agg: TypedImperativeAggregate[_] => agg.serializeAggregateBufferInPlace(entry.aggregationBuffer) case _ => } sorter.insertKV( entry.groupingKey, unsafeAggBufferProjection(entry.aggregationBuffer) ) } hashMap.clear() sorter } def clear(): Unit = { hashMap.clear() } } // Stores the grouping key and aggregation buffer class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 59
Source File: OapCacheSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.filecache import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.datasources.oap.filecache.FiberType.FiberType import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.test.oap.SharedOapContext class OapCacheSuite extends SharedOapContext with Logging{ override def beforeAll(): Unit = super.beforeAll() override def afterAll(): Unit = super.afterAll() test("not support cache strategy -- throw exception") { val sparkenv = SparkEnv.get sparkenv.conf.set("spark.oap.cache.strategy", "not_support_cache") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "false") val cacheMemory: Long = 10000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA assertThrows[UnsupportedOperationException] { OapCache(sparkenv, cacheMemory, cacheGuardianMemory, fiberType) } } test("guava cache strategy and offheap memory manager -- return guavaCache") { val sparkenv = SparkEnv.get sparkenv.conf.set("spark.oap.cache.strategy", "guava") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "false") val cacheMemory: Long = 100000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA val guavaCache: OapCache = OapCache(sparkenv, cacheMemory, cacheGuardianMemory, fiberType) assert(guavaCache.isInstanceOf[GuavaOapCache]) } test("guava cache strategy and pm memory manager (without required dirs) " + "-- fall back to simpleCache") { val sparkenv = SparkEnv.get val cacheMemory: Long = 10000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA sparkenv.conf.set("spark.oap.cache.strategy", "guava") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "false") val simpleOapCache: OapCache = OapCache(sparkenv, cacheMemory, cacheGuardianMemory, fiberType) assert(simpleOapCache.isInstanceOf[SimpleOapCache]) } test("noevict cache strategy and pm memory manager (without required dirs) " + "-- fallback to simpleCache") { val sparkenv = SparkEnv.get sparkenv.conf.set("spark.oap.cache.strategy", "noevict") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "false") val cacheMemory: Long = 100000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA val simpleCache: OapCache = OapCache(sparkenv, cacheMemory, cacheGuardianMemory, fiberType) assert(simpleCache.isInstanceOf[SimpleOapCache]) } test("guava cache strategy and pm memory manager (with required dirs) return guavaCache") { val sparkenv = SparkEnv.get val cacheMemory: Long = 10000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA sparkenv.conf.set("spark.oap.cache.strategy", "guava") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "true") val guavaCache: OapCache = 
OapCache(sparkenv, cacheMemory, cacheGuardianMemory, fiberType) assert(guavaCache.isInstanceOf[GuavaOapCache]) } test("noevict cache strategy and pm memory manager (with required dirs) " + "return noevictCache") { val sparkenv = SparkEnv.get val cacheMemory: Long = 10000 val cacheGuardianMemory: Long = 20000 val fiberType: FiberType = FiberType.DATA sparkenv.conf.set("spark.oap.cache.strategy", "noevict") sparkenv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") sparkenv.conf.set("spark.oap.cache.backend.fallback.enabled", "false") sparkenv.conf.set("spark.oap.test.cache.backend.fallback.res", "true") val noevictCache: OapCache = OapCache(sparkenv, OapConf.OAP_FIBERCACHE_STRATEGY, cacheMemory, cacheGuardianMemory, fiberType) assert(noevictCache.isInstanceOf[NoEvictPMCache]) } }
Example 60
Source File: MatfastSerializer.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.util import java.math.BigDecimal import java.nio.ByteBuffer import java.util.{HashMap => JavaHashMap} import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, Serializer} import com.esotericsoftware.kryo.io.{Input, Output} import com.twitter.chill.ResourcePool import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.serializer.{KryoSerializer, SerializerInstance} import org.apache.spark.sql.matfast.matrix._ import org.apache.spark.sql.types.Decimal import org.apache.spark.util.MutablePair private[matfast] class MatfastSerializer(conf: SparkConf) extends KryoSerializer(conf) { override def newKryo(): Kryo = { val kryo = super.newKryo() kryo.setRegistrationRequired(false) kryo.register(classOf[MutablePair[_, _]]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow]) kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer) kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer) kryo.register(classOf[Decimal]) kryo.register(classOf[JavaHashMap[_, _]]) kryo.register(classOf[DenseMatrix]) kryo.register(classOf[SparseMatrix]) kryo.setReferences(false) kryo } } private[matfast] class KryoResourcePool(size: Int) extends ResourcePool[SerializerInstance](size) { val ser: MatfastSerializer = { val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) new MatfastSerializer(sparkConf) } def newInstance(): SerializerInstance = ser.newInstance() } private[matfast] object MatfastSerializer { @transient lazy val resourcePool = new KryoResourcePool(50) private[this] def acquireRelease[O](fn: SerializerInstance => O): O = { val kryo = resourcePool.borrow() try { fn(kryo) } finally { resourcePool.release(kryo) } } def serialize[T: ClassTag](o: T): Array[Byte] = { acquireRelease { k => k.serialize(o).array() } } def deserialize[T: ClassTag](bytes: Array[Byte]): T = acquireRelease { k => k.deserialize[T](ByteBuffer.wrap(bytes)) } } private[matfast] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] { def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) { output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = { new java.math.BigDecimal(input.readString()) } } private[matfast] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] { def write(kryo: Kryo, output: Output, bd: BigDecimal): Unit = { output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = { new java.math.BigDecimal(input.readString()) } }
Example 61
Source File: BlockManagerWrapper.scala From BigDL with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import java.nio.ByteBuffer

import org.apache.spark.SparkEnv

object BlockManagerWrapper {

  def putBytes(
      blockId: BlockId,
      bytes: ByteBuffer,
      level: StorageLevel): Unit = {
    require(bytes != null, "Bytes is null")
    SparkEnv.get.blockManager.putBytes(blockId, bytes, level)
  }

  def putSingle(blockId: BlockId, value: Any,
      level: StorageLevel, tellMaster: Boolean = true): Unit = {
    SparkEnv.get.blockManager.putSingle(blockId, value, level, tellMaster)
  }

  def removeBlock(blockId: BlockId): Unit = {
    SparkEnv.get.blockManager.removeBlock(blockId)
  }

  def getLocal(blockId: BlockId): Option[BlockResult] = {
    SparkEnv.get.blockManager.getLocal(blockId)
  }

  def getLocalBytes(blockId: BlockId): Option[ByteBuffer] = {
    SparkEnv.get.blockManager.getLocalBytes(blockId)
  }

  def getLocalOrRemoteBytes(blockId: BlockId): Option[ByteBuffer] = {
    val bm = SparkEnv.get.blockManager
    val maybeLocalBytes = bm.getLocalBytes(blockId)
    if (maybeLocalBytes.isDefined) {
      maybeLocalBytes
    } else {
      bm.getRemoteBytes(blockId)
    }
  }

  def unlock(blockId: BlockId): Unit = {}
}
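getLocalOrRemoteBytes tries the local block manager first and only goes to the network when the block is not cached locally. Reduced to its essentials this is Option.orElse with a by-name fallback, as the sketch below shows with a plain map standing in for the local store and a fake remote fetch.

object LocalThenRemoteSketch {
  private val localStore = Map("rdd_0_0" -> "local bytes")

  private def fetchRemote(blockId: String): Option[String] = {
    println(s"fetching $blockId from a remote block manager")
    Some(s"remote bytes for $blockId")
  }

  // orElse takes its argument by name, so the remote fetch only runs on a local miss.
  def getLocalOrRemote(blockId: String): Option[String] =
    localStore.get(blockId).orElse(fetchRemote(blockId))

  def main(args: Array[String]): Unit = {
    println(getLocalOrRemote("rdd_0_0")) // served locally, no remote call
    println(getLocalOrRemote("rdd_0_1")) // falls back to the remote fetch
  }
}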
Example 62
Source File: FakeTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.executor.TaskMetrics class FakeTask( stageId: Int, partitionId: Int, prefLocs: Seq[TaskLocation] = Nil, serializedTaskMetrics: Array[Byte] = SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) extends Task[Int](stageId, 0, partitionId, serializedTaskMetrics) { override def prepTask(): Unit = {} override def runTask(context: TaskContext): Int = 0 override def preferredLocations: Seq[TaskLocation] = prefLocs } object FakeTask { def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*) } def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*) } def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil) } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } }
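A sketch of how a scheduler test might use this factory; it has to sit in Spark's own test packages (Task and TaskSet are package-private), and a local SparkContext must already be running because FakeTask's default serializedTaskMetrics argument reads SparkEnv.get:

import org.apache.spark.scheduler.{FakeTask, TaskSet}

// four tasks for stage 0, no locality preferences
val taskSet: TaskSet = FakeTask.createTaskSet(numTasks = 4)
assert(taskSet.tasks.length == 4)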
Example 63
Source File: MemoryTestingUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import java.util.Properties import org.apache.spark.{SparkEnv, TaskContext, TaskContextImpl} object MemoryTestingUtils { def fakeTaskContext(env: SparkEnv): TaskContext = { val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0) new TaskContextImpl( stageId = 0, partitionId = 0, taskAttemptId = 0, attemptNumber = 0, _taskMemoryManager = taskMemoryManager, localProperties = new Properties, metricsSystem = env.metricsSystem) } }
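A hedged sketch of the helper in a memory test: with a local SparkContext up, the fake context can be installed as the calling thread's TaskContext (setTaskContext/unset are package-private, so this belongs in a suite under org.apache.spark):

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.memory.MemoryTestingUtils

val ctx = MemoryTestingUtils.fakeTaskContext(SparkEnv.get)
TaskContext.setTaskContext(ctx)
try {
  // code under test that calls TaskContext.get() and acquires execution memory goes here
} finally {
  TaskContext.unset()
}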
Example 64
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag]( @transient var rdd1: RDD[_ <: Product2[K, V]], @transient var rdd2: RDD[_ <: Product2[K, W]], part: Partitioner) extends RDD[(K, V)](rdd1.context, Nil) { override def getDependencies: Seq[Dependency[_]] = { def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]]) : Dependency[_] = { if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency[T1, T2, Any](rdd, part) } } Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2)) } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.length) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => None case _ => Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = { dependencies(depNum) match { case oneToOneDependency: OneToOneDependency[_] => val dependencyPartition = partition.narrowDeps(depNum).get.split oneToOneDependency.rdd.iterator(dependencyPartition, context) .asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case shuffleDependency: ShuffleDependency[_, _, _] => val iter = SparkEnv.get.shuffleManager .getReader( shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } } // the first dep is rdd1; add all values to the map integrate(0, t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(1, t => map.remove(t._1)) map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
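SubtractedRDD is the machinery behind PairRDDFunctions.subtractByKey; a self-contained sketch of the observable behaviour (local master and sample data are made up):

import org.apache.spark.{SparkConf, SparkContext}

object SubtractByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("subtract-demo"))
    val left = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))
    val right = sc.parallelize(Seq(("b", 99)))
    // keys that appear in `right` are dropped, which is the map/remove dance in compute() above
    assert(left.subtractByKey(right).collect().toSet == Set(("a", 1), ("c", 3)))
    sc.stop()
  }
}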
Example 65
Source File: BlockManagerSlaveEndpoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 66
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// The value() method below is part of DirectTaskResult; the class header and the two fields it
// reads are restored here so the excerpt compiles on its own. The full source also mixes in
// Externalizable and implements writeExternal/readExternal, which are omitted from this excerpt.
private[spark] class DirectTaskResult[T](
    var valueBytes: ByteBuffer,
    var accumUpdates: Seq[AccumulatorV2[_, _]])
  extends TaskResult[T] {

  private var valueObjectDeserialized = false
  private var valueObject: T = _

  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 67
Source File: LocalSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.local import java.io.File import java.net.URL import java.nio.ByteBuffer import org.apache.spark.{SparkConf, SparkContext, SparkEnv, TaskState} import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.{Executor, ExecutorBackend} import org.apache.spark.internal.Logging import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo private case class ReviveOffers() private case class StatusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer) private case class KillTask(taskId: Long, interruptThread: Boolean) private case class StopExecutor() def getUserClasspath(conf: SparkConf): Seq[URL] = { val userClassPathStr = conf.getOption("spark.executor.extraClassPath") userClassPathStr.map(_.split(File.pathSeparator)).toSeq.flatten.map(new File(_).toURI.toURL) } launcherBackend.connect() override def start() { val rpcEnv = SparkEnv.get.rpcEnv val executorEndpoint = new LocalEndpoint(rpcEnv, userClassPath, scheduler, this, totalCores) localEndpoint = rpcEnv.setupEndpoint("LocalSchedulerBackendEndpoint", executorEndpoint) listenerBus.post(SparkListenerExecutorAdded( System.currentTimeMillis, executorEndpoint.localExecutorId, new ExecutorInfo(executorEndpoint.localExecutorHostname, totalCores, Map.empty))) launcherBackend.setAppId(appId) launcherBackend.setState(SparkAppHandle.State.RUNNING) } override def stop() { stop(SparkAppHandle.State.FINISHED) } override def reviveOffers() { localEndpoint.send(ReviveOffers) } override def defaultParallelism(): Int = scheduler.conf.getInt("spark.default.parallelism", totalCores) override def killTask(taskId: Long, executorId: String, interruptThread: Boolean) { localEndpoint.send(KillTask(taskId, interruptThread)) } override def statusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer) { localEndpoint.send(StatusUpdate(taskId, state, serializedData)) } override def applicationId(): String = appId private def stop(finalState: SparkAppHandle.State): Unit = { localEndpoint.ask(StopExecutor) try { launcherBackend.setState(finalState) } finally { launcherBackend.close() } } }
Example 68
Source File: SparkHadoopMapRedUtil.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
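A sketch of the call site: a custom Hadoop-backed writer would invoke commitTask once a task's output is fully written, so the driver-side OutputCommitCoordinator can arbitrate between speculative attempts. The wrapper object and its jobId choice are illustrative, not part of Spark:

import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
import org.apache.spark.TaskContext
import org.apache.spark.mapred.SparkHadoopMapRedUtil

object CommitHelper {
  // call from inside a running task, after the last record has been written
  def commitWhenAuthorized(committer: OutputCommitter, hadoopContext: TaskAttemptContext): Unit = {
    val splitId = TaskContext.get().partitionId()
    SparkHadoopMapRedUtil.commitTask(committer, hadoopContext, jobId = 0, splitId = splitId)
  }
}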
Example 69
Source File: StateStoreCoordinator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.util.RpcUtils private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { private val instances = new mutable.HashMap[StateStoreId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => logDebug(s"Reported state store $id is active at $executorId") instances.put(id, ExecutorCacheTaskLocation(host, executorId)) } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case VerifyIfInstanceActive(id, execId) => val response = instances.get(id) match { case Some(location) => location.executorId == execId case None => false } logDebug(s"Verified that state store $id is active: $response") context.reply(response) case GetLocation(id) => val executorId = instances.get(id).map(_.toString) logDebug(s"Got location of the state store $id: $executorId") context.reply(executorId) case DeactivateInstances(checkpointLocation) => val storeIdsToRemove = instances.keys.filter(_.checkpointLocation == checkpointLocation).toSeq instances --= storeIdsToRemove logDebug(s"Deactivating instances related to checkpoint location $checkpointLocation: " + storeIdsToRemove.mkString(", ")) context.reply(true) case StopCoordinator => stop() // Stop before replying to ensure that endpoint name has been deregistered logInfo("StateStoreCoordinator stopped") context.reply(true) } }
Example 70
Source File: IndexShuffleBlockManager.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle import java.io._ import java.nio.ByteBuffer import com.google.common.io.ByteStreams import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.storage._ def writeIndexFile(shuffleId: Int, mapId: Int, lengths: Array[Long]) = { val indexFile = getIndexFile(shuffleId, mapId) val out = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(indexFile))) try { // We take in lengths of each block, need to convert it to offsets. var offset = 0L out.writeLong(offset) for (length <- lengths) { offset += length out.writeLong(offset) } } finally { out.close() } } override def getBytes(blockId: ShuffleBlockId): Option[ByteBuffer] = { Some(getBlockData(blockId).nioByteBuffer()) } override def getBlockData(blockId: ShuffleBlockId): ManagedBuffer = { // The block is actually going to be a range of a single map output file for this map, so // find out the consolidated file, then the offset within that from our index val indexFile = getIndexFile(blockId.shuffleId, blockId.mapId) val in = new DataInputStream(new FileInputStream(indexFile)) try { ByteStreams.skipFully(in, blockId.reduceId * 8) val offset = in.readLong() val nextOffset = in.readLong() new FileSegmentManagedBuffer( transportConf, getDataFile(blockId.shuffleId, blockId.mapId), offset, nextOffset - offset) } finally { in.close() } } override def stop() = {} }
Example 71
Source File: CachedKafkaConsumer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer} import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging def getOrCreate( topic: String, partition: Int, kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized { val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] val topicPartition = new TopicPartition(topic, partition) val key = CacheKey(groupId, topicPartition) // If this is reattempt at running the task, then invalidate cache and start with // a new consumer if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) { cache.remove(key) new CachedKafkaConsumer(topicPartition, kafkaParams) } else { if (!cache.containsKey(key)) { cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams)) } cache.get(key) } } }
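A sketch of fetching a consumer through this cache; the broker address and group id are placeholders, and because the object is package-private the call would have to come from code in org.apache.spark.sql.kafka010 itself:

import java.{util => ju}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.ByteArrayDeserializer

val kafkaParams = new ju.HashMap[String, Object]()
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092")
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, "spark-kafka-source-demo")
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, classOf[ByteArrayDeserializer].getName)
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, classOf[ByteArrayDeserializer].getName)

val consumer = CachedKafkaConsumer.getOrCreate("events", partition = 0, kafkaParams)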
Example 72
Source File: SegmentPruneRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import scala.collection.JavaConverters._ import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager} import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper import org.apache.carbondata.core.indexstore.SegmentWrapper import org.apache.carbondata.spark.rdd.CarbonRDD class SegmentPruneRDD(@transient private val ss: SparkSession, indexInputFormat: IndexInputFormat) extends CarbonRDD[(String, SegmentWrapper)](ss, Nil) { override protected def getPreferredLocations(split: Partition): Seq[String] = { val locations = split.asInstanceOf[IndexRDDPartition].getLocations if (locations != null) { locations.toSeq } else { Seq() } } override protected def internalGetPartitions: Array[Partition] = { new DistributedPruneRDD(ss, indexInputFormat).partitions } override def internalCompute(split: Partition, context: TaskContext): Iterator[(String, SegmentWrapper)] = { val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit val segments = inputSplits.map(_ .asInstanceOf[IndexInputSplitWrapper].getDistributable.getSegment) segments.foreach(_.setReadCommittedScope(indexInputFormat.getReadCommittedScope)) if (indexInputFormat.getInvalidSegments.size > 0) { // clear the segmentMap and from cache in executor when there are invalid segments IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable, indexInputFormat.getInvalidSegments) } val blockletMap = IndexStoreManager.getInstance .getDefaultIndex(indexInputFormat.getCarbonTable) val prunedSegments = blockletMap .pruneSegments(segments.toList.asJava, indexInputFormat.getFilterResolverIntf) val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) { CacheProvider.getInstance().getCarbonCache.getCurrentSize } else { 0L } val value = (executorIP + "_" + cacheSize.toString, new SegmentWrapper(prunedSegments)) Iterator(value) } }
Example 73
Source File: KryoSerializerDistributedSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.util.Utils import com.esotericsoftware.kryo.Kryo import org.scalatest.FunSuite import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, TestUtils} import org.apache.spark.serializer.KryoDistributedTest._ class KryoSerializerDistributedSuite extends FunSuite { test("kryo objects are serialised consistently in different processes") { val conf = new SparkConf(false) .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.kryo.registrator", classOf[AppJarRegistrator].getName) .set("spark.task.maxFailures", "1") val jar = TestUtils.createJarWithClasses(List(AppJarRegistrator.customClassName)) conf.setJars(List(jar.getPath)) val sc = new SparkContext("local-cluster[2,1,512]", "test", conf) val original = Thread.currentThread.getContextClassLoader val loader = new java.net.URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader) SparkEnv.get.serializer.setDefaultClassLoader(loader) val cachedRDD = sc.parallelize((0 until 10).map((_, new MyCustomClass)), 3).cache() // Randomly mix the keys so that the join below will require a shuffle with each partition // sending data to multiple other partitions. val shuffledRDD = cachedRDD.map { case (i, o) => (i * i * i - 10 * i * i, o)} // Join the two RDDs, and force evaluation assert(shuffledRDD.join(cachedRDD).collect().size == 1) LocalSparkContext.stop(sc) } } object KryoDistributedTest { class MyCustomClass class AppJarRegistrator extends KryoRegistrator { override def registerClasses(k: Kryo) { val classLoader = Thread.currentThread.getContextClassLoader k.register(Class.forName(AppJarRegistrator.customClassName, true, classLoader)) } } object AppJarRegistrator { val customClassName = "KryoSerializerDistributedSuiteCustomClass" } }
Example 74
Source File: HashShuffleManagerSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import java.io.{File, FileWriter} import scala.language.reflectiveCalls import org.scalatest.FunSuite import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer} import org.apache.spark.serializer.JavaSerializer import org.apache.spark.shuffle.FileShuffleBlockManager import org.apache.spark.storage.{ShuffleBlockId, FileSegment} class HashShuffleManagerSuite extends FunSuite with LocalSparkContext { private val testConf = new SparkConf(false) private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) { assert(buffer.isInstanceOf[FileSegmentManagedBuffer]) val segment = buffer.asInstanceOf[FileSegmentManagedBuffer] assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath) assert(expected.offset === segment.getOffset) assert(expected.length === segment.getLength) } test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") { val conf = new SparkConf(false) // reset after EACH object write. This is to ensure that there are bytes appended after // an object is written. So if the codepaths assume writeObject is end of data, this should // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc. conf.set("spark.serializer.objectStreamReset", "1") conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer") conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") sc = new SparkContext("local", "test", conf) val shuffleBlockManager = SparkEnv.get.shuffleManager.shuffleBlockManager.asInstanceOf[FileShuffleBlockManager] val shuffle1 = shuffleBlockManager.forMapTask(1, 1, 1, new JavaSerializer(conf), new ShuffleWriteMetrics) for (writer <- shuffle1.writers) { writer.write("test1") writer.write("test2") } for (writer <- shuffle1.writers) { writer.commitAndClose() } val shuffle1Segment = shuffle1.writers(0).fileSegment() shuffle1.releaseWriters(success = true) val shuffle2 = shuffleBlockManager.forMapTask(1, 2, 1, new JavaSerializer(conf), new ShuffleWriteMetrics) for (writer <- shuffle2.writers) { writer.write("test3") writer.write("test4") } for (writer <- shuffle2.writers) { writer.commitAndClose() } val shuffle2Segment = shuffle2.writers(0).fileSegment() shuffle2.releaseWriters(success = true) // Now comes the test : // Write to shuffle 3; and close it, but before registering it, check if the file lengths for // previous task (forof shuffle1) is the same as 'segments'. Earlier, we were inferring length // of block based on remaining data in file : which could mess things up when there is concurrent read // and writes happening to the same shuffle group. val shuffle3 = shuffleBlockManager.forMapTask(1, 3, 1, new JavaSerializer(testConf), new ShuffleWriteMetrics) for (writer <- shuffle3.writers) { writer.write("test3") writer.write("test4") } for (writer <- shuffle3.writers) { writer.commitAndClose() } // check before we register. checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0))) shuffle3.releaseWriters(success = true) checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0))) shuffleBlockManager.removeShuffle(1) } def writeToFile(file: File, numBytes: Int) { val writer = new FileWriter(file, true) for (i <- 0 until numBytes) writer.write(i) writer.close() } }
Example 75
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.serializer.Serializer def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = { this.serializer = Option(serializer) this } override def getDependencies: Seq[Dependency[_]] = { Seq(rdd1, rdd2).map { rdd => if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency(rdd, part, serializer) } } } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => new ShuffleCoGroupSplitDep(s.shuffleHandle) case _ => new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match { case NarrowCoGroupSplitDep(rdd, _, itsSplit) => rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case ShuffleCoGroupSplitDep(handle) => val iter = SparkEnv.get.shuffleManager .getReader(handle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } // the first dep is rdd1; add all values to the map integrate(partition.deps(0), t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(partition.deps(1), t => map.remove(t._1)) map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
Example 76
Source File: BlockManagerSlaveActor.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.Future import akka.actor.{ActorRef, Actor} import org.apache.spark.{Logging, MapOutputTracker, SparkEnv} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.ActorLogReceive private[storage] class BlockManagerSlaveActor( blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends Actor with ActorLogReceive with Logging { import context.dispatcher // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveWithLogging = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, sender) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, sender) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, sender) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, sender) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => sender ! blockManager.getStatus(blockId) case GetMatchingBlockIds(filter, _) => sender ! blockManager.getMatchingBlockIds(filter) } private def doAsync[T](actionMessage: String, responseActor: ActorRef)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) responseActor ! response logDebug("Sent response: " + response + " to " + responseActor) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) responseActor ! null.asInstanceOf[T] } } }
Example 77
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl import org.apache.spark.util.AkkaUtils private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = AkkaUtils.address( AkkaUtils.protocol(actorSystem), SparkEnv.driverActorSystemName, sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port"), CoarseGrainedSchedulerBackend.ACTOR_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 78
Source File: TaskResult.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.Map import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] private[spark] class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long, Any], var metrics: TaskMetrics) extends TaskResult[T] with Externalizable { def this() = this(null.asInstanceOf[ByteBuffer], null, null) override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { out.writeInt(valueBytes.remaining); Utils.writeByteBuffer(valueBytes, out) out.writeInt(accumUpdates.size) for ((key, value) <- accumUpdates) { out.writeLong(key) out.writeObject(value) } out.writeObject(metrics) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { val blen = in.readInt() val byteVal = new Array[Byte](blen) in.readFully(byteVal) valueBytes = ByteBuffer.wrap(byteVal) val numUpdates = in.readInt if (numUpdates == 0) { accumUpdates = null } else { accumUpdates = Map() for (i <- 0 until numUpdates) { accumUpdates(in.readLong()) = in.readObject() } } metrics = in.readObject().asInstanceOf[TaskMetrics] } def value(): T = { val resultSer = SparkEnv.get.serializer.newInstance() resultSer.deserialize(valueBytes) } }
Example 79
Source File: Serializer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.{Utils, ByteBufferInputStream, NextIterator}

// asIterator belongs to the DeserializationStream returned by a SerializerInstance; the abstract
// class around it is restored here so the excerpt stands on its own.
@DeveloperApi
abstract class DeserializationStream {
  def readObject[T: ClassTag](): T
  def close(): Unit

  // Read the elements of this stream through an iterator. This can only be called once, as
  // reading each element consumes data from the input source.
  def asIterator: Iterator[Any] = new NextIterator[Any] {
    override protected def getNext() = {
      try {
        readObject[Any]()
      } catch {
        case eof: EOFException =>
          finished = true
      }
    }

    override protected def close() {
      DeserializationStream.this.close()
    }
  }
}
Example 80
Source File: SortShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext} import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle} import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockManager: IndexShuffleBlockManager, handle: BaseShuffleHandle[K, V, C], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val blockManager = SparkEnv.get.blockManager private var sorter: ExternalSorter[K, V, _] = null // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private var mapStatus: MapStatus = null private val writeMetrics = new ShuffleWriteMetrics() context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics) override def stop(success: Boolean): Option[MapStatus] = { try { if (stopping) { return None } stopping = true if (success) { return Option(mapStatus) } else { // The map task failed, so delete our output data. shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId) return None } } finally { // Clean up our sorter, which may have its own intermediate files if (sorter != null) { sorter.stop() sorter = null } } } }
Example 81
Source File: DistributedShowCacheRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import scala.collection.JavaConverters._ import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.DistributionUtil import org.apache.carbondata.core.index.IndexStoreManager import org.apache.carbondata.core.indexstore.blockletindex.BlockletIndexFactory import org.apache.carbondata.hadoop.CarbonInputSplit import org.apache.carbondata.spark.rdd.CarbonRDD class DistributedShowCacheRDD(@transient private val ss: SparkSession, tableUniqueId: String, executorCache: Boolean) extends CarbonRDD[String](ss, Nil) { val executorsList: Array[String] = DistributionUtil .getExecutors(ss.sparkContext).flatMap { case (host, executors) => executors.map { executor => s"executor_${ host }_$executor" } }.toArray override protected def getPreferredLocations(split: Partition): Seq[String] = { if (split.asInstanceOf[IndexRDDPartition].getLocations != null) { split.asInstanceOf[IndexRDDPartition].getLocations.toSeq } else { Seq() } } override protected def internalGetPartitions: Array[Partition] = { executorsList.zipWithIndex.map { case (executor, idx) => // create a dummy split for each executor to accumulate the cache size. val dummySplit = new CarbonInputSplit() dummySplit.setLocation(Array(executor)) new IndexRDDPartition(id, idx, List(dummySplit), Array(executor)) } } override def internalCompute(split: Partition, context: TaskContext): Iterator[String] = { val indexes = IndexStoreManager.getInstance().getTableIndexForAllTables.asScala val tableList = tableUniqueId.split(",") val iterator = indexes.collect { case (tableId, tableIndexes) if tableUniqueId.isEmpty || tableList.contains(tableId) => val sizeAndIndexLengths = tableIndexes.asScala .map { index => val indexName = if (index.getIndexFactory.isInstanceOf[BlockletIndexFactory]) { index .getIndexFactory .asInstanceOf[BlockletIndexFactory] .getCarbonTable .getTableUniqueName } else { index.getIndexSchema.getRelationIdentifier.getDatabaseName + "_" + index .getIndexSchema.getIndexName } if (executorCache) { val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" s"${ executorIP }:${ index.getIndexFactory.getCacheSize }:${ index.getIndexSchema.getProviderName }" } else { s"${indexName}:${index.getIndexFactory.getCacheSize}:${ index.getIndexSchema.getProviderName }" } } sizeAndIndexLengths }.flatten.toIterator iterator } }
Example 82
Source File: BeforeAndAfterWithContext.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import eleflow.uberdata.core.IUberdataContext import eleflow.uberdata.core.util.ClusterSettings import org.apache.log4j.{Level, Logger} import org.apache.spark.{SparkConf, SparkEnv} import org.scalatest.{BeforeAndAfterEach, Suite} object TestSparkConf { def conf = { val sconf = new SparkConf() sconf.set("spark.app.name", "teste") sconf } val separator = "," } trait BeforeAndAfterWithContext extends BeforeAndAfterEach { this: Suite => val defaultFilePath = "src/test/resources/" import TestSparkConf._ ClusterSettings.master = Some("local[*]") conf.set("spark.driver.allowMultipleContexts", "true") @transient val context = IUberdataContext.getUC(conf) override def beforeEach() = { setLogLevels(Level.INFO, Seq("spark", "org.eclipse.jetty", "akka")) } def setLogLevels(level: org.apache.log4j.Level, loggers: TraversableOnce[String]) = { loggers.map { loggerName => val logger = Logger.getLogger(loggerName) val prevLevel = logger.getLevel logger.setLevel(level) loggerName -> prevLevel }.toMap } override def afterEach() = { val get = SparkEnv.get val rpcEnv = if (get != null) { Some(get.rpcEnv) } else None context.clearContext() //rpcEnv.foreach( // _.fileServer.asInstanceOf[org.apache.spark.rpc.netty.HttpBasedFileServer].shutdown()) System.clearProperty("spark.master.port") } }
Example 83
Source File: MemoryTestingUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import java.util.Properties import org.apache.spark.{SparkEnv, TaskContext, TaskContextImpl} object MemoryTestingUtils { def fakeTaskContext(env: SparkEnv): TaskContext = { val taskMemoryManager = new TaskMemoryManager(env.memoryManager, 0) new TaskContextImpl( stageId = 0, partitionId = 0, taskAttemptId = 0, attemptNumber = 0, taskMemoryManager = taskMemoryManager, localProperties = new Properties, metricsSystem = env.metricsSystem) } }
Example 84
Source File: BlockManagerSlaveEndpoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 85
Source File: TaskResult.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// As in Example 66, the value() method belongs to DirectTaskResult; the class header and the two
// fields it reads are restored so the excerpt compiles on its own. The full source also mixes in
// Externalizable and implements writeExternal/readExternal, which are omitted from this excerpt.
private[spark] class DirectTaskResult[T](
    var valueBytes: ByteBuffer,
    var accumUpdates: Seq[AccumulatorV2[_, _]])
  extends TaskResult[T] {

  private var valueObjectDeserialized = false
  private var valueObject: T = _

  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 86
Source File: SparkHadoopMapRedUtil.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
Example 87
Source File: RUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
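A tiny sketch of the check in use; since the object is private[spark], this would sit in Spark's own R integration code, and the error message is illustrative:

import org.apache.spark.api.r.RUtils

if (RUtils.isRInstalled) {
  // safe to build and install the SparkR package on this host
} else {
  sys.error("R is not installed or not on PATH; cannot build R packages from --packages jars.")
}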
Example 88
Source File: StreamingQueryListenerSampleJob.scala From spark-monitoring with MIT License | 5 votes |
package com.microsoft.pnp.samplejob import com.microsoft.pnp.logging.Log4jConfiguration import com.microsoft.pnp.util.TryWith import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.metrics.UserMetricsSystems import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.window import org.apache.spark.sql.types.{StringType, StructType, TimestampType} object StreamingQueryListenerSampleJob extends Logging { private final val METRICS_NAMESPACE = "streamingquerylistenersamplejob" private final val COUNTER_NAME = "rowcounter" def main(args: Array[String]): Unit = { // Configure our logging TryWith(getClass.getResourceAsStream("/com/microsoft/pnp/samplejob/log4j.properties")) { stream => { Log4jConfiguration.configure(stream) } } logTrace("Trace message from StreamingQueryListenerSampleJob") logDebug("Debug message from StreamingQueryListenerSampleJob") logInfo("Info message from StreamingQueryListenerSampleJob") logWarning("Warning message from StreamingQueryListenerSampleJob") logError("Error message from StreamingQueryListenerSampleJob") val spark = SparkSession .builder .getOrCreate import spark.implicits._ // this path has sample files provided by databricks for trying out purpose val inputPath = "/databricks-datasets/structured-streaming/events/" val jsonSchema = new StructType().add("time", TimestampType).add("action", StringType) val driverMetricsSystem = UserMetricsSystems .getMetricSystem(METRICS_NAMESPACE, builder => { builder.registerCounter(COUNTER_NAME) }) driverMetricsSystem.counter(COUNTER_NAME).inc // Similar to definition of staticInputDF above, just using `readStream` instead of `read` val streamingInputDF = spark .readStream // `readStream` instead of `read` for creating streaming DataFrame .schema(jsonSchema) // Set the schema of the JSON data .option("maxFilesPerTrigger", 1) // Treat a sequence of files as a stream by picking one file at a time .json(inputPath) driverMetricsSystem.counter(COUNTER_NAME).inc(5) val streamingCountsDF = streamingInputDF .groupBy($"action", window($"time", "1 hour")) .count() // Is this DF actually a streaming DF? streamingCountsDF.isStreaming driverMetricsSystem.counter(COUNTER_NAME).inc(10) val query = streamingCountsDF .writeStream .format("memory") // memory = store in-memory table (for testing only in Spark 2.0) .queryName("counts") // counts = name of the in-memory table .outputMode("complete") // complete = all the counts should be in the table .start() } }
Example 89
Source File: RemoteShuffleUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.remote import java.util.UUID import org.apache.hadoop.fs.Path import org.apache.spark.SparkEnv import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.serializer.{SerializerInstance, SerializerManager} import org.apache.spark.shuffle.ShuffleWriteMetricsReporter import org.apache.spark.storage.{BlockId, TempLocalBlockId, TempShuffleBlockId} object RemoteShuffleUtils { val env = SparkEnv.get def getRemoteWriter( blockId: BlockId, file: Path, serializerManager: SerializerManager, serializerInstance: SerializerInstance, bufferSize: Int, writeMetrics: ShuffleWriteMetricsReporter): RemoteBlockObjectWriter = { val syncWrites = false // env.blockManager.conf.getBoolean("spark.shuffle.sync", false) new RemoteBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize, syncWrites, writeMetrics, blockId) } }
Example 90
Source File: MyNettyBlockRpcServer.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.remote.{HadoopFileSegmentManagedBuffer, MessageForHadoopManagedBuffers, RemoteShuffleManager} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.storage.{BlockId, ShuffleBlockId} class MyNettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length val isShuffleRequest = (blocksNum > 0) && BlockId.apply(openBlocks.blockIds(0)).isInstanceOf[ShuffleBlockId] && (SparkEnv.get.conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName) == classOf[RemoteShuffleManager].getName) if (isShuffleRequest) { val blockIdAndManagedBufferPair = openBlocks.blockIds.map(block => (block, blockManager.getHostLocalShuffleData( BlockId.apply(block), Array.empty).asInstanceOf[HadoopFileSegmentManagedBuffer])) responseContext.onSuccess(new MessageForHadoopManagedBuffers( blockIdAndManagedBufferPair).toByteBuffer.nioBuffer()) } else { // This customized Netty RPC server is only served for RemoteShuffle requests, // Other RPC messages or data chunks transferring should go through // NettyBlockTransferService' NettyBlockRpcServer throw new UnsupportedOperationException("MyNettyBlockRpcServer only serves remote" + " shuffle requests for OpenBlocks") } case uploadBlock: UploadBlock => throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't serve UploadBlock") } } override def receiveStream( client: TransportClient, messageHeader: ByteBuffer, responseContext: RpcResponseCallback): StreamCallbackWithID = { throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't support receiving" + " stream") } override def getStreamManager(): StreamManager = streamManager }
Example 91
Source File: MemoryManagerSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.filecache import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.test.oap.SharedOapContext class MemoryManagerSuite extends SharedOapContext with Logging{ override def afterAll(): Unit = { // restore oapSparkConf to default oapSparkConf.set("spark.oap.cache.strategy", "guava") oapSparkConf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") } test("guava cache with offheap memory manager") { oapSparkConf.set("spark.oap.cache.strategy", "guava") oapSparkConf.set("spark.sql.oap.fiberCache.memory.manager", "offheap") val sparkEnv = SparkEnv.get val memoryManager = MemoryManager(sparkEnv) assert(memoryManager.isInstanceOf[OffHeapMemoryManager]) } test("vmem with tmp memory manager") { val sparkEnv = SparkEnv.get sparkEnv.conf.set("spark.oap.cache.strategy", "vmem") // sparkEnv.conf.set("spark.sql.oap.fiberCache.memory.manager", "pm") val memoryManager = MemoryManager(sparkEnv) assert(memoryManager.isInstanceOf[TmpDramMemoryManager]) } test("mix cache with offheap as index memory manager") { val sparkEnv = SparkEnv.get sparkEnv.conf.set("spark.oap.cache.strategy", "mix") val indexMemoryManager = MemoryManager(sparkEnv, OapConf.OAP_FIBERCACHE_STRATEGY, FiberType.INDEX) assert(indexMemoryManager.isInstanceOf[OffHeapMemoryManager]) } }