org.apache.spark.Partition Scala Examples
The following examples show how to use org.apache.spark.Partition.
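All of the examples share one pattern: a custom Partition subclass carries whatever per-split state is needed, getPartitions builds the array of splits, and compute turns one split into an iterator. The following is a minimal, hypothetical sketch of that pattern; the names RangeRDD and RangePartition are illustrative only and do not come from any of the projects below.

import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

class RangePartition(override val index: Int, val start: Int, val end: Int) extends Partition

class RangeRDD(@transient private val sc: SparkContext, total: Int, numSlices: Int)
  extends RDD[Int](sc, Nil) {

  // One partition per slice, each covering a contiguous range of values.
  override protected def getPartitions: Array[Partition] = {
    val step = math.ceil(total.toDouble / numSlices).toInt
    (0 until numSlices).map { i =>
      new RangePartition(i, i * step, math.min((i + 1) * step, total)): Partition
    }.toArray
  }

  // Each task materializes only the range described by its partition.
  override def compute(split: Partition, context: TaskContext): Iterator[Int] = {
    val p = split.asInstanceOf[RangePartition]
    (p.start until p.end).iterator
  }
}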
Example 1
Source File: AnalyticsRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.client.java.analytics.AnalyticsQuery
import com.couchbase.client.java.query.N1qlQuery
import com.couchbase.spark.connection._
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}
import rx.lang.scala.JavaConversions.toScalaObservable

import scala.concurrent.duration.Duration

class AnalyticsRDD(@transient private val sc: SparkContext,
                   query: AnalyticsQuery,
                   bucketName: String = null,
                   timeout: Option[Duration] = None)
  extends RDD[CouchbaseAnalyticsRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext): Iterator[CouchbaseAnalyticsRow] =
    new AnalyticsAccessor(cbConfig, Seq(query), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = {
    // Try to run the query on a Spark worker co-located on a Couchbase analytics node
    val addressesWithAnalyticsService =
      RDDSupport.couchbaseNodesWithService(cbConfig, bucketName, ServiceType.ANALYTICS)

    // A single query can only run on one node, so return one partition
    Array(new QueryPartition(0, addressesWithAnalyticsService))
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    RDDSupport.getPreferredLocations(split)
  }
}
Example 2
Source File: SplitRDD.scala From spark-bam with Apache License 2.0 | 5 votes |
package org.hammerlab.bam.spark.load

import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input
import org.apache.spark.{ Partition, SparkContext, TaskContext }
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

case class FileSplitPartition(index: Int,
                              start: Long,
                              end: Long,
                              locations: Array[String])
  extends Partition

class SplitRDD private(@transient override val getPartitions: Array[Partition])(implicit sc: SparkContext)
  extends RDD[(Long, Long)](sc, Nil) {

  override def compute(split: Partition, context: TaskContext) =
    Iterator(
      split.asInstanceOf[FileSplitPartition]
    )
    .map(
      fs ⇒ fs.start → fs.end
    )

  override protected def getPreferredLocations(split: Partition) =
    split
      .asInstanceOf[FileSplitPartition]
      .locations
}

object SplitRDD {
  def apply(splits: java.util.List[InputSplit])(implicit sc: SparkContext): SplitRDD =
    new SplitRDD(
      splits
        .iterator()
        .asScala
        .map(_.asInstanceOf[input.FileSplit])
        .zipWithIndex
        .map {
          case (fs, idx) ⇒
            FileSplitPartition(
              idx,
              fs.getStart,
              fs.getStart + fs.getLength,
              fs.getLocations
            )
        }
        .toArray
    )
}
Example 3
Source File: Scanner.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.datastream.reader

import scala.language.reflectiveCalls

import org.apache.spark.{ Partition, SparkContext, TaskContext }
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow

import com.actian.spark_vector.datastream.VectorEndpointConf

// NOTE: the enclosing class declaration is cut off in this listing; the members below are the body of
// an RDD[InternalRow] that reads Vector datastreams through a `reader` and a `readConf: VectorEndpointConf`.

  @volatile private var it: RowReader = _

  override protected def getPartitions =
    (0 until readConf.size).map(idx => new Partition { def index = idx }).toArray

  override protected def getPreferredLocations(split: Partition) =
    Seq(readConf.vectorEndpoints(split.index).host)

  override def compute(split: Partition, taskContext: TaskContext): Iterator[InternalRow] = {
    taskContext.addTaskCompletionListener { _ => closeAll() }
    taskContext.addTaskFailureListener { (_, e) => closeAll(Option(e)) }
    logDebug("Computing partition " + split.index)
    try {
      it = reader.read(split.index)
      it
    } catch {
      case e: Exception =>
        logDebug("Exception occurred when attempting to read from stream. " +
          "If termination was abnormal an additional exception will be thrown.", e)
        Iterator.empty
    }
  }

  def touchDatastreams(parts: List[Int] = List[Int]()) {
    val untouched = List.range(0, readConf.size).diff(parts)
    untouched.foreach(p =>
      try {
        reader.touch(p) // Need to ensure all the streams have been closed except the one used by this instance
        logDebug(s"Closed partition $p Vector transfer datastream")
      } catch {
        case e: Exception =>
          logDebug("Exception while closing unused Vector transfer datastream " + e.toString())
      }
    )
  }

  def closeAll(failure: Option[Throwable] = None): Unit = {
    failure.foreach(logError("Failure during task completion, closing RowReader", _))
    if (it != null) {
      close(it, "RowReader")
      it = null
    }
  }

  private def close[T <: { def close() }](c: T, resourceName: String): Unit = if (c != null) {
    try {
      c.close
    } catch {
      case e: Exception => logWarning(s"Exception closing $resourceName", e)
    }
  }
}
Example 4
Source File: InsertRDD.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }

// NOTE: the enclosing class declaration is cut off in this listing; the members below are the body of
// an RDD[R] that assigns the partitions of a parent `rdd: RDD[R]` to Vector datastreams via `writeConf`.

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))
    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: " +
            s"${vals.take(partitionsPerDataStreamToPrint).mkString(",")} " +
            s"${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions =
    (0 until writeConf.size).map(idx =>
      DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning " +
      s"${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
}
Example 5
Source File: PartitionwiseWeightedSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils

import java.util.Random

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, Partitioner, TaskContext}

import scala.reflect.ClassTag
import scala.util.{Random => ScalaRandom}

class PartitionwiseWeightedSampledRDDPartition(val prev: Partition, val seed: Long, val fraction: Double)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

class PartitionwiseWeightedSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[(T, Float)],
    sampler: WeightedRandomSampler[T, U],
    fractions: Map[Int, Double],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = ScalaRandom.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner: Option[Partitioner] = {
    if (preservesPartitioning) prev.partitioner else None
  }

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[(T, Float)].partitions.map { x =>
      new PartitionwiseWeightedSampledRDDPartition(x, random.nextLong(), fractions.getOrElse(x.index, 0.0))
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    firstParent[(T, Float)].preferredLocations(
      split.asInstanceOf[PartitionwiseWeightedSampledRDDPartition].prev
    )
  }

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseWeightedSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.setFraction(split.fraction)
    thisSampler.sample(firstParent[(T, Float)].iterator(split.prev, context))
  }
}
Example 6
Source File: PartitionwiseSampledRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.rdd

import java.util.Random

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

/**
 * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD,
 * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain
 * a random sample of the records in the partition. The random seeds assigned to the samplers
 * are guaranteed to have different values.
 *
 * @param prev RDD to be sampled
 * @param sampler a random sampler
 * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
 * @param seed random seed
 * @tparam T input RDD item type
 * @tparam U sampled RDD item type
 */
private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = (new Random).nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
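A hedged usage sketch for this example: the class is package-private, so this would only compile inside com.tencent.angel.sona, and BernoulliSampler is assumed as the RandomSampler implementation; the values are illustrative.

import org.apache.spark.util.random.BernoulliSampler

val parent = sc.parallelize(1 to 1000, 4)
// Keep roughly 10% of each partition; each partition receives a distinct seed derived from 42L.
val sampled = new PartitionwiseSampledRDD[Int, Int](parent, new BernoulliSampler[Int](0.1), false, 42L)
sampled.count()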
Example 7
Source File: SlidingRDD.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[angel] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}

/**
 * Represents an RDD from grouping items of its parent RDD in fixed size blocks by passing a sliding
 * window over them. The ordering is first based on the partition index and then the ordering of
 * items within each partition. This is similar to sliding in Scala collections, except that it
 * becomes an empty RDD if the window size is greater than the total number of items. It needs to
 * trigger a Spark job if the parent RDD has more than one partition. To make this operation
 * efficient, the number of items per partition should be larger than the window size and the
 * window size should be small, e.g., 2.
 *
 * @param parent the parent RDD
 * @param windowSize the window size, must be greater than 1
 * @param step step size for windows
 *
 * @see `org.apache.spark.ml.rdd.RDDFunctions.sliding(Int, Int)*`
 * @see `scala.collection.IterableLike.sliding(Int, Int)*`
 */
private[angel] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
}
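A hedged usage sketch: the class is package-private, so this only compiles inside the com.tencent.angel.sona packages; the input RDD and window settings are illustrative.

val windows = new SlidingRDD(sc.parallelize(1 to 6, 2), windowSize = 3, step = 1)
windows.collect()  // Array(Array(1, 2, 3), Array(2, 3, 4), Array(3, 4, 5), Array(4, 5, 6))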
Example 8
Source File: KeyValueRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.java.document.Document
import com.couchbase.spark.Logging
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection, KeyValueAccessor}
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag
import rx.lang.scala.JavaConversions._
import scala.concurrent.duration.Duration

class KeyValuePartition(id: Int, docIds: Seq[String], loc: Option[InetAddress]) extends Partition {
  override def index: Int = id
  def ids: Seq[String] = docIds
  def location: Option[InetAddress] = loc
  override def toString = s"KeyValuePartition($id, $docIds, $loc)"
}

class KeyValueRDD[D <: Document[_]]
  (@transient private val sc: SparkContext, ids: Seq[String], bname: String = null,
   timeout: Option[Duration] = None)
  (implicit ct: ClassTag[D])
  extends RDD[D](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)

  override def compute(split: Partition, context: TaskContext): Iterator[D] = {
    val p = split.asInstanceOf[KeyValuePartition]
    new KeyValueAccessor[D](cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        ids.groupBy(id => {
          val crc32 = new CRC32()
          crc32.update(id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new KeyValuePartition(currentIdx, grouped._2, Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new KeyValuePartition(0, ids, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[KeyValuePartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }
}
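The getPartitions above groups document ids by Couchbase vbucket with a CRC32 hash. The same mapping in isolation, as a standalone sketch; the 1024-partition default is an assumption for illustration, while the real value comes from the bucket config as shown in the example.

import java.util.zip.CRC32

def vbucketOf(id: String, numPartitions: Int = 1024): Int = {
  val crc32 = new CRC32()
  crc32.update(id.getBytes("UTF-8"))
  // Take the upper 15 bits of the CRC and mask down to the bucket's partition count.
  (((crc32.getValue >> 16) & 0x7fff).toInt) & (numPartitions - 1)
}

vbucketOf("airline_10123")  // an index in [0, 1023]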
Example 9
Source File: SpatialViewRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.view.SpatialViewQuery
import com.couchbase.spark.connection.{CouchbaseConfig, SpatialViewAccessor}
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

import scala.concurrent.duration.Duration

case class CouchbaseSpatialViewRow(id: String, key: Any, value: Any, geometry: JsonObject)

class SpatialViewRDD
  (@transient private val sc: SparkContext, viewQuery: SpatialViewQuery, bucketName: String = null,
   timeout: Option[Duration] = None)
  extends RDD[CouchbaseSpatialViewRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext): Iterator[CouchbaseSpatialViewRow] =
    new SpatialViewAccessor(cbConfig, Seq(viewQuery), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = Array(new CouchbasePartition(0))
}

object SpatialViewRDD {
  def apply(sc: SparkContext, bucketName: String, viewQuery: SpatialViewQuery, timeout: Option[Duration] = None) =
    new SpatialViewRDD(sc, viewQuery, bucketName, timeout)
}
Example 10
Source File: QueryRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.client.java.document.json.JsonObject
import com.couchbase.client.java.query.N1qlQuery
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection, QueryAccessor}
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions.toScalaObservable

import scala.concurrent.duration.Duration

case class CouchbaseQueryRow(value: JsonObject)

class QueryPartition(val index: Int, val hostnames: Seq[String]) extends Partition {
  override def toString = s"QueryPartition($index, $hostnames)"
}

class QueryRDD(@transient private val sc: SparkContext,
               query: N1qlQuery,
               bucketName: String = null,
               timeout: Option[Duration] = None)
  extends RDD[CouchbaseQueryRow](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)

  override def compute(split: Partition, context: TaskContext): Iterator[CouchbaseQueryRow] =
    new QueryAccessor(cbConfig, Seq(query), bucketName, timeout).compute()

  override protected def getPartitions: Array[Partition] = {
    // Try to run the query on a Spark worker co-located on a Couchbase query node
    val addressesWithQueryService =
      RDDSupport.couchbaseNodesWithService(cbConfig, bucketName, ServiceType.QUERY)

    // A single query can only run on one node, so return one partition
    Array(new QueryPartition(0, addressesWithQueryService))
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    RDDSupport.getPreferredLocations(split)
  }
}

object QueryRDD {
  def apply(sc: SparkContext, bucketName: String, query: N1qlQuery, timeout: Option[Duration] = None) =
    new QueryRDD(sc, query, bucketName, timeout)
}
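A hedged usage sketch via the companion object shown above; the bucket name and the N1QL statement are placeholders.

import com.couchbase.client.java.query.N1qlQuery

val rows = QueryRDD(sc, "travel-sample", N1qlQuery.simple("SELECT name FROM `travel-sample` LIMIT 10"))
rows.collect().foreach(row => println(row.value))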
Example 11
Source File: RDDSupport.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.client.core.service.ServiceType
import com.couchbase.spark.connection.{CouchbaseConfig, CouchbaseConnection}
import org.apache.spark.Partition
import rx.lang.scala.JavaConversions.toScalaObservable

// NOTE: this listing is truncated; the method below belongs to the RDDSupport helper object used by
// QueryRDD and AnalyticsRDD above, whose couchbaseNodesWithService method is not shown.

  def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[QueryPartition]

    // If the user has co-located Spark worker services on Couchbase nodes, this will get the query
    // to run on a Spark worker running on a relevant Couchbase node, if possible
    val out = if (p.hostnames.nonEmpty) {
      p.hostnames
    } else {
      Nil
    }
    out
  }
}
Example 12
Source File: SubdocLookupRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.spark.Logging
import com.couchbase.spark.connection._
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration

class SubdocLookupPartition(id: Int, specs: Seq[SubdocLookupSpec], loc: Option[InetAddress])
  extends Partition {
  override def index: Int = id
  def ids: Seq[SubdocLookupSpec] = specs
  def location: Option[InetAddress] = loc
  override def toString = s"SubdocLookupPartition($id, $ids, $loc)"
}

class SubdocLookupRDD(@transient private val sc: SparkContext, specs: Seq[SubdocLookupSpec],
                      bname: String = null, timeout: Option[Duration] = None)
  extends RDD[SubdocLookupResult](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)

  override def compute(split: Partition, context: TaskContext): Iterator[SubdocLookupResult] = {
    val p = split.asInstanceOf[SubdocLookupPartition]
    new SubdocLookupAccessor(cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        specs.groupBy(spec => {
          val crc32 = new CRC32()
          crc32.update(spec.id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new SubdocLookupPartition(currentIdx, grouped._2, Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new SubdocLookupPartition(0, specs, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[SubdocLookupPartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }
}
Example 13
Source File: BinaryFileRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{ Configurable, Configuration }
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._

import org.apache.spark.input.StreamFileInputFormat
import org.apache.spark.{ Partition, SparkContext }

private[spark] class BinaryFileRDD[T](
    sc: SparkContext,
    inputFormatClass: Class[_ <: StreamFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
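BinaryFileRDD is the RDD behind SparkContext.binaryFiles, so user code normally reaches it like this; the path is a placeholder.

val files = sc.binaryFiles("hdfs:///data/blobs", minPartitions = 8)
files.map { case (path, stream) => (path, stream.toArray().length) }.collect()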
Example 14
Source File: SubdocMutateRDD.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.rdd

import java.net.InetAddress
import java.util.zip.CRC32

import com.couchbase.client.core.config.CouchbaseBucketConfig
import com.couchbase.client.core.message.cluster.{GetClusterConfigRequest, GetClusterConfigResponse}
import com.couchbase.spark.connection._
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD
import rx.lang.scala.JavaConversions._

import scala.concurrent.duration.Duration

class SubdocMutationPartition(id: Int, specs: Seq[SubdocMutationSpec], loc: Option[InetAddress])
  extends Partition {
  override def index: Int = id
  def ids: Seq[SubdocMutationSpec] = specs
  def location: Option[InetAddress] = loc
  override def toString = s"SubdocMutatePartition($id, $ids, $loc)"
}

class SubdocMutateRDD(@transient private val sc: SparkContext, specs: Seq[SubdocMutationSpec],
                      bname: String = null, timeout: Option[Duration] = None)
  extends RDD[SubdocMutationResult](sc, Nil) {

  private val cbConfig = CouchbaseConfig(sc.getConf)
  private val bucketName = Option(bname).getOrElse(cbConfig.buckets.head.name)

  override def compute(split: Partition, context: TaskContext): Iterator[SubdocMutationResult] = {
    val p = split.asInstanceOf[SubdocMutationPartition]
    new SubdocMutationAccessor(cbConfig, p.ids, bucketName, timeout).compute()
  }

  override protected def getPartitions: Array[Partition] = {
    val core = CouchbaseConnection().bucket(cbConfig, bucketName).core()

    val req = new GetClusterConfigRequest()
    val config = toScalaObservable(core.send[GetClusterConfigResponse](req))
      .map(c => {
        logWarning(c.config().bucketConfigs().toString)
        logWarning(bucketName)
        c.config().bucketConfig(bucketName)
      })
      .toBlocking
      .single

    val parts = config match {
      case bucketConfig: CouchbaseBucketConfig =>
        val numPartitions = bucketConfig.numberOfPartitions()
        var partitionIndex = 0
        specs.groupBy(spec => {
          val crc32 = new CRC32()
          crc32.update(spec.id.getBytes("UTF-8"))
          val rv = (crc32.getValue >> 16) & 0x7fff
          rv.toInt & numPartitions - 1
        }).map(grouped => {
          val hostname = Some(
            bucketConfig.nodeAtIndex(bucketConfig.nodeIndexForMaster(grouped._1, false)).hostname()
          )
          val currentIdx = partitionIndex
          partitionIndex += 1
          new SubdocMutationPartition(currentIdx, grouped._2, Some(InetAddress.getByName(hostname.get)))
        }).toArray
      case _ =>
        logWarning("Memcached preferred locations currently not supported.")
        Array(new SubdocMutationPartition(0, specs, None))
    }

    parts.asInstanceOf[Array[Partition]]
  }

  override protected def getPreferredLocations(split: Partition): Seq[String] = {
    val p = split.asInstanceOf[SubdocMutationPartition]
    if (p.location.isDefined) {
      Seq(p.location.get.getHostName, p.location.get.getHostAddress)
    } else {
      Nil
    }
  }
}
Example 15
Source File: NewHBaseRDD.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark

import org.apache.hadoop.conf.Configuration
import org.apache.yetus.audience.InterfaceAudience
import org.apache.hadoop.mapreduce.InputFormat
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}

@InterfaceAudience.Public
class NewHBaseRDD[K, V](@transient val sc: SparkContext,
                        @transient val inputFormatClass: Class[_ <: InputFormat[K, V]],
                        @transient val keyClass: Class[K],
                        @transient val valueClass: Class[V],
                        @transient private val __conf: Configuration,
                        val hBaseContext: HBaseContext)
  extends NewHadoopRDD(sc, inputFormatClass, keyClass, valueClass, __conf) {

  override def compute(theSplit: Partition, context: TaskContext): InterruptibleIterator[(K, V)] = {
    hBaseContext.applyCreds()
    super.compute(theSplit, context)
  }
}
Example 16
Source File: GDBRDD.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb

import org.apache.hadoop.conf.Configuration
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Logging, Partition, SparkContext, TaskContext}

case class GDBRDD(@transient sc: SparkContext, gdbPath: String, gdbName: String, numPartitions: Int)
  extends RDD[Row](sc, Nil) with Logging {

  @DeveloperApi
  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
    val part = partition.asInstanceOf[GDBPartition]
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    val index = GDBIndex(gdbPath, part.hexName, hadoopConf)
    val table = GDBTable(gdbPath, part.hexName, hadoopConf)
    context.addTaskCompletionListener(context => {
      table.close()
      index.close()
    })
    table.rowIterator(index, part.startAtRow, part.numRowsToRead)
  }

  override protected def getPartitions: Array[Partition] = {
    val hadoopConf = if (sc == null) new Configuration() else sc.hadoopConfiguration
    GDBTable.findTable(gdbPath, gdbName, hadoopConf) match {
      case Some(catTab) => {
        val index = GDBIndex(gdbPath, catTab.hexName, hadoopConf)
        try {
          val numRows = index.numRows
          val numRowsPerPartition = (numRows.toDouble / numPartitions).ceil.toInt
          var startAtRow = 0
          (0 until numPartitions).map(i => {
            val endAtRow = startAtRow + numRowsPerPartition
            val numRowsToRead = if (endAtRow <= numRows) numRowsPerPartition else numRows - startAtRow
            val gdbPartition = GDBPartition(i, catTab.hexName, startAtRow, numRowsToRead)
            startAtRow += numRowsToRead
            gdbPartition
          }).toArray
        } finally {
          index.close()
        }
      }
      case _ => {
        log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty array of Partitions !")
        Array.empty[Partition]
      }
    }
  }
}

private[this] case class GDBPartition(m_index: Int,
                                      val hexName: String,
                                      val startAtRow: Int,
                                      val numRowsToRead: Int
                                     ) extends Partition {
  override def index = m_index
}
Example 17
Source File: RDDMatcher.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.cep.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}

import scala.reflect.ClassTag

import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy

class RDDMatcher[T <: Event: ClassTag](parent: RDD[T],
                                       nfa: NFAController[T],
                                       sstr: SelectionStrategy = SelectionStrategy.FirstMatch,
                                       out: OutputStrategy = Combined)
  extends RDD[T](parent) {

  val collector: MatchCollector[T] = new MatchCollector()
  val engine: CEPEngine[T] = sstr match {
    case SelectionStrategy.FirstMatch        => new FirstMatch(nfa, collector)
    case SelectionStrategy.AllMatches        => new AnyMatch(nfa, collector)
    case SelectionStrategy.NextMatches       => new NextMatch(nfa, collector)
    case SelectionStrategy.ContiguityMatches => new ContiguityMatch(nfa, collector)
    case _ => throw new Exception("The Strategy is not supported")
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    firstParent[T].iterator(split, context).foreach(event => engine.runEngine(event))
    collector.convertEventsToArray().iterator
  }

  override protected def getPartitions: Array[Partition] = firstParent[Event].partitions
}
Example 18
Source File: Neo4jRDD.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.neo4j.spark.{Executor, Neo4jConfig, Partitions}

class Neo4jRDD(@transient sc: SparkContext,
               val query: String,
               val parameters: Map[String, Any] = Map.empty,
               partitions: Partitions = Partitions())
  extends RDD[Row](sc, Nil) {

  val neo4jConfig = Neo4jConfig(sc.getConf)

  override def compute(partition: Partition, context: TaskContext): Iterator[Row] = {
    val neo4jPartition: Neo4jPartition = partition.asInstanceOf[Neo4jPartition]
    Executor.execute(neo4jConfig, query, parameters ++ neo4jPartition.window).sparkRows
  }

  override protected def getPartitions: Array[Partition] = {
    val p = partitions.effective()
    Range(0, p.partitions.toInt).map(idx => new Neo4jPartition(idx, p.skip(idx), p.limit(idx))).toArray
  }

  override def toString(): String = s"Neo4jRDD partitions $partitions $query using $parameters"
}
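A hedged usage sketch against the constructor shown above; the Cypher query is a placeholder and connection settings are assumed to be present in the SparkConf.

val names = new Neo4jRDD(sc, "MATCH (n:Person) RETURN n.name AS name")
names.take(5).foreach(println)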
Example 19
Source File: PushDownJdbcRDD.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.jdbc.utilities

import java.sql.{Connection, ResultSet}

import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.JdbcRDD
import org.apache.spark.sql.Row

import com.paypal.gimel.common.utilities.GenericUtils
import com.paypal.gimel.logger.Logger

class PushDownJdbcRDD(sc: SparkContext,
                      getConnection: () => Connection,
                      sql: String,
                      mapRow: ResultSet => Row = PushDownJdbcRDD.resultSetToRow)
  extends JdbcRDD[Row](sc, getConnection, sql, 0, 100, 1, mapRow)
    with Logging {

  override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = {
    val logger = Logger(this.getClass.getName)
    val functionName = s"[QueryHash: ${sql.hashCode}]"
    logger.info(s"Proceeding to execute push down query $functionName: $sql")
    val queryResult: String = GenericUtils.time(functionName, Some(logger)) {
      JDBCConnectionUtility.withResources(getConnection()) { connection =>
        JdbcAuxiliaryUtilities.executeQueryAndReturnResultString(
          sql,
          connection
        )
      }
    }
    Seq(Row(queryResult)).iterator
  }
}

object PushDownJdbcRDD {
  def resultSetToRow(rs: ResultSet): Row = {
    Row(rs.getString(0))
  }
}
Example 20
Source File: FilteredQueryInfinispanRDD.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.rdd

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.{Partition, TaskContext}
import org.infinispan.client.hotrod.Search
import org.infinispan.spark._

class FilteredQueryInfinispanRDD[K, V, R](parent: InfinispanRDD[K, V], filter: QueryFilter)
  extends RDD[(K, R)](parent.sc, Nil) {

  override def count(): Long = filter match {
    case f: StringQueryFilter =>
      val cache = getCache(parent.configuration, parent.remoteCacheManager)
      Search.getQueryFactory(cache).create(f.queryString).getResultSize
    case f: QueryObjectFilter => f.query.getResultSize
  }

  @DeveloperApi
  override def compute(split: Partition, context: TaskContext): Iterator[(K, R)] =
    parent.compute(
      split,
      context,
      (a, p) => RemoteCacheManagerBuilder.create(p, a),
      filter.name,
      filter.params.toArray
    )

  override protected def getPartitions: Array[Partition] = parent.getPartitions
}
Example 21
Source File: Splitter.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.rdd

import java.net.SocketAddress
import java.util

import org.apache.spark.Partition
import org.infinispan.client.hotrod.CacheTopologyInfo
import org.infinispan.spark.config.ConnectorConfiguration

import scala.collection.JavaConverters._
import scala.collection.mutable

class PerServerSplitter extends Splitter {

  override def split(cacheTopology: CacheTopologyInfo, properties: ConnectorConfiguration): Array[Partition] = {
    val segmentsByServer = cacheTopology.getSegmentsPerServer
    if (segmentsByServer.isEmpty) throw new IllegalArgumentException("No servers found to partition")
    if (segmentsByServer.keySet().size == 1 && segmentsByServer.values().asScala.flatten(_.asScala).isEmpty) {
      Array(new SingleServerPartition(segmentsByServer.keySet.asScala.head, properties))
    } else {
      val segmentsByServerSeq = segmentsByServer.asScala.toStream.sortBy { case (_, v) => v.size }
      val segments = segmentsByServerSeq.flatMap { case (_, segs) => segs.asScala.toSeq }.distinct
      val numServers = segmentsByServerSeq.size
      val numSegments = segments.size
      val segmentsPerServer = Math.ceil(numSegments.toFloat / numServers.toFloat).toInt
      val q = mutable.Queue(segments: _*)
      val segmentsByServerIterator = Iterator.continually(segmentsByServerSeq).flatten

      val result = new mutable.HashMap[SocketAddress, collection.mutable.Set[Integer]]
        with mutable.MultiMap[SocketAddress, Integer]

      while (q.nonEmpty) {
        val (server, segments) = segmentsByServerIterator.next()
        val split = List.fill(segmentsPerServer) {
          q.dequeueFirst(segments.contains)
        }.flatten
        if (split.nonEmpty) {
          split.foreach {
            result.addBinding(server, _)
          }
        }
      }
      val pps = properties.getServerPartitions
      result.toStream.flatMap { case (a, b) => cut(b.toSeq, pps).map((a, _)) }.zipWithIndex.map {
        case ((server, segs), idx) => new InfinispanPartition(idx, Location(server), toJavaSet(segs), properties)
      }.toArray
    }
  }

  private def toJavaSet(s: Set[Integer]) = new util.HashSet[Integer](s.asJava)

  private def cut[A](l: Seq[A], parts: Int) = (0 until parts).map { i =>
    l.drop(i).sliding(1, parts).flatten.toSet
  }.filter(_.nonEmpty)
}
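The round-robin cut at the end is what caps the number of partitions created per server. The same logic in isolation, as a standalone sketch with illustrative inputs:

def cut[A](l: Seq[A], parts: Int): Seq[Set[A]] =
  (0 until parts).map { i => l.drop(i).sliding(1, parts).flatten.toSet }.filter(_.nonEmpty)

cut(Seq(0, 1, 2, 3, 4, 5, 6), 3)  // Vector(Set(0, 3, 6), Set(1, 4), Set(2, 5))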
Example 22
Source File: RabbitMQPartition.scala From spark-rabbitmq with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.rabbitmq.distributed

import org.apache.spark.Partition
import org.apache.spark.streaming.rabbitmq.models.ExchangeAndRouting

private[rabbitmq] class RabbitMQPartition(
    val index: Int,
    val queue: String,
    val exchangeAndRouting: ExchangeAndRouting,
    val connectionParams: Map[String, String],
    val withFairDispatch: Boolean
  ) extends Partition {

  override def toString: String = s"${index.toString},$queue, ${exchangeAndRouting.toString()}," +
    s" ${connectionParams.mkString(" , ")}, ${withFairDispatch.toString}"

  def toStringPretty: String = s"queue: $queue, ${exchangeAndRouting.toStringPretty()}, connectionParams: " +
    s"${connectionParams.mkString(" , ")}, withFairDispatch: $withFairDispatch"
}
Example 23
Source File: MongodbRDD.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb.rdd

import com.mongodb.casbah.Imports._
import com.stratio.datasource.mongodb.partitioner.{MongodbPartition, MongodbPartitioner}
import com.stratio.datasource.util.Config
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{Partition, TaskContext}
import com.stratio.datasource.mongodb.query.{NoFilters, FilterSection}

class MongodbRDD(
    sc: SQLContext,
    config: Config,
    partitioner: MongodbPartitioner,
    requiredColumns: Array[String] = Array(),
    filters: FilterSection = NoFilters)
  extends RDD[DBObject](sc.sparkContext, deps = Nil) {

  override def getPartitions: Array[Partition] =
    partitioner.computePartitions().asInstanceOf[Array[Partition]]

  override def getPreferredLocations(split: Partition): Seq[String] =
    split.asInstanceOf[MongodbPartition].hosts.map(new ServerAddress(_).getHost)

  override def compute(
      split: Partition,
      context: TaskContext): MongodbRDDIterator =
    new MongodbRDDIterator(
      context,
      split.asInstanceOf[MongodbPartition],
      config,
      requiredColumns,
      filters)
}
Example 24
Source File: PartitionPruningRDDSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
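The entry point exercised by this suite is a public DeveloperApi, so partition pruning can also be used directly from application code:

val rdd = sc.parallelize(1 to 100, 10)
// Keep only partitions 7, 8 and 9; tasks for the other seven are never scheduled.
val pruned = org.apache.spark.rdd.PartitionPruningRDD.create(rdd, partitionId => partitionId >= 7)
pruned.partitions.length  // 3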
Example 25
Source File: SnowflakeRDD.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake.io

import java.io.InputStream

import net.snowflake.spark.snowflake.io.SupportedFormat.SupportedFormat
import org.apache.spark.{Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

class SnowflakeRDD(sc: SparkContext,
                   fileNames: List[String],
                   format: SupportedFormat,
                   downloadFile: String => InputStream,
                   expectedPartitionCount: Int)
  extends RDD[String](sc, Nil) {

  @transient private val MIN_FILES_PER_PARTITION = 2
  @transient private val MAX_FILES_PER_PARTITION = 10

  override def compute(split: Partition, context: TaskContext): Iterator[String] = {
    val snowflakePartition = split.asInstanceOf[SnowflakePartition]

    val stringIterator = new SFRecordReader(format, snowflakePartition.index)
    stringIterator.setDownloadFunction(downloadFile)

    snowflakePartition.fileNames.foreach(name => {
      stringIterator.addFileName(name)
    })

    logger.info(
      s"""${SnowflakeResultSetRDD.WORKER_LOG_PREFIX}: Start reading
         | partition ID:${snowflakePartition.index}
         | totalFileCount=${snowflakePartition.fileNames.size}
         |""".stripMargin.filter(_ >= ' '))

    stringIterator
  }

  override protected def getPartitions: Array[Partition] = {
    var fileCountPerPartition =
      Math.max(
        MIN_FILES_PER_PARTITION,
        (fileNames.length + expectedPartitionCount / 2) / expectedPartitionCount
      )
    fileCountPerPartition = Math.min(MAX_FILES_PER_PARTITION, fileCountPerPartition)
    val fileCount = fileNames.length
    val partitionCount = (fileCount + fileCountPerPartition - 1) / fileCountPerPartition
    logger.info(s"""${SnowflakeResultSetRDD.MASTER_LOG_PREFIX}: Total statistics:
         | fileCount=$fileCount filePerPartition=$fileCountPerPartition
         | actualPartitionCount=$partitionCount
         | expectedPartitionCount=$expectedPartitionCount
         |""".stripMargin.filter(_ >= ' '))

    if (fileNames.nonEmpty) {
      fileNames
        .grouped(fileCountPerPartition)
        .zipWithIndex
        .map {
          case (names, index) => SnowflakePartition(names, id, index)
        }
        .toArray
    } else {
      // If the result set is empty, put one empty partition to the array.
      Seq[SnowflakePartition] { SnowflakePartition(fileNames, 0, 0) }.toArray
    }
  }
}

private case class SnowflakePartition(fileNames: List[String],
                                      rddId: Int,
                                      index: Int)
  extends Partition {

  override def hashCode(): Int = 31 * (31 + rddId) + index

  override def equals(other: Any): Boolean = super.equals(other)
}
Example 26
Source File: SlidingRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD

private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int)
  extends Partition with Serializable {
  override val index: Int = idx
}

private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int)
  extends RDD[Array[T]](parent) {

  require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1),
    "Window size and step must be greater than 0, " +
      s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.")

  override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = {
    val part = split.asInstanceOf[SlidingRDDPartition[T]]
    (firstParent[T].iterator(part.prev, context) ++ part.tail)
      .drop(part.offset)
      .sliding(windowSize, step)
      .withPartial(false)
      .map(_.toArray)
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev)

  override def getPartitions: Array[Partition] = {
    val parentPartitions = parent.partitions
    val n = parentPartitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0))
    } else {
      val w1 = windowSize - 1
      // Get partition sizes and first w1 elements.
      val (sizes, heads) = parent.mapPartitions { iter =>
        val w1Array = iter.take(w1).toArray
        Iterator.single((w1Array.length + iter.length, w1Array))
      }.collect().unzip
      val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]]
      var i = 0
      var cumSize = 0
      var partitionIndex = 0
      while (i < n) {
        val mod = cumSize % step
        val offset = if (mod == 0) 0 else step - mod
        val size = sizes(i)
        if (offset < size) {
          val tail = mutable.ListBuffer.empty[T]
          // Keep appending to the current tail until it has w1 elements.
          var j = i + 1
          while (j < n && tail.length < w1) {
            tail ++= heads(j).take(w1 - tail.length)
            j += 1
          }
          if (sizes(i) + tail.length >= offset + windowSize) {
            partitions +=
              new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset)
            partitionIndex += 1
          }
        }
        cumSize += size
        i += 1
      }
      partitions.toArray
    }
  }

  // TODO: Override methods such as aggregate, which only requires one Spark job.
}
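The public route to this class is the sliding method added by org.apache.spark.mllib.rdd.RDDFunctions; for example, pairwise differences over an ordered RDD:

import org.apache.spark.mllib.rdd.RDDFunctions._

val deltas = sc.parallelize(1 to 5, 2).sliding(2).map { case Array(a, b) => b - a }
deltas.collect()  // Array(1, 1, 1, 1)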
Example 27
Source File: StateStoreRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null

    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 28
Source File: WholeTextFileRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.{Text, Writable}
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.task.JobContextImpl

import org.apache.spark.{Partition, SparkContext}
import org.apache.spark.input.WholeTextFileInputFormat

private[spark] class WholeTextFileRDD(
    sc: SparkContext,
    inputFormatClass: Class[_ <: WholeTextFileInputFormat],
    keyClass: Class[Text],
    valueClass: Class[Text],
    conf: Configuration,
    minPartitions: Int)
  extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) {

  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    val conf = getConf
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = new JobContextImpl(conf, jobId)
    inputFormat.setMinPartitions(jobContext, minPartitions)
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
Example 29
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext

private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag](
    @transient var rdd1: RDD[_ <: Product2[K, V]],
    @transient var rdd2: RDD[_ <: Product2[K, W]],
    part: Partitioner)
  extends RDD[(K, V)](rdd1.context, Nil) {

  override def getDependencies: Seq[Dependency[_]] = {
    def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]])
      : Dependency[_] = {
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency[T1, T2, Any](rdd, part)
      }
    }
    Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2))
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.length) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            None
          case _ =>
            Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = {
      dependencies(depNum) match {
        case oneToOneDependency: OneToOneDependency[_] =>
          val dependencyPartition = partition.narrowDeps(depNum).get.split
          oneToOneDependency.rdd.iterator(dependencyPartition, context)
            .asInstanceOf[Iterator[Product2[K, V]]].foreach(op)

        case shuffleDependency: ShuffleDependency[_, _, _] =>
          val iter = SparkEnv.get.shuffleManager
            .getReader(
              shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context)
            .read()
          iter.foreach(op)
      }
    }

    // the first dep is rdd1; add all values to the map
    integrate(0, t => getSeq(t._1) += t._2)
    // the second dep is rdd2; remove all of its keys
    integrate(1, t => map.remove(t._1))
    map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
}
Example 30
Source File: ZippedWithIndexRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

// NOTE: the class header was dropped in this listing; in Spark it is
// `private[spark] class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev)`,
// and the members below are its body.

  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array.empty
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    val parentIter = firstParent[T].iterator(split.prev, context)
    Utils.getIteratorZipWithIndex(parentIter, split.startIndex)
  }
}
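User code reaches this class through RDD.zipWithIndex; note that with more than one partition it first runs a job to compute the per-partition start indices:

val indexed = sc.parallelize(Seq("a", "b", "c"), 2).zipWithIndex()
indexed.collect()  // Array((a,0), (b,1), (c,2))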
Example 31
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
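UnionRDD is what `++` and SparkContext.union produce when the inputs are not co-partitioned; the result keeps one UnionPartition per parent partition:

val a = sc.parallelize(1 to 3, 2)
val b = sc.parallelize(4 to 6, 3)
(a ++ b).partitions.length   // 5
sc.union(Seq(a, b)).count()  // 6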
Example 32
Source File: PartitionwiseSampledRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 33
Source File: PartitionerAwareUnionRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
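When every parent has the same partitioner, sc.union should take this PartitionerAwareUnionRDD path instead of plain UnionRDD and keep that partitioner. A minimal sketch under that assumption (local master, made-up key-value data):

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object PartitionerAwareUnionUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pa-union").setMaster("local[*]"))
    val p = new HashPartitioner(4)

    val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
    val b = sc.parallelize(Seq(2 -> "c", 3 -> "d")).partitionBy(p)

    // Both parents use the same HashPartitioner, so the union keeps it and
    // produces one output partition per partitioner partition.
    val unioned = sc.union(a, b)
    println(unioned.partitioner)       // Some(HashPartitioner@...)
    println(unioned.getNumPartitions)  // 4, not 8

    sc.stop()
  }
}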
Example 34
Source File: BinaryFileRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.StreamFileInputFormat private[spark] class BinaryFileRDD[T]( sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
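BinaryFileRDD backs SparkContext.binaryFiles, which yields one (path, PortableDataStream) record per file. A hedged sketch; the HDFS path is a placeholder:

import org.apache.spark.{SparkConf, SparkContext}

object BinaryFilesUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("binary-files").setMaster("local[*]"))

    // One record per file; the stream is only opened when read on the executor.
    val files = sc.binaryFiles("hdfs:///data/images", minPartitions = 8)
    val sizes = files.mapValues(stream => stream.toArray().length)
    sizes.collect().foreach { case (path, bytes) => println(s"$path -> $bytes bytes") }

    sc.stop()
  }
}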
Example 35
Source File: PartitionPruningRDDSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
Example 36
Source File: RiakKeysPartitioner.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.rdd.partitioner import com.basho.riak.client.core.util.HostAndPort import com.basho.riak.spark.query.QueryData import com.basho.riak.spark.rdd.{ RiakPartition, ReadConf } import org.apache.spark.Partition import java.math.BigInteger case class RiakKeysPartition[K]( index: Int, endpoints: Iterable[HostAndPort], keys: QueryData[K]) extends RiakPartition object RiakKeysPartitioner { def partitions[K](endpoints: Iterable[HostAndPort], readConf: ReadConf, riakKeys: QueryData[K]): Array[Partition] = { riakKeys.keysOrRange match { case Some(Left(keys)) => Array(new RiakKeysPartition[K](0, endpoints, riakKeys)) case Some(Right(ranges: Seq[(K, Option[K])])) => ranges match { case (from, to) +: Seq() => { val splitRanges = splitRangeIntoSubranges(from, to, readConf.getOrDefaultSplitCount()) partitionPerRange(splitRanges, endpoints, riakKeys.index) } case _ => partitionPerRange(ranges, endpoints, riakKeys.index) } } } def partitionPerRange[K](ranges: Seq[(K, Option[K])], endpoints: Iterable[HostAndPort], index: Option[String]): Array[Partition] = { ranges.zipWithIndex.map { case (range, indx) => new RiakKeysPartition[K](indx, endpoints, new QueryData[K](Some(Right(Seq(range))), index)) }.toArray } // TODO: move to PartitionUtils def calculateRanges[T: Integral](from: T, to: T, splitCount: Int)(implicit num: Integral[T]): Seq[(T, T)] = { import num._ val diff = (to - from) / num.fromInt(splitCount - 1) val partitionsCount = if (diff == 0) 1 else splitCount val start = (0 to (partitionsCount - 1)).map(x => from + num.fromInt(x) * diff) val end = start.tail.map(_ - num.fromInt(1)) :+ to start zip end } def splitRangeIntoSubranges[K](from: K, to: Option[K], splitCount: Int): Seq[(K, Option[K])] = { to match { case Some(rangeEnd: Int) => from match { case rangeStart: Int => { calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K]))) } case _ => throw new IllegalArgumentException } case Some(rangeEnd: Long) => from match { case rangeStart: Long => { calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K]))) } case _ => throw new IllegalArgumentException } case Some(rangeEnd: BigInt) => from match { case rangeStart: BigInt => { calculateRanges(rangeStart, rangeEnd, splitCount).map(r => (r._1.asInstanceOf[K], Some(r._2.asInstanceOf[K]))) } case _ => throw new IllegalArgumentException } case _ => Seq((from, to)) } } }
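To make the splitting arithmetic in calculateRanges concrete, here is a standalone copy specialised to Long (the object name is made up). Splitting [0, 100] into 4 subranges uses an integer step of 33, so the final range comes out shorter:

object RangeSplitDemo {
  // Standalone copy of the calculateRanges arithmetic above, specialised to Long.
  def calculateRanges(from: Long, to: Long, splitCount: Int): Seq[(Long, Long)] = {
    val diff = (to - from) / (splitCount - 1)
    val partitionsCount = if (diff == 0) 1 else splitCount
    val start = (0 until partitionsCount).map(i => from + i * diff)
    val end = start.tail.map(_ - 1) :+ to
    start.zip(end)
  }

  def main(args: Array[String]): Unit = {
    // Prints (0,32), (33,65), (66,98), (99,100)
    calculateRanges(0L, 100L, 4).foreach(println)
  }
}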
Example 37
Source File: Partitioner.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import java.net.InetAddress import com.lucidworks.spark.rdd.SolrRDD import com.lucidworks.spark.util.SolrSupport import org.apache.solr.client.solrj.SolrQuery import org.apache.spark.Partition import scala.collection.mutable.ArrayBuffer // Is there a need to override {@code Partitioner.scala} and define our own partition id's object SolrPartitioner { def getShardPartitions(shards: List[SolrShard], query: SolrQuery) : Array[Partition] = { shards.zipWithIndex.map{ case (shard, i) => // Chose any of the replicas as the active shard to query SelectSolrRDDPartition(i, "*", shard, query, SolrRDD.randomReplica(shard))}.toArray } def getSplitPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { var splitPartitions = ArrayBuffer.empty[SelectSolrRDDPartition] var counter = 0 shards.foreach(shard => { val splits = SolrSupport.getShardSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += SelectSolrRDDPartition(counter, "*", shard, split.query, split.replica) counter = counter + 1 }) }) splitPartitions.toArray } // Workaround for SOLR-10490. TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery): Array[Partition] = { shards.zipWithIndex.map{ case (shard, i) => // Chose any of the replicas as the active shard to query ExportHandlerPartition(i, shard, query, SolrRDD.randomReplica(shard), 0, 0)}.toArray } // Workaround for SOLR-10490. TODO: Remove once fixed def getExportHandlerPartitions( shards: List[SolrShard], query: SolrQuery, splitFieldName: String, splitsPerShard: Int): Array[Partition] = { val splitPartitions = ArrayBuffer.empty[ExportHandlerPartition] var counter = 0 shards.foreach(shard => { // Form a continuous iterator list so that we can pick different replicas for different partitions in round-robin mode val splits = SolrSupport.getExportHandlerSplits(query, shard, splitFieldName, splitsPerShard) splits.foreach(split => { splitPartitions += ExportHandlerPartition(counter, shard, split.query, split.replica, split.numWorkers, split.workerId) counter = counter+1 }) }) splitPartitions.toArray } } case class SolrShard(shardName: String, replicas: List[SolrReplica]) case class SolrReplica( replicaNumber: Int, replicaName: String, replicaUrl: String, replicaHostName: String, locations: Array[InetAddress]) { def getHostAndPort(): String = {replicaHostName.substring(0, replicaHostName.indexOf('_'))} override def toString(): String = { return s"SolrReplica(${replicaNumber}) ${replicaName}: url=${replicaUrl}, hostName=${replicaHostName}, locations="+locations.mkString(",") } }
Example 38
Source File: SolrRDDPartition.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark import org.apache.solr.client.solrj.SolrQuery import org.apache.solr.common.params.SolrParams import org.apache.spark.Partition trait SolrRDDPartition extends Partition { val solrShard: SolrShard val query: SolrQuery var preferredReplica: SolrReplica // Preferred replica to query } case class CloudStreamPartition( index: Int, zkhost:String, collection:String, params: SolrParams) extends Partition case class SelectSolrRDDPartition( index: Int, cursorMark: String, solrShard: SolrShard, query: SolrQuery, var preferredReplica: SolrReplica) extends SolrRDDPartition case class ExportHandlerPartition( index: Int, solrShard: SolrShard, query: SolrQuery, var preferredReplica: SolrReplica, numWorkers: Int, workerId: Int) extends SolrRDDPartition case class SolrLimitPartition( index: Int = 0, zkhost:String, collection:String, maxRows: Int, query: SolrQuery) extends Partition
Example 39
Source File: HBasePartition.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.regionserver.RegionScanner import org.apache.spark.{Logging, Partition} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range} private[hbase] class HBasePartition( val idx: Int, val mappedIndex: Int, start: Option[HBaseRawType] = None, end: Option[HBaseRawType] = None, val server: Option[String] = None, val filterPredicates: Option[Expression] = None, @transient relation: HBaseRelation = null, @transient val newScanner:RegionScanner = null) extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) with Partition with IndexMappable with Logging { override def index: Int = idx override def hashCode(): Int = idx @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) def computePredicate(relation: HBaseRelation): Option[Expression] = { val predicate = if (filterPredicates.isDefined && filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) { val oriPredicate = filterPredicates.get val predicateReferences = oriPredicate.references.toSeq val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) val row = new GenericMutableRow(predicateReferences.size) var rowIndex = 0 var i = 0 var range: Range[_] = null while (i < relation.keyColumns.size) { range = relation.generateRange(this, oriPredicate, i) if (range != null) { rowIndex = relation.rowIndex(predicateReferences, i) if (rowIndex >= 0) row.update(rowIndex, range) // if the non-last dimension range is not point, do not proceed to the next dims if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size else i = i + 1 } else i = relation.keyColumns.size } val pr = boundReference.partialReduce(row, predicateReferences) pr match { case (null, e: Expression) => Some(e) case (true, _) => None case (false, _) => Some(Literal(false)) } } else filterPredicates logInfo(predicate.toString) predicate } override def toString = { s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" } }
Example 40
Source File: PartitionsIterator.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import grizzled.slf4j.Logger import org.apache.spark.rdd.RDD import org.apache.spark.{ Partition, TaskContext } protected[flint] object PartitionsIterator { val logger = Logger(PartitionsIterator.getClass) def apply[T]( rdd: RDD[T], partitions: Seq[Partition], context: TaskContext, preservesPartitionsOrdering: Boolean = false // FIXME: This is a band-aid which should be fixed. ): PartitionsIterator[T] = new PartitionsIterator(rdd, partitions, context, preservesPartitionsOrdering) } def headPartitionIndex: Int = curPart.index }
Example 41
Source File: ParallelCollectionRDD.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd

import org.apache.spark.rdd.RDD
import org.apache.spark.{ Partition, SparkContext, TaskContext }

import scala.reflect.ClassTag

case class ParallelCollectionRDDPartition[T: ClassTag](
  override val index: Int,
  values: Seq[T]
) extends Partition

class ParallelCollectionRDD[T: ClassTag](
  sc: SparkContext,
  @transient data: Seq[Seq[T]]
) extends RDD[T](sc, Nil) {

  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    split.asInstanceOf[ParallelCollectionRDDPartition[T]].values.iterator

  override protected def getPartitions: Array[Partition] =
    data.zipWithIndex.map {
      case (d, index) => ParallelCollectionRDDPartition(index, d)
    }.toArray
}
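A usage sketch for the class above (assumes the flint jar is on the classpath; the object name and data are made up): each inner Seq becomes exactly one partition, unlike sc.parallelize, which slices a flat Seq itself.

import org.apache.spark.{SparkConf, SparkContext}
import com.twosigma.flint.rdd.ParallelCollectionRDD

object ParallelCollectionDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pcrdd").setMaster("local[*]"))

    // Three inner Seqs, therefore three partitions with exactly those elements.
    val rdd = new ParallelCollectionRDD(sc, Seq(Seq(1, 2, 3), Seq(4, 5), Seq(6)))
    println(rdd.getNumPartitions)                      // 3
    rdd.glom().collect().foreach(p => println(p.toSeq)) // List(1,2,3), List(4,5), List(6)

    sc.stop()
  }
}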
Example 42
Source File: RangeMergeJoinSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd import com.twosigma.flint.rdd.function.join.RangeMergeJoin import org.apache.spark.Partition import org.scalatest.FlatSpec class RangeMergeJoinSpec extends FlatSpec { val thisSplits = IndexedSeq( RangeSplit(Split(0), CloseOpen(1, Some(2))), RangeSplit(Split(1), CloseOpen(2, Some(3))), RangeSplit(Split(2), CloseOpen(3, Some(4))), RangeSplit(Split(3), CloseOpen(4, Some(5))), RangeSplit(Split(4), CloseOpen(5, None)) ) val thatSplits = IndexedSeq( RangeSplit(Split(0), CloseOpen(1, Some(3))), RangeSplit(Split(1), CloseOpen(3, Some(7))), RangeSplit(Split(2), CloseOpen(7, None)) ) "The RangeMergeJoin" should "`mergeSplits` with no tolerance correctly" in { val benchmark = List( RangeMergeJoin( CloseOpen(1, Some(2)), List(RangeSplit(Split(0), CloseOpen(1, Some(2)))), List(RangeSplit(Split(0), CloseOpen(1, Some(3)))) ), RangeMergeJoin( CloseOpen(2, Some(3)), List(RangeSplit(Split(1), CloseOpen(2, Some(3)))), List(RangeSplit(Split(0), CloseOpen(1, Some(3)))) ), RangeMergeJoin( CloseOpen(3, Some(4)), List(RangeSplit(Split(2), CloseOpen(3, Some(4)))), List(RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(4, Some(5)), List(RangeSplit(Split(3), CloseOpen(4, Some(5)))), List(RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(5, Some(7)), List(RangeSplit(Split(4), CloseOpen(5, None))), List(RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(7, None), List(RangeSplit(Split(4), CloseOpen(5, None))), List(RangeSplit(Split(2), CloseOpen(7, None))) ) ) assertResult(benchmark) { RangeMergeJoin.mergeSplits(thisSplits, thatSplits) } } it should "`mergeSplits` with some tolerance correctly" in { val benchmark = List( RangeMergeJoin( CloseOpen(1, Some(2)), List(RangeSplit(Split(0), CloseOpen(1, Some(2)))), List(RangeSplit(Split(0), CloseOpen(1, Some(3)))) ), RangeMergeJoin( CloseOpen(2, Some(3)), List(RangeSplit(Split(0), CloseOpen(1, Some(2))), RangeSplit(Split(1), CloseOpen(2, Some(3)))), List(RangeSplit(Split(0), CloseOpen(1, Some(3)))) ), RangeMergeJoin( CloseOpen(3, Some(4)), List(RangeSplit(Split(1), CloseOpen(2, Some(3))), RangeSplit(Split(2), CloseOpen(3, Some(4)))), List(RangeSplit(Split(0), CloseOpen(1, Some(3))), RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(4, Some(5)), List(RangeSplit(Split(2), CloseOpen(3, Some(4))), RangeSplit(Split(3), CloseOpen(4, Some(5)))), List(RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(5, Some(7)), List(RangeSplit(Split(3), CloseOpen(4, Some(5))), RangeSplit(Split(4), CloseOpen(5, None))), List(RangeSplit(Split(1), CloseOpen(3, Some(7)))) ), RangeMergeJoin( CloseOpen(7, None), List(RangeSplit(Split(4), CloseOpen(5, None))), List(RangeSplit(Split(1), CloseOpen(3, Some(7))), RangeSplit(Split(2), CloseOpen(7, None))) ) ) assertResult(benchmark) { RangeMergeJoin.mergeSplits(thisSplits, thatSplits, { x: Int => x - 1 }) } } }
Example 43
Source File: TiHandleRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.{TiConfiguration, TiSession} import com.pingcap.tispark.utils.TiUtil import com.pingcap.tispark.{TiPartition, TiTableReference} import gnu.trove.list.array.TLongArrayList import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.{Partition, TaskContext, TaskKilledException} import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ class TiHandleRDD( override val dagRequest: TiDAGRequest, override val physicalId: Long, val output: Seq[Attribute], override val tiConf: TiConfiguration, override val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) { private val outputTypes = output.map(_.dataType) private val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = new Iterator[InternalRow] { checkTimezone() private val tiPartition = split.asInstanceOf[TiPartition] private val session = TiSession.getInstance(tiConf) private val snapshot = session.createSnapshot(dagRequest.getStartTs) private[this] val tasks = tiPartition.tasks private val handleIterator = snapshot.indexHandleRead(dagRequest, tasks) private val regionManager = session.getRegionManager private lazy val handleList = { val lst = new TLongArrayList() handleIterator.asScala.foreach { // Kill the task in case it has been marked as killed. This logic is from // InterruptedIterator, but we inline it here instead of wrapping the iterator in order // to avoid performance overhead. if (context.isInterrupted()) { throw new TaskKilledException } lst.add(_) } lst } // Fetch all handles and group by region id private val regionHandleMap = RangeSplitter .newSplitter(regionManager) .groupByAndSortHandlesByRegionId(physicalId, handleList) .map(x => (x._1.first.getId, x._2)) private val iterator = regionHandleMap.iterator override def hasNext: Boolean = { // Kill the task in case it has been marked as killed. if (context.isInterrupted()) { throw new TaskKilledException } iterator.hasNext } override def next(): InternalRow = { val next = iterator.next val regionId = next._1 val handleList = next._2 // Returns RegionId:[handle1, handle2, handle3...] K-V pair val sparkRow = Row.apply(regionId, handleList.toArray()) TiUtil.rowToInternalRow(sparkRow, outputTypes, converters) } } }
Example 44
Source File: TiRowRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv._ import com.pingcap.tikv.columnar.TiColumnarBatchHelper import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tispark.listener.CacheInvalidateListener import com.pingcap.tispark.{TiPartition, TiTableReference} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.{Partition, TaskContext, TaskKilledException} import org.slf4j.Logger import scala.collection.JavaConversions._ class TiRowRDD( override val dagRequest: TiDAGRequest, override val physicalId: Long, val chunkBatchSize: Int, override val tiConf: TiConfiguration, val output: Seq[Attribute], override val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) { protected val logger: Logger = log // cache invalidation call back function // used for driver to update PD cache private val callBackFunc = CacheInvalidateListener.getInstance() override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = new Iterator[ColumnarBatch] { checkTimezone() private val tiPartition = split.asInstanceOf[TiPartition] private val session = TiSession.getInstance(tiConf) session.injectCallBackFunc(callBackFunc) private val snapshot = session.createSnapshot(dagRequest.getStartTs) private[this] val tasks = tiPartition.tasks private val iterator = snapshot.tableReadChunk(dagRequest, tasks, chunkBatchSize) override def hasNext: Boolean = { // Kill the task in case it has been marked as killed. This logic is from // Interrupted Iterator, but we inline it here instead of wrapping the iterator in order // to avoid performance overhead. if (context.isInterrupted()) { throw new TaskKilledException } iterator.hasNext } override def next(): ColumnarBatch = { TiColumnarBatchHelper.createColumnarBatch(iterator.next) } }.asInstanceOf[Iterator[InternalRow]] }
Example 45
Source File: TiRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv._ import com.pingcap.tikv.exception.TiInternalException import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tikv.types.Converter import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.util.RangeSplitter.RegionTask import com.pingcap.tispark.{TiPartition, TiTableReference} import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import scala.collection.JavaConversions._ import scala.collection.mutable import scala.collection.mutable.ListBuffer abstract class TiRDD( val dagRequest: TiDAGRequest, val physicalId: Long, val tiConf: TiConfiguration, val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends RDD[InternalRow](sparkSession.sparkContext, Nil) { private lazy val partitionPerSplit = tiConf.getPartitionPerSplit protected def checkTimezone(): Unit = { if (!tiConf.getLocalTimeZone.equals(Converter.getLocalTimezone)) { throw new TiInternalException( "timezone are different! driver: " + tiConf.getLocalTimeZone + " executor:" + Converter.getLocalTimezone + " please set user.timezone in spark.driver.extraJavaOptions and spark.executor.extraJavaOptions") } } override protected def getPartitions: Array[Partition] = { val keyWithRegionTasks = RangeSplitter .newSplitter(session.getRegionManager) .splitRangeByRegion(dagRequest.getRangesByPhysicalId(physicalId), dagRequest.getStoreType) val hostTasksMap = new mutable.HashMap[String, mutable.Set[RegionTask]] with mutable.MultiMap[String, RegionTask] var index = 0 val result = new ListBuffer[TiPartition] for (task <- keyWithRegionTasks) { hostTasksMap.addBinding(task.getHost, task) val tasks = hostTasksMap(task.getHost) if (tasks.size >= partitionPerSplit) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 hostTasksMap.remove(task.getHost) } } // add rest for (tasks <- hostTasksMap.values) { result.append(new TiPartition(index, tasks.toSeq, sparkContext.applicationId)) index += 1 } result.toArray } override protected def getPreferredLocations(split: Partition): Seq[String] = split.asInstanceOf[TiPartition].tasks.head.getHost :: Nil }
Example 46
Source File: RedisSourceRdd.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.RedisConfig
import com.redislabs.provider.redis.util.ConnectionUtils.withConnection
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.redis.stream.RedisSourceTypes.StreamEntry
import org.apache.spark.{Partition, SparkContext, TaskContext}

class RedisSourceRdd(sc: SparkContext, redisConfig: RedisConfig,
                     offsetRanges: Seq[RedisSourceOffsetRange], autoAck: Boolean = true)
  extends RDD[StreamEntry](sc, Nil) {

  override def compute(split: Partition, context: TaskContext): Iterator[StreamEntry] = {
    val partition = split.asInstanceOf[RedisSourceRddPartition]
    val offsetRange = partition.offsetRange
    val streamReader = new RedisStreamReader(redisConfig)
    streamReader.unreadStreamEntries(offsetRange)
  }

  override protected def getPartitions: Array[Partition] = {
    offsetRanges.zipWithIndex.map { case (e, i) => RedisSourceRddPartition(i, e) }
      .toArray
  }
}

case class RedisSourceRddPartition(index: Int, offsetRange: RedisSourceOffsetRange)
  extends Partition
Example 47
Source File: MultiZippedPartitionRDD.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

private[spark] class MultiZippedPartitionsRDD[A: ClassTag, V: ClassTag](
    sc: SparkContext,
    var f: (List[Iterator[A]]) => Iterator[V],
    var rddList: List[RDD[A]],
    preservesPartitioning: Boolean = false)
  extends ZippedPartitionsBaseRDD[V](sc, rddList, preservesPartitioning) {

  override def compute(s: Partition, context: TaskContext): Iterator[V] = {
    val partitions = s.asInstanceOf[ZippedPartitionsPartition].partitions
    val iterList = rddList.zipWithIndex.map { case (rdd: RDD[A], index: Int) =>
      rdd.iterator(partitions(index), context)
    }
    f(iterList)
  }

  override def clearDependencies() {
    super.clearDependencies()
    rddList = null
    f = null
  }
}
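Since the class is private[spark], a direct construction has to live under org.apache.spark.rdd. A hedged sketch assuming spark-vlbfgs (and Spark's ZippedPartitionsBaseRDD) is on the classpath; every parent must have the same number of partitions, and the object name is made up.

package org.apache.spark.rdd

import org.apache.spark.{SparkConf, SparkContext}

object MultiZipDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("multizip").setMaster("local[*]"))

    // Three parents, three partitions each, so partition i of each parent is zipped together.
    val rdds = List(
      sc.parallelize(1 to 6, 3),
      sc.parallelize(11 to 16, 3),
      sc.parallelize(21 to 26, 3))

    // Sum the three co-located iterators element by element.
    val summed = new MultiZippedPartitionsRDD[Int, Int](
      sc,
      (its: List[Iterator[Int]]) => its.reduce((a, b) => a.zip(b).map { case (x, y) => x + y }),
      rdds)

    println(summed.collect().toSeq) // 33, 36, 39, 42, 45, 48
    sc.stop()
  }
}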
Example 48
Source File: DataSourceRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.v2.reader.InputPartition class DataSourceRDDPartition[T : ClassTag](val index: Int, val inputPartition: InputPartition[T]) extends Partition with Serializable class DataSourceRDD[T: ClassTag]( sc: SparkContext, @transient private val inputPartitions: Seq[InputPartition[T]]) extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { inputPartitions.zipWithIndex.map { case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val reader = split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition .createPartitionReader() context.addTaskCompletionListener[Unit](_ => reader.close()) val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { if (!valuePrepared) { valuePrepared = reader.next() } valuePrepared } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } valuePrepared = false reader.get() } } new InterruptibleIterator(context, iter) } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[DataSourceRDDPartition[T]].inputPartition.preferredLocations() } }
Example 49
Source File: StateStoreRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import java.util.UUID import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.execution.streaming.continuous.EpochTracker import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration override def getPreferredLocations(partition: Partition): Seq[String] = { val stateStoreProviderId = StateStoreProviderId( StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId) storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq } override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = { var store: StateStore = null val storeProviderId = StateStoreProviderId( StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId) // If we're in continuous processing mode, we should get the store version for the current // epoch rather than the one at planning time. val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING)) .map(_.toBoolean).getOrElse(false) val currentVersion = if (isContinuous) { val epoch = EpochTracker.getCurrentEpoch assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.") epoch.get } else { storeVersion } store = StateStore.get( storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion, storeConf, hadoopConfBroadcast.value.value) val inputIter = dataRDD.iterator(partition, ctxt) storeUpdateFunction(store, inputIter) } }
Example 50
Source File: ContinuousWriteRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.util.Utils class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactory[InternalRow]) extends RDD[Unit](prev) { override val partitioner = prev.partitioner override def getPartitions: Array[Partition] = prev.partitions override def compute(split: Partition, context: TaskContext): Iterator[Unit] = { val epochCoordinator = EpochCoordinatorRef.get( context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) EpochTracker.initializeCurrentEpoch( context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong) while (!context.isInterrupted() && !context.isCompleted()) { var dataWriter: DataWriter[InternalRow] = null // write the data and commit this writer. Utils.tryWithSafeFinallyAndFailureCallbacks(block = { try { val dataIterator = prev.compute(split, context) dataWriter = writeTask.createDataWriter( context.partitionId(), context.taskAttemptId(), EpochTracker.getCurrentEpoch.get) while (dataIterator.hasNext) { dataWriter.write(dataIterator.next()) } logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.") val msg = dataWriter.commit() epochCoordinator.send( CommitPartitionEpoch( context.partitionId(), EpochTracker.getCurrentEpoch.get, msg) ) logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.") EpochTracker.incrementCurrentEpoch() } catch { case _: InterruptedException => // Continuous shutdown always involves an interrupt. Just finish the task. } })(catchBlock = { // If there is an error, abort this writer. We enter this callback in the middle of // rethrowing an exception, so compute() will stop executing at this point. logError(s"Writer for partition ${context.partitionId()} is aborting.") if (dataWriter != null) dataWriter.abort() logError(s"Writer for partition ${context.partitionId()} aborted.") }) } Iterator() } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 51
Source File: ContinuousShuffleReadRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import java.util.UUID import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.rpc.RpcAddress import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.NextIterator case class ContinuousShuffleReadPartition( index: Int, endpointName: String, queueSize: Int, numShuffleWriters: Int, epochIntervalMs: Long) extends Partition { // Initialized only on the executor, and only once even as we call compute() multiple times. lazy val (reader: ContinuousShuffleReader, endpoint) = { val env = SparkEnv.get.rpcEnv val receiver = new RPCContinuousShuffleReader( queueSize, numShuffleWriters, epochIntervalMs, env) val endpoint = env.setupEndpoint(endpointName, receiver) TaskContext.get().addTaskCompletionListener[Unit] { ctx => env.stop(endpoint) } (receiver, endpoint) } } class ContinuousShuffleReadRDD( sc: SparkContext, numPartitions: Int, queueSize: Int = 1024, numShuffleWriters: Int = 1, epochIntervalMs: Long = 1000, val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}")) extends RDD[UnsafeRow](sc, Nil) { override protected def getPartitions: Array[Partition] = { (0 until numPartitions).map { partIndex => ContinuousShuffleReadPartition( partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = { split.asInstanceOf[ContinuousShuffleReadPartition].reader.read() } }
Example 52
Source File: AsHadoopPartition.scala From spark-util with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.Partition
import org.hammerlab.hadoop.splits.FileSplit
import org.apache.hadoop.mapreduce.lib.input
import org.apache.hadoop.mapred

object AsHadoopPartition {
  def apply(partition: Partition): HadoopPartition =
    partition.asInstanceOf[HadoopPartition]
}

object AsNewHadoopPartition {
  def apply(partition: Partition): NewHadoopPartition =
    partition.asInstanceOf[NewHadoopPartition]
}

object GetFileSplit {
  def apply(partition: Partition): FileSplit =
    partition match {
      case p: HadoopPartition ⇒
        FileSplit(p.inputSplit.value.asInstanceOf[mapred.FileSplit])
      case p: NewHadoopPartition ⇒
        FileSplit(p.serializableHadoopSplit.value.asInstanceOf[input.FileSplit])
      case _ ⇒
        throw NonHadoopPartition(partition)
    }
}

case class NonHadoopPartition(p: Partition) extends IllegalArgumentException
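A usage sketch for GetFileSplit (the path is a placeholder and the hammerlab spark-util FileSplit wrapper is assumed to be on the classpath): sc.textFile is backed by a HadoopRDD, so each partition can be mapped back to the file split it covers.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.GetFileSplit

object FileSplitDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("splits").setMaster("local[*]"))

    // textFile delegates its partitions to the underlying HadoopRDD, so each
    // partition object is a HadoopPartition that GetFileSplit can unwrap.
    val lines = sc.textFile("hdfs:///logs/2020-01-01/*.log")
    lines.partitions.foreach { part =>
      println(s"partition ${part.index}: ${GetFileSplit(part)}")
    }

    sc.stop()
  }
}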
Example 53
Source File: EventHubsRDD.scala From azure-event-hubs-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.eventhubs.rdd import com.microsoft.azure.eventhubs.EventData import org.apache.spark.eventhubs.EventHubsConf import org.apache.spark.eventhubs.client.CachedEventHubsReceiver import org.apache.spark.eventhubs.utils.SimulatedCachedReceiver import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.{ Partition, SparkContext, TaskContext } private[spark] class EventHubsRDD(sc: SparkContext, val ehConf: EventHubsConf, val offsetRanges: Array[OffsetRange]) extends RDD[EventData](sc, Nil) with Logging with HasOffsetRanges { override def getPartitions: Array[Partition] = offsetRanges .sortBy(_.partitionId) .map( o => new EventHubsRDDPartition( o.partitionId, o.nameAndPartition, o.fromSeqNo, o.untilSeqNo, o.preferredLoc )) override def count: Long = offsetRanges.map(_.count).sum override def isEmpty(): Boolean = count == 0L override def take(num: Int): Array[EventData] = { val nonEmptyPartitions = this.partitions.map(_.asInstanceOf[EventHubsRDDPartition]).filter(_.count > 0) if (num < 1 || nonEmptyPartitions.isEmpty) { return Array() } val parts = nonEmptyPartitions.foldLeft(Map[Int, Int]()) { (result, part) => val remain = num - result.values.sum if (remain > 0) { val taken = Math.min(remain, part.count) result + (part.index -> taken.toInt) } else { result } } context .runJob( this, (tc: TaskContext, it: Iterator[EventData]) => it.take(parts(tc.partitionId)).toArray, parts.keys.toArray ) .flatten } override def getPreferredLocations(split: Partition): Seq[String] = { val part = split.asInstanceOf[EventHubsRDDPartition] part.preferredLoc.map(Seq(_)).getOrElse(Seq.empty) } private def errBeginAfterEnd(part: EventHubsRDDPartition): String = s"The beginning sequence number ${part.fromSeqNo} is larger than the ending " + s"sequence number ${part.untilSeqNo} for EventHubs ${part.name} on partition " + s"${part.partitionId}." override def compute(partition: Partition, context: TaskContext): Iterator[EventData] = { val part = partition.asInstanceOf[EventHubsRDDPartition] assert(part.fromSeqNo <= part.untilSeqNo, errBeginAfterEnd(part)) if (part.fromSeqNo == part.untilSeqNo) { logInfo( s"(TID ${context.taskAttemptId()}) Beginning sequence number ${part.fromSeqNo} is equal to the ending sequence " + s"number ${part.untilSeqNo}. Returning empty partition for EH: ${part.name} " + s"on partition: ${part.partitionId}") Iterator.empty } else { logInfo( s"(TID ${context.taskAttemptId()}) Computing EventHubs ${part.name}, partition ${part.partitionId} " + s"sequence numbers ${part.fromSeqNo} => ${part.untilSeqNo}") val cachedReceiver = if (ehConf.useSimulatedClient) { SimulatedCachedReceiver } else { CachedEventHubsReceiver } cachedReceiver.receive(ehConf, part.nameAndPartition, part.fromSeqNo, (part.untilSeqNo - part.fromSeqNo).toInt) } } }
Example 54
Source File: SequoiadbRDD.scala From spark-sequoiadb with Apache License 2.0 | 5 votes |
package com.sequoiadb.spark.rdd import org.apache.spark.SparkContext import _root_.com.sequoiadb.spark.SequoiadbConfig import com.sequoiadb.spark.partitioner._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.Filter import org.apache.spark.{Partition, TaskContext} import org.bson.BSONObject import org.slf4j.{Logger, LoggerFactory} import scala.collection.mutable.ArrayBuffer //import java.io.FileOutputStream; def apply ( sc: SQLContext, config: SequoiadbConfig, partitioner: Option[SequoiadbPartitioner] = None, requiredColumns: Array[String] = Array(), filters: Array[Filter] = Array(), queryReturnType: Int = SequoiadbConfig.QUERYRETURNBSON, queryLimit: Long = -1) = { new SequoiadbRDD ( sc.sparkContext, config, partitioner, requiredColumns, filters, queryReturnType, queryLimit) } }
Example 55
Source File: SlidingRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int) extends RDD[Array[T]](parent) { require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1), "Window size and step must be greater than 0, " + s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .drop(part.offset) .sliding(windowSize, step) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0)) } else { val w1 = windowSize - 1 // Get partition sizes and first w1 elements. val (sizes, heads) = parent.mapPartitions { iter => val w1Array = iter.take(w1).toArray Iterator.single((w1Array.length + iter.length, w1Array)) }.collect().unzip val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]] var i = 0 var cumSize = 0 var partitionIndex = 0 while (i < n) { val mod = cumSize % step val offset = if (mod == 0) 0 else step - mod val size = sizes(i) if (offset < size) { val tail = mutable.ListBuffer.empty[T] // Keep appending to the current tail until it has w1 elements. var j = i + 1 while (j < n && tail.length < w1) { tail ++= heads(j).take(w1 - tail.length) j += 1 } if (sizes(i) + tail.length >= offset + windowSize) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset) partitionIndex += 1 } } cumSize += size i += 1 } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
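SlidingRDD is normally reached through mllib's RDDFunctions.sliding implicit rather than constructed directly. A minimal sketch (local master, made-up data; assumes a Spark version whose sliding accepts a step argument, as the class above does):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.rdd.RDDFunctions._

object SlidingDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sliding").setMaster("local[*]"))

    // sliding(windowSize, step) builds a SlidingRDD over the parent.
    val windows = sc.parallelize(1 to 10, 3).sliding(3, 1)
    windows.collect().foreach(w => println(w.mkString("[", ",", "]")))
    // [1,2,3], [2,3,4], ..., [8,9,10]; partial windows at the end are dropped.

    sc.stop()
  }
}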
Example 56
Source File: StateStoreRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration class StateStoreRDD[T: ClassTag, U: ClassTag]( dataRDD: RDD[T], storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U], checkpointLocation: String, operatorId: Long, storeVersion: Long, keySchema: StructType, valueSchema: StructType, sessionState: SessionState, @transient private val storeCoordinator: Option[StateStoreCoordinatorRef]) extends RDD[U](dataRDD) { private val storeConf = new StateStoreConf(sessionState.conf) // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it private val confBroadcast = dataRDD.context.broadcast( new SerializableConfiguration(sessionState.newHadoopConf())) override protected def getPartitions: Array[Partition] = dataRDD.partitions override def getPreferredLocations(partition: Partition): Seq[String] = { val storeId = StateStoreId(checkpointLocation, operatorId, partition.index) storeCoordinator.flatMap(_.getLocation(storeId)).toSeq } override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = { var store: StateStore = null val storeId = StateStoreId(checkpointLocation, operatorId, partition.index) store = StateStore.get( storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value) val inputIter = dataRDD.iterator(partition, ctxt) storeUpdateFunction(store, inputIter) } }
Example 57
Source File: WholeTextFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.{Text, Writable} import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.WholeTextFileInputFormat private[spark] class WholeTextFileRDD( sc : SparkContext, inputFormatClass: Class[_ <: WholeTextFileInputFormat], keyClass: Class[Text], valueClass: Class[Text], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
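WholeTextFileRDD backs SparkContext.wholeTextFiles, which returns one (path, fullContent) record per file rather than one record per line. A hedged sketch; the directory is a placeholder:

import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFilesDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("whole-text").setMaster("local[*]"))

    // One record per file; minPartitions is a hint passed to the input format.
    val docs = sc.wholeTextFiles("hdfs:///data/articles", minPartitions = 4)
    val lineCounts = docs.mapValues(_.split("\n").length)
    lineCounts.collect().foreach { case (path, n) => println(s"$path: $n lines") }

    sc.stop()
  }
}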
Example 58
Source File: ZippedWithIndexRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1 // do not need to count the last partition ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] val parentIter = firstParent[T].iterator(split.prev, context) Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } }
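The public entry point for this class is RDD.zipWithIndex. Note that computing startIndices runs a Spark job over all but the last partition as soon as the RDD is built, before any action is called. A minimal local-mode sketch:

import org.apache.spark.{SparkConf, SparkContext}

object ZipWithIndexDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("zip-index").setMaster("local[*]"))

    val words = sc.parallelize(Seq("a", "b", "c", "d", "e"), 2)
    // zipWithIndex constructs a ZippedWithIndexRDD and eagerly counts the
    // earlier partitions to compute each partition's starting index.
    val indexed = words.zipWithIndex()
    indexed.collect().foreach(println) // (a,0), (b,1), (c,2), (d,3), (e,4)

    sc.stop()
  }
}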
Example 59
Source File: UnionRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 60
Source File: PartitionwiseSampledRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 61
Source File: PartitionerAwareUnionRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 62
Source File: BinaryFileRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.StreamFileInputFormat private[spark] class BinaryFileRDD[T]( @transient private val sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(sc, jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
Example 63
Source File: PartitionPruningRDDSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
Example 64
Source File: SampledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
Example 65
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions._ import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.Dependency import org.apache.spark.OneToOneDependency import org.apache.spark.Partition import org.apache.spark.Partitioner import org.apache.spark.ShuffleDependency import org.apache.spark.SparkEnv import org.apache.spark.TaskContext import org.apache.spark.serializer.Serializer def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = { this.serializer = Option(serializer) this } override def getDependencies: Seq[Dependency[_]] = { Seq(rdd1, rdd2).map { rdd => if (rdd.partitioner == Some(part)) { logDebug("Adding one-to-one dependency with " + rdd) new OneToOneDependency(rdd) } else { logDebug("Adding shuffle dependency with " + rdd) new ShuffleDependency(rdd, part, serializer) } } } override def getPartitions: Array[Partition] = { val array = new Array[Partition](part.numPartitions) for (i <- 0 until array.size) { // Each CoGroupPartition will depend on rdd1 and rdd2 array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) => dependencies(j) match { case s: ShuffleDependency[_, _, _] => new ShuffleCoGroupSplitDep(s.shuffleHandle) case _ => new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)) } }.toArray) } array } override val partitioner = Some(part) override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = { val partition = p.asInstanceOf[CoGroupPartition] val map = new JHashMap[K, ArrayBuffer[V]] def getSeq(k: K): ArrayBuffer[V] = { val seq = map.get(k) if (seq != null) { seq } else { val seq = new ArrayBuffer[V]() map.put(k, seq) seq } } def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match { case NarrowCoGroupSplitDep(rdd, _, itsSplit) => rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op) case ShuffleCoGroupSplitDep(handle) => val iter = SparkEnv.get.shuffleManager .getReader(handle, partition.index, partition.index + 1, context) .read() iter.foreach(op) } // the first dep is rdd1; add all values to the map integrate(partition.deps(0), t => getSeq(t._1) += t._2) // the second dep is rdd2; remove all of its keys integrate(partition.deps(1), t => map.remove(t._1)) map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null } }
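SubtractedRDD is what PairRDDFunctions.subtractByKey builds. A minimal sketch of that public API (not part of the source above), using small in-memory pair RDDs:

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object SubtractByKeyExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("subtract-by-key").setMaster("local[2]"))
    val left  = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3))
    val right = sc.parallelize(Seq("b" -> true))
    // subtractByKey drops every key that also appears in `right`; supplying a
    // partitioner lets the dependency stay narrow when the parents already use it.
    val diff = left.subtractByKey(right, new HashPartitioner(2))
    diff.collect().foreach(println) // expected: (a,1) and (c,3)
    sc.stop()
  }
}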
Example 66
Source File: ZippedWithIndexRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.size if (n == 0) { Array[Long]() } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1, // do not need to count the last partition allowLocal = false ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => (x._1, split.startIndex + x._2) } } }
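ZippedWithIndexRDD is returned by RDD.zipWithIndex. A minimal sketch (not from the source above); note that the startIndices computation shown above is why zipWithIndex triggers one extra Spark job before the main action:

import org.apache.spark.{SparkConf, SparkContext}

object ZipWithIndexExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("zip-with-index").setMaster("local[2]"))
    val words = sc.parallelize(Seq("spark", "rdd", "partition"), numSlices = 2)
    // zipWithIndex first runs a small job to count all but the last partition,
    // then assigns each element a global, partition-ordered index.
    val indexed = words.zipWithIndex()
    indexed.collect().foreach { case (word, idx) => println(s"$idx: $word") }
    sc.stop()
  }
}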
Example 67
Source File: UnionRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient rdd: RDD[T], val parentRddIndex: Int, @transient parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations() = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.size).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.size) pos += rdd.partitions.size } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
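A UnionRDD is normally created through RDD.union, the ++ operator, or SparkContext.union rather than directly. A minimal sketch (placeholder data), illustrating that without a common partitioner the result's partition count is the sum of the parents':

import org.apache.spark.{SparkConf, SparkContext}

object UnionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("union").setMaster("local[2]"))
    val first  = sc.parallelize(1 to 3, 2)
    val second = sc.parallelize(4 to 6, 3)
    // No shuffle happens: each UnionPartition simply delegates to one parent partition,
    // so the result has 2 + 3 = 5 partitions.
    val combined = first.union(second)
    println(s"partitions: ${combined.partitions.length}, elements: ${combined.collect().mkString(",")}")
    sc.stop()
  }
}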
Example 68
Source File: PartitionwiseSampledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
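PartitionwiseSampledRDD is the RDD behind RDD.sample. A minimal sketch (not from the source above); the fraction and seed are placeholders:

import org.apache.spark.{SparkConf, SparkContext}

object SampleExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sample").setMaster("local[2]"))
    val numbers = sc.parallelize(1 to 1000, 4)
    // sample() clones the sampler per partition and seeds it from the partition-level
    // seed, so a fixed seed gives reproducible results across runs.
    val sampled = numbers.sample(withReplacement = false, fraction = 0.1, seed = 42L)
    println(s"sampled ${sampled.count()} of ${numbers.count()} elements")
    sc.stop()
  }
}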
Example 69
Source File: PartitionerAwareUnionRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map(index => { new PartitionerAwareUnionRDDPartition(rdds, index) }).toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => { val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
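A sketch of when this class comes into play, under the assumption (true in Spark versions that ship this class) that SparkContext.union detects a shared partitioner and switches to the partitioner-aware union instead of a plain UnionRDD:

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object PartitionerAwareUnionExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("pa-union").setMaster("local[2]"))
    val partitioner = new HashPartitioner(4)
    val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(partitioner)
    val b = sc.parallelize(Seq(3 -> "c", 4 -> "d")).partitionBy(partitioner)
    // Because both parents share the same partitioner, the union can keep
    // 4 partitions (one per hash bucket) and retain the partitioner,
    // instead of producing 4 + 4 = 8 partitions with no partitioner.
    val unioned = sc.union(Seq(a, b))
    println(s"partitions: ${unioned.partitions.length}, partitioner: ${unioned.partitioner}")
    sc.stop()
  }
}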
Example 70
Source File: BinaryFileRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{ Configurable, Configuration } import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.spark.input.StreamFileInputFormat import org.apache.spark.{ Partition, SparkContext } private[spark] class BinaryFileRDD[T]( sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], @transient conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
Example 71
Source File: PartitionPruningRDDSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.scalatest.FunSuite import org.apache.spark.{Partition, SharedSparkContext, TaskContext} class PartitionPruningRDDSuite extends FunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index = i def testValue = this.value }
Example 72
Source File: CarbonDropPartitionRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import java.util import scala.collection.JavaConverters._ import org.apache.spark.{Partition, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.carbondata.core.index.Segment import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.metadata.SegmentFileStore case class CarbonDropPartition(rddId: Int, idx: Int, segment: Segment) extends Partition { override val index: Int = idx override def hashCode(): Int = 41 * (41 + rddId) + idx } class CarbonDropPartitionRDD( @transient private val ss: SparkSession, tablePath: String, segments: Seq[Segment], partitions: util.List[PartitionSpec], uniqueId: String) extends CarbonRDD[(String, String)](ss, Nil) { override def internalGetPartitions: Array[Partition] = { segments.zipWithIndex.map {s => CarbonDropPartition(id, s._2, s._1) }.toArray } override def internalCompute( theSplit: Partition, context: TaskContext): Iterator[(String, String)] = { val iter = new Iterator[(String, String)] { val split = theSplit.asInstanceOf[CarbonDropPartition] logInfo("Dropping partition information from : " + split.segment) val toBeDeletedSegments = new util.ArrayList[String]() val toBeUpdateSegments = new util.ArrayList[String]() new SegmentFileStore( tablePath, split.segment.getSegmentFileName).dropPartitions( split.segment, partitions, uniqueId, toBeDeletedSegments, toBeUpdateSegments) var finished = false override def hasNext: Boolean = { !finished } override def next(): (String, String) = { finished = true (toBeUpdateSegments.asScala.mkString(","), toBeDeletedSegments.asScala.mkString(",")) } } iter } }
Example 73
Source File: CarbonDeltaRowScanRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.conf.Configuration import org.apache.spark.Partition import org.apache.spark.sql.SparkSession import org.apache.carbondata.converter.SparkDataTypeConverterImpl import org.apache.carbondata.core.index.IndexFilter import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier import org.apache.carbondata.core.metadata.schema.table.{CarbonTable, TableInfo} import org.apache.carbondata.core.mutate.CarbonUpdateUtil import org.apache.carbondata.core.statusmanager.SegmentUpdateStatusManager import org.apache.carbondata.core.util.DataTypeConverter import org.apache.carbondata.hadoop.{CarbonMultiBlockSplit, CarbonProjection} import org.apache.carbondata.hadoop.api.CarbonTableInputFormat import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport import org.apache.carbondata.spark.InitInputMetrics class CarbonDeltaRowScanRDD[T: ClassTag]( @transient private val spark: SparkSession, @transient private val serializedTableInfo: Array[Byte], @transient private val tableInfo: TableInfo, @transient override val partitionNames: Seq[PartitionSpec], override val columnProjection: CarbonProjection, var filter: IndexFilter, identifier: AbsoluteTableIdentifier, inputMetricsStats: InitInputMetrics, override val dataTypeConverterClz: Class[_ <: DataTypeConverter] = classOf[SparkDataTypeConverterImpl], override val readSupportClz: Class[_ <: CarbonReadSupport[_]] = SparkReadSupport.readSupportClass, deltaVersionToRead: String) extends CarbonScanRDD[T]( spark, columnProjection, filter, identifier, serializedTableInfo, tableInfo, inputMetricsStats, partitionNames, dataTypeConverterClz, readSupportClz) { override def internalGetPartitions: Array[Partition] = { val table = CarbonTable.buildFromTableInfo(getTableInfo) val updateStatusManager = new SegmentUpdateStatusManager(table, deltaVersionToRead) val parts = super.internalGetPartitions parts.map { p => val partition = p.asInstanceOf[CarbonSparkPartition] val splits = partition.multiBlockSplit.getAllSplits.asScala.filter { s => updateStatusManager.getDetailsForABlock( CarbonUpdateUtil.getSegmentBlockNameKey(s.getSegmentId, s.getBlockPath, table.isHivePartitionTable)) != null }.asJava new CarbonSparkPartition(partition.rddId, partition.index, new CarbonMultiBlockSplit(splits, partition.multiBlockSplit.getLocations)) }.filter(p => p.multiBlockSplit.getAllSplits.size() > 0).zipWithIndex.map{ case (p, index) => new CarbonSparkPartition(p.rddId, index, p.multiBlockSplit) }.asInstanceOf[Array[Partition]] } override def createInputFormat(conf: Configuration): CarbonTableInputFormat[Object] = { val format = super.createInputFormat(conf) conf.set("updateDeltaVersion", deltaVersionToRead) conf.set("readDeltaOnly", "true") format } }
Example 74
Source File: CarbonRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.conf.Configuration import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.util.SparkSQLUtil import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.schema.table.TableInfo import org.apache.carbondata.core.util._ abstract class CarbonRDDWithTableInfo[T: ClassTag]( @transient private val ss: SparkSession, @transient private var deps: Seq[Dependency[_]], serializedTableInfo: Array[Byte]) extends CarbonRDD[T](ss, deps) { def this(@transient sparkSession: SparkSession, @transient oneParent: RDD[_], serializedTableInfo: Array[Byte]) = { this (sparkSession, List(new OneToOneDependency(oneParent)), serializedTableInfo) } def getTableInfo: TableInfo = TableInfo.deserialize(serializedTableInfo) }
Example 75
Source File: QueryTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.rdd import scala.collection.JavaConverters._ import org.apache.hadoop.mapreduce.RecordReader import org.apache.spark.{Partition, TaskContext} import org.apache.spark.sql.carbondata.execution.datasources.tasklisteners.CarbonQueryTaskCompletionListener import org.apache.spark.sql.profiler.{Profiler, QueryTaskEnd} import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.memory.UnsafeMemoryManager import org.apache.carbondata.core.stats.{QueryStatistic, QueryStatisticsConstants, QueryStatisticsRecorder} import org.apache.carbondata.core.util.{DataTypeUtil, TaskMetricsMap, ThreadLocalTaskInfo} import org.apache.carbondata.spark.InitInputMetrics class QueryTaskCompletionListener(freeMemory: Boolean, var reader: RecordReader[Void, Object], inputMetricsStats: InitInputMetrics, executionId: String, taskId: Int, queryStartTime: Long, queryStatisticsRecorder: QueryStatisticsRecorder, split: Partition, queryId: String) extends CarbonQueryTaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = { if (reader != null) { try { reader.close() } catch { case e: Exception => LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e) } reader = null } TaskMetricsMap.getInstance().updateReadBytes(Thread.currentThread().getId) inputMetricsStats.updateAndClose() logStatistics(executionId, taskId, queryStartTime, queryStatisticsRecorder, split) if (freeMemory) { UnsafeMemoryManager.INSTANCE .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId) ThreadLocalTaskInfo.clearCarbonTaskInfo() DataTypeUtil.clearFormatter() } } def logStatistics( executionId: String, taskId: Long, queryStartTime: Long, recorder: QueryStatisticsRecorder, split: Partition ): Unit = { if (null != recorder) { val queryStatistic = new QueryStatistic() queryStatistic.addFixedTimeStatistic(QueryStatisticsConstants.EXECUTOR_PART, System.currentTimeMillis - queryStartTime) recorder.recordStatistics(queryStatistic) // print executor query statistics for each task_id val statistics = recorder.statisticsForTask(taskId, queryStartTime) if (statistics != null && executionId != null) { Profiler.invokeIfEnable { val inputSplit = split.asInstanceOf[CarbonSparkPartition].split.value inputSplit.calculateLength() val size = inputSplit.getLength val files = inputSplit.getAllSplits.asScala.map { s => s.getSegmentId + "/" + s.getPath.getName }.toArray[String] Profiler.send( QueryTaskEnd( executionId.toLong, queryId, statistics.getValues, size, files ) ) } } recorder.logStatisticsForTask(statistics) } } }
Example 76
Source File: CarbonMergeBloomIndexFilesRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.index import scala.collection.JavaConverters._ import org.apache.spark.Partition import org.apache.spark.rdd.CarbonMergeFilePartition import org.apache.spark.sql.SparkSession import org.apache.spark.TaskContext import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.core.util.path.CarbonTablePath import org.apache.carbondata.index.bloom.BloomIndexFileStore import org.apache.carbondata.spark.rdd.CarbonRDD class CarbonMergeBloomIndexFilesRDD( @transient private val ss: SparkSession, carbonTable: CarbonTable, segmentIds: Seq[String], bloomIndexNames: Seq[String], bloomIndexColumns: Seq[Seq[String]]) extends CarbonRDD[String](ss, Nil) { override def internalGetPartitions: Array[Partition] = { segmentIds.zipWithIndex.map {s => CarbonMergeFilePartition(id, s._2, s._1) }.toArray } override def internalCompute(theSplit: Partition, context: TaskContext): Iterator[String] = { val tablePath = carbonTable.getTablePath val split = theSplit.asInstanceOf[CarbonMergeFilePartition] logInfo("Merging bloom index files of " + s"segment ${split.segmentId} for ${carbonTable.getTableName}") bloomIndexNames.zipWithIndex.map( dm => { val dmSegmentPath = CarbonTablePath.getIndexesStorePath( tablePath, split.segmentId, dm._1) BloomIndexFileStore.mergeBloomIndexFile(dmSegmentPath, bloomIndexColumns(dm._2).asJava) }) val iter = new Iterator[String] { var havePair = false var finished = false override def hasNext: Boolean = { if (!finished && !havePair) { finished = true havePair = !finished } !finished } override def next(): String = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } havePair = false "" } } iter } }
Example 77
Source File: SegmentPruneRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import scala.collection.JavaConverters._ import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager} import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper import org.apache.carbondata.core.indexstore.SegmentWrapper import org.apache.carbondata.spark.rdd.CarbonRDD class SegmentPruneRDD(@transient private val ss: SparkSession, indexInputFormat: IndexInputFormat) extends CarbonRDD[(String, SegmentWrapper)](ss, Nil) { override protected def getPreferredLocations(split: Partition): Seq[String] = { val locations = split.asInstanceOf[IndexRDDPartition].getLocations if (locations != null) { locations.toSeq } else { Seq() } } override protected def internalGetPartitions: Array[Partition] = { new DistributedPruneRDD(ss, indexInputFormat).partitions } override def internalCompute(split: Partition, context: TaskContext): Iterator[(String, SegmentWrapper)] = { val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit val segments = inputSplits.map(_ .asInstanceOf[IndexInputSplitWrapper].getDistributable.getSegment) segments.foreach(_.setReadCommittedScope(indexInputFormat.getReadCommittedScope)) if (indexInputFormat.getInvalidSegments.size > 0) { // clear the segmentMap and from cache in executor when there are invalid segments IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable, indexInputFormat.getInvalidSegments) } val blockletMap = IndexStoreManager.getInstance .getDefaultIndex(indexInputFormat.getCarbonTable) val prunedSegments = blockletMap .pruneSegments(segments.toList.asJava, indexInputFormat.getFilterResolverIntf) val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) { CacheProvider.getInstance().getCarbonCache.getCurrentSize } else { 0L } val value = (executorIP + "_" + cacheSize.toString, new SegmentWrapper(prunedSegments)) Iterator(value) } }
Example 78
Source File: DistributedCountRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import java.util.concurrent.Executors import scala.collection.JavaConverters._ import scala.concurrent.{Await, ExecutionContext, ExecutionContextExecutor, Future} import scala.concurrent.duration.Duration import org.apache.hadoop.mapred.TaskAttemptID import org.apache.hadoop.mapreduce.{InputSplit, TaskType} import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.cache.CacheProvider import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.index.{IndexInputFormat, IndexStoreManager} import org.apache.carbondata.core.index.dev.expr.IndexInputSplitWrapper import org.apache.carbondata.core.util.{CarbonProperties, CarbonThreadFactory} import org.apache.carbondata.spark.rdd.CarbonRDD class DistributedCountRDD(@transient ss: SparkSession, indexInputFormat: IndexInputFormat) extends CarbonRDD[(String, String)](ss, Nil) { @transient private val LOGGER = LogServiceFactory.getLogService(classOf[DistributedPruneRDD] .getName) override protected def getPreferredLocations(split: Partition): Seq[String] = { if (split.asInstanceOf[IndexRDDPartition].getLocations != null) { split.asInstanceOf[IndexRDDPartition].getLocations.toSeq } else { Seq() } } override def internalCompute(split: Partition, context: TaskContext): Iterator[(String, String)] = { val attemptId = new TaskAttemptID(DistributedRDDUtils.generateTrackerId, id, TaskType.MAP, split.index, 0) val attemptContext = new TaskAttemptContextImpl(FileFactory.getConfiguration, attemptId) val inputSplits = split.asInstanceOf[IndexRDDPartition].inputSplit val numOfThreads = CarbonProperties.getInstance().getNumOfThreadsForExecutorPruning val service = Executors .newFixedThreadPool(numOfThreads, new CarbonThreadFactory("IndexPruningPool", true)) implicit val ec: ExecutionContextExecutor = ExecutionContext .fromExecutor(service) if (indexInputFormat.ifAsyncCall()) { // to clear cache of invalid segments during pre-priming in index server IndexStoreManager.getInstance().clearInvalidSegments(indexInputFormat.getCarbonTable, indexInputFormat.getInvalidSegments) } val futures = if (inputSplits.length <= numOfThreads) { inputSplits.map { split => generateFuture(Seq(split)) } } else { DistributedRDDUtils.groupSplits(inputSplits, numOfThreads).map { splits => generateFuture(splits) } } // scalastyle:off awaitresult val results = Await.result(Future.sequence(futures), Duration.Inf).flatten // scalastyle:on awaitresult val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" val cacheSize = if (CacheProvider.getInstance().getCarbonCache != null) { CacheProvider.getInstance().getCarbonCache.getCurrentSize } else { 0L } Iterator((executorIP + "_" + cacheSize.toString, results.map(_._2.toLong).sum.toString)) } override protected def internalGetPartitions: Array[Partition] = { new DistributedPruneRDD(ss, indexInputFormat).partitions } private def generateFuture(split: Seq[InputSplit]) (implicit executionContext: ExecutionContext) = { Future { val segments = split.map { inputSplit => val distributable = inputSplit.asInstanceOf[IndexInputSplitWrapper] distributable.getDistributable.getSegment .setReadCommittedScope(indexInputFormat.getReadCommittedScope) distributable.getDistributable.getSegment } val defaultIndex = 
IndexStoreManager.getInstance .getIndex(indexInputFormat.getCarbonTable, split.head .asInstanceOf[IndexInputSplitWrapper].getDistributable.getIndexSchema) defaultIndex.getBlockRowCount(defaultIndex, segments.toList.asJava, indexInputFormat .getPartitions).asScala } } }
Example 79
Source File: DistributedShowCacheRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import scala.collection.JavaConverters._ import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.DistributionUtil import org.apache.carbondata.core.index.IndexStoreManager import org.apache.carbondata.core.indexstore.blockletindex.BlockletIndexFactory import org.apache.carbondata.hadoop.CarbonInputSplit import org.apache.carbondata.spark.rdd.CarbonRDD class DistributedShowCacheRDD(@transient private val ss: SparkSession, tableUniqueId: String, executorCache: Boolean) extends CarbonRDD[String](ss, Nil) { val executorsList: Array[String] = DistributionUtil .getExecutors(ss.sparkContext).flatMap { case (host, executors) => executors.map { executor => s"executor_${ host }_$executor" } }.toArray override protected def getPreferredLocations(split: Partition): Seq[String] = { if (split.asInstanceOf[IndexRDDPartition].getLocations != null) { split.asInstanceOf[IndexRDDPartition].getLocations.toSeq } else { Seq() } } override protected def internalGetPartitions: Array[Partition] = { executorsList.zipWithIndex.map { case (executor, idx) => // create a dummy split for each executor to accumulate the cache size. val dummySplit = new CarbonInputSplit() dummySplit.setLocation(Array(executor)) new IndexRDDPartition(id, idx, List(dummySplit), Array(executor)) } } override def internalCompute(split: Partition, context: TaskContext): Iterator[String] = { val indexes = IndexStoreManager.getInstance().getTableIndexForAllTables.asScala val tableList = tableUniqueId.split(",") val iterator = indexes.collect { case (tableId, tableIndexes) if tableUniqueId.isEmpty || tableList.contains(tableId) => val sizeAndIndexLengths = tableIndexes.asScala .map { index => val indexName = if (index.getIndexFactory.isInstanceOf[BlockletIndexFactory]) { index .getIndexFactory .asInstanceOf[BlockletIndexFactory] .getCarbonTable .getTableUniqueName } else { index.getIndexSchema.getRelationIdentifier.getDatabaseName + "_" + index .getIndexSchema.getIndexName } if (executorCache) { val executorIP = s"${ SparkEnv.get.blockManager.blockManagerId.host }_${ SparkEnv.get.blockManager.blockManagerId.executorId }" s"${ executorIP }:${ index.getIndexFactory.getCacheSize }:${ index.getIndexSchema.getProviderName }" } else { s"${indexName}:${index.getIndexFactory.getCacheSize}:${ index.getIndexSchema.getProviderName }" } } sizeAndIndexLengths }.flatten.toIterator iterator } }
Example 80
Source File: InvalidateSegmentCacheRDD.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.indexserver import scala.collection.JavaConverters._ import org.apache.spark.{Partition, TaskContext} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.hive.DistributionUtil import org.apache.carbondata.core.index.IndexStoreManager import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.hadoop.CarbonInputSplit import org.apache.carbondata.spark.rdd.CarbonRDD class InvalidateSegmentCacheRDD(@transient private val ss: SparkSession, carbonTable: CarbonTable, invalidSegmentIds: List[String]) extends CarbonRDD[String](ss, Nil) { val executorsList: Array[String] = DistributionUtil.getExecutors(ss.sparkContext).flatMap { case (host, executors) => executors.map { executor => s"executor_${host}_$executor" } }.toArray override def internalCompute(split: Partition, context: TaskContext): Iterator[String] = { IndexStoreManager.getInstance().clearInvalidSegments(carbonTable, invalidSegmentIds.asJava) Iterator.empty } override protected def getPreferredLocations(split: Partition): Seq[String] = { if (split.asInstanceOf[IndexRDDPartition].getLocations != null) { split.asInstanceOf[IndexRDDPartition].getLocations.toSeq } else { Seq() } } override protected def internalGetPartitions: Array[Partition] = { if (invalidSegmentIds.isEmpty) { Array() } else { executorsList.zipWithIndex.map { case (executor, idx) => // create a dummy split for each executor to accumulate the cache size. val dummySplit = new CarbonInputSplit() dummySplit.setLocation(Array(executor)) new IndexRDDPartition(id, idx, List(dummySplit), Array(executor)) } } } }
Example 81
Source File: MemsqlRDD.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, PreparedStatement, ResultSet} import com.memsql.spark.SQLGen.VariableList import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JdbcUtils} import org.apache.spark.sql.types._ import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} case class MemsqlRDD(query: String, variables: VariableList, options: MemsqlOptions, schema: StructType, expectedOutput: Seq[Attribute], @transient val sc: SparkContext) extends RDD[Row](sc, Nil) { override protected def getPartitions: Array[Partition] = MemsqlQueryHelpers.GetPartitions(options, query, variables) override def compute(rawPartition: Partition, context: TaskContext): Iterator[Row] = { var closed = false var rs: ResultSet = null var stmt: PreparedStatement = null var conn: Connection = null var partition: MemsqlPartition = rawPartition.asInstanceOf[MemsqlPartition] def tryClose(name: String, what: AutoCloseable): Unit = { try { if (what != null) { what.close() } } catch { case e: Exception => logWarning(s"Exception closing $name", e) } } def close(): Unit = { if (closed) { return } tryClose("resultset", rs) tryClose("statement", stmt) tryClose("connection", conn) closed = true } context.addTaskCompletionListener { context => close() } conn = JdbcUtils.createConnectionFactory(partition.connectionInfo)() stmt = conn.prepareStatement(partition.query) JdbcHelpers.fillStatement(stmt, partition.variables) rs = stmt.executeQuery() var rowsIter = JdbcUtils.resultSetToRows(rs, schema) if (expectedOutput.nonEmpty) { val schemaDatatypes = schema.map(_.dataType) val expectedDatatypes = expectedOutput.map(_.dataType) if (schemaDatatypes != expectedDatatypes) { val columnEncoders = schemaDatatypes.zip(expectedDatatypes).zipWithIndex.map { case ((_: StringType, _: NullType), _) => ((_: Row) => null) case ((_: ShortType, _: BooleanType), i) => ((r: Row) => r.getShort(i) != 0) case ((_: IntegerType, _: BooleanType), i) => ((r: Row) => r.getInt(i) != 0) case ((_: LongType, _: BooleanType), i) => ((r: Row) => r.getLong(i) != 0) case ((l, r), i) => { options.assert(l == r, s"MemsqlRDD: unable to encode ${l} into ${r}") ((r: Row) => r.get(i)) } } rowsIter = rowsIter .map(row => Row.fromSeq(columnEncoders.map(_(row)))) } } CompletionIterator[Row, Iterator[Row]](new InterruptibleIterator[Row](context, rowsIter), close) } }
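MemsqlRDD is computed when reading through the connector's data source. The sketch below is an assumption-heavy illustration only: it presumes the connector jar is on the classpath, that it registers under the "memsql" format name, and that the option keys and the database/table names shown are placeholders that may differ by connector version.

import org.apache.spark.sql.SparkSession

object MemsqlReadExample {
  def main(args: Array[String]): Unit = {
    // Host, credentials, and table name are placeholders.
    val spark = SparkSession.builder()
      .appName("memsql-read")
      .master("local[2]")
      .config("spark.datasource.memsql.ddlEndpoint", "memsql-host:3306")
      .config("spark.datasource.memsql.user", "root")
      .getOrCreate()

    // Each query partition produced by the connector is computed by a MemsqlRDD partition.
    val df = spark.read.format("memsql").load("example_db.example_table")
    df.show(10)
    spark.stop()
  }
}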
Example 82
Source File: DatasourceRDD.scala From datasource-receiver with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.datasource.receiver import org.apache.spark.partial.{BoundedDouble, CountEvaluator, PartialResult} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.streaming.datasource.config.ParametersUtils import org.apache.spark.streaming.datasource.models.{InputSentences, OffsetOperator} import org.apache.spark.{Logging, Partition, TaskContext} private[datasource] class DatasourceRDD( @transient sqlContext: SQLContext, inputSentences: InputSentences, datasourceParams: Map[String, String] ) extends RDD[Row](sqlContext.sparkContext, Nil) with Logging with ParametersUtils { private var totalCalculated: Option[Long] = None private val InitTableName = "initTable" private val LimitedTableName = "limitedTable" private val TempInitQuery = s"select * from $InitTableName" val dataFrame = inputSentences.offsetConditions.fold(sqlContext.sql(inputSentences.query)) { case offset => val parsedQuery = parseInitialQuery val conditionsSentence = offset.fromOffset.extractConditionSentence(parsedQuery) val orderSentence = offset.fromOffset.extractOrderSentence(parsedQuery, inverse = offset.limitRecords.isEmpty) val limitSentence = inputSentences.extractLimitSentence sqlContext.sql(parsedQuery + conditionsSentence + orderSentence + limitSentence) } private def parseInitialQuery: String = { if (inputSentences.query.toUpperCase.contains("WHERE") || inputSentences.query.toUpperCase.contains("ORDER") || inputSentences.query.toUpperCase.contains("LIMIT") ) { sqlContext.sql(inputSentences.query).registerTempTable(InitTableName) TempInitQuery } else inputSentences.query } def progressInputSentences: InputSentences = { if (!dataFrame.rdd.isEmpty()) { inputSentences.offsetConditions.fold(inputSentences) { case offset => val offsetValue = if (offset.limitRecords.isEmpty) dataFrame.rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) else { dataFrame.registerTempTable(LimitedTableName) val limitedQuery = s"select * from $LimitedTableName order by ${offset.fromOffset.name} " + s"${OffsetOperator.toInverseOrderOperator(offset.fromOffset.operator)} limit 1" sqlContext.sql(limitedQuery).rdd.first().get(dataFrame.schema.fieldIndex(offset.fromOffset.name)) } inputSentences.copy(offsetConditions = Option(offset.copy(fromOffset = offset.fromOffset.copy( value = Option(offsetValue), operator = OffsetOperator.toProgressOperator(offset.fromOffset.operator))))) } } else inputSentences } override def isEmpty(): Boolean = { totalCalculated.fold { withScope { partitions.length == 0 || take(1).length == 0 } } { total => total == 0L } } override def getPartitions: Array[Partition] = dataFrame.rdd.partitions override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = dataFrame.rdd.compute(thePart, context) override def getPreferredLocations(thePart: Partition): Seq[String] = dataFrame.rdd.preferredLocations(thePart) }
Example 83
Source File: NetezzaRDD.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza import java.sql.Connection import java.util.Properties import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.{Partition, SparkContext, TaskContext} override def compute(thePart: Partition, context: TaskContext): Iterator[Row] = new Iterator[Row] { var closed = false var finished = false var gotNext = false var nextValue: Row = null context.addTaskCompletionListener { context => close() } val part = thePart.asInstanceOf[NetezzaPartition] val conn = getConnection() val reader = new NetezzaDataReader(conn, table, columns, filters, part, schema) reader.startExternalTableDataUnload() def getNext(): Row = { if (reader.hasNext) { reader.next() } else { finished = true null.asInstanceOf[Row] } } def close() { if (closed) return try { if (null != reader) { reader.close() } } catch { case e: Exception => logWarning("Exception closing Netezza record reader", e) } try { if (null != conn) { conn.close() } logInfo("closed connection") } catch { case e: Exception => logWarning("Exception closing connection", e) } } override def hasNext: Boolean = { if (!finished) { if (!gotNext) { nextValue = getNext() if (finished) { close() } gotNext = true } } !finished } override def next(): Row = { if (!hasNext) { throw new NoSuchElementException("End of stream") } gotNext = false nextValue } } }
Example 84
Source File: ReorderedPartitionsRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import is.hail.utils.FastSeq import org.apache.spark.rdd.RDD import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext} import scala.reflect.ClassTag case class ReorderedPartitionsRDDPartition(index: Int, oldPartition: Partition) extends Partition class ReorderedPartitionsRDD[T](@transient var prev: RDD[T], @transient val oldIndices: Array[Int])(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) { override def getPartitions: Array[Partition] = { val parentPartitions = dependencies.head.rdd.asInstanceOf[RDD[T]].partitions Array.tabulate(oldIndices.length) { i => val oldIndex = oldIndices(i) val oldPartition = parentPartitions(oldIndex) ReorderedPartitionsRDDPartition(i, oldPartition) } } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val parent = dependencies.head.rdd.asInstanceOf[RDD[T]] parent.compute(split.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition, context) } override def getDependencies: Seq[Dependency[_]] = FastSeq(new NarrowDependency[T](prev) { override def getParents(partitionId: Int): Seq[Int] = FastSeq(oldIndices(partitionId)) }) override def clearDependencies() { super.clearDependencies() prev = null } override def getPreferredLocations(partition: Partition): Seq[String] = prev.preferredLocations(partition.asInstanceOf[ReorderedPartitionsRDDPartition].oldPartition) }
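A minimal sketch of constructing this class directly, based only on the constructor shown above; the permutation and the parallelized data are placeholders:

import is.hail.sparkextras.ReorderedPartitionsRDD
import org.apache.spark.{SparkConf, SparkContext}

object ReorderedExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("reordered").setMaster("local[2]"))
    val parent = sc.parallelize(0 until 6, numSlices = 3) // parent partitions 0, 1, 2
    // oldIndices maps new partition i to old partition oldIndices(i),
    // so the partitions below come out in the order 2, 0, 1.
    val reordered = new ReorderedPartitionsRDD(parent, Array(2, 0, 1))
    reordered.glom().collect().foreach(part => println(part.mkString(",")))
    sc.stop()
  }
}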
Example 85
Source File: MapPartitionsWithValueRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, TaskContext} import scala.annotation.meta.param import scala.reflect.ClassTag case class MapPartitionsWithValueRDDPartition[V]( parentPartition: Partition, value: V) extends Partition { def index: Int = parentPartition.index } class MapPartitionsWithValueRDD[T: ClassTag, U: ClassTag, V]( var prev: RDD[T], @(transient @param) values: Array[V], f: (Int, V, Iterator[T]) => Iterator[U], preservesPartitioning: Boolean) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(p => MapPartitionsWithValueRDDPartition(p, values(p.index))) } override def compute(split: Partition, context: TaskContext): Iterator[U] = { val p = split.asInstanceOf[MapPartitionsWithValueRDDPartition[V]] f(split.index, p.value, firstParent[T].iterator(p.parentPartition, context)) } override def clearDependencies() { super.clearDependencies() prev = null } }
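A minimal sketch wiring one value per partition into the map function, following the constructor shown above; the offsets array and the data are placeholders:

import is.hail.sparkextras.MapPartitionsWithValueRDD
import org.apache.spark.{SparkConf, SparkContext}

object MapWithValueExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("map-with-value").setMaster("local[2]"))
    val parent = sc.parallelize(1 to 6, numSlices = 3)
    val offsets = Array(100, 200, 300) // exactly one value per parent partition
    // Partition i receives offsets(i) alongside its iterator; here each element
    // is shifted by its partition's offset.
    val shifted = new MapPartitionsWithValueRDD[Int, Int, Int](
      parent, offsets, (idx, offset, it) => it.map(_ + offset), preservesPartitioning = false)
    println(shifted.collect().mkString(","))
    sc.stop()
  }
}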
Example 86
Source File: BlockedRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import is.hail.utils._ import org.apache.spark.rdd.RDD import org.apache.spark.{Dependency, NarrowDependency, Partition, TaskContext} import scala.language.existentials import scala.reflect.ClassTag case class BlockedRDDPartition(@transient rdd: RDD[_], index: Int, first: Int, last: Int) extends Partition { require(first <= last) val parentPartitions: Array[Partition] = range.map(rdd.partitions).toArray def range: Range = first to last } class BlockedRDD[T](@transient var prev: RDD[T], @transient val partFirst: Array[Int], @transient val partLast: Array[Int] )(implicit tct: ClassTag[T]) extends RDD[T](prev.sparkContext, Nil) { assert(partFirst.length == partLast.length) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](partFirst.length)(i => BlockedRDDPartition(prev, i, partFirst(i), partLast(i))) } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val parent = dependencies.head.rdd.asInstanceOf[RDD[T]] split.asInstanceOf[BlockedRDDPartition].parentPartitions.iterator.flatMap(p => parent.iterator(p, context)) } override def getDependencies: Seq[Dependency[_]] = { FastSeq(new NarrowDependency(prev) { def getParents(id: Int): Seq[Int] = partitions(id).asInstanceOf[BlockedRDDPartition].range }) } override def clearDependencies() { super.clearDependencies() prev = null } override def getPreferredLocations(partition: Partition): Seq[String] = { val prevPartitions = prev.partitions val range = partition.asInstanceOf[BlockedRDDPartition].range val locationAvail = range.flatMap(i => prev.preferredLocations(prevPartitions(i))) .groupBy(identity) .mapValues(_.length) if (locationAvail.isEmpty) return FastSeq.empty[String] val m = locationAvail.values.max locationAvail.filter(_._2 == m) .keys .toFastSeq } }
Example 87
Source File: IndexReadRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import is.hail.backend.spark.SparkBackend import is.hail.utils.Interval import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag case class IndexedFilePartition(index: Int, file: String, bounds: Option[Interval]) extends Partition class IndexReadRDD[T: ClassTag]( @transient val partFiles: Array[String], @transient val intervalBounds: Option[Array[Interval]], f: (IndexedFilePartition, TaskContext) => T ) extends RDD[T](SparkBackend.sparkContext("IndexReadRDD"), Nil) { def getPartitions: Array[Partition] = Array.tabulate(partFiles.length) { i => IndexedFilePartition(i, partFiles(i), intervalBounds.map(_(i))) } override def compute( split: Partition, context: TaskContext ): Iterator[T] = { Iterator.single(f(split.asInstanceOf[IndexedFilePartition], context)) } }
Example 88
Source File: MultiWayZipPartitionsRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import org.apache.spark.rdd.RDD import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import scala.reflect.ClassTag object MultiWayZipPartitionsRDD { def apply[T: ClassTag , V: ClassTag]( rdds: IndexedSeq[RDD[T]] )(f: (Array[Iterator[T]]) => Iterator[V]): MultiWayZipPartitionsRDD[T, V] = { new MultiWayZipPartitionsRDD(rdds.head.sparkContext, rdds, f) } } private case class MultiWayZipPartition(val index: Int, val partitions: IndexedSeq[Partition]) extends Partition class MultiWayZipPartitionsRDD[T: ClassTag, V: ClassTag]( sc: SparkContext, var rdds: IndexedSeq[RDD[T]], var f: (Array[Iterator[T]]) => Iterator[V] ) extends RDD[V](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) private val numParts = rdds(0).partitions.length require(rdds.forall(rdd => rdd.partitions.length == numParts)) override val partitioner = None override def getPartitions: Array[Partition] = { Array.tabulate[Partition](numParts) { i => MultiWayZipPartition(i, rdds.map(rdd => rdd.partitions(i))) } } override def compute(s: Partition, tc: TaskContext) = { val partitions = s.asInstanceOf[MultiWayZipPartition].partitions val arr = Array.tabulate(rdds.length)(i => rdds(i).iterator(partitions(i), tc)) f(arr) } override def clearDependencies() { super.clearDependencies rdds = null f = null } }
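A minimal sketch using the companion apply shown above to zip three equally partitioned RDDs; the inputs are placeholders:

import is.hail.sparkextras.MultiWayZipPartitionsRDD
import org.apache.spark.{SparkConf, SparkContext}

object MultiWayZipExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("multiway-zip").setMaster("local[2]"))
    // All inputs must have the same number of partitions (enforced by the require above).
    val rdds = IndexedSeq(
      sc.parallelize(1 to 4, 2),
      sc.parallelize(11 to 14, 2),
      sc.parallelize(21 to 24, 2))
    // The function receives one iterator per input RDD for the same partition index.
    val summed = MultiWayZipPartitionsRDD(rdds) { iters =>
      iters(0).zip(iters(1)).zip(iters(2)).map { case ((a, b), c) => a + b + c }
    }
    println(summed.collect().mkString(","))
    sc.stop()
  }
}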
Example 89
Source File: OriginUnionRDD.scala From hail with MIT License | 5 votes |
package is.hail.sparkextras import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag private[hail] class OriginUnionPartition( val index: Int, val originIdx: Int, val originPart: Partition ) extends Partition class OriginUnionRDD[T: ClassTag, S: ClassTag]( sc: SparkContext, var rdds: IndexedSeq[RDD[T]], f: (Int, Int, Iterator[T]) => Iterator[S] ) extends RDD[S](sc, Nil) { override def getPartitions: Array[Partition] = { val arr = new Array[Partition](rdds.map(_.partitions.length).sum) var i = 0 for ((rdd, rddIdx) <- rdds.zipWithIndex; part <- rdd.partitions) { arr(i) = new OriginUnionPartition(i, rddIdx, part) i += 1 } arr } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var i = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, i, rdd.partitions.length) i += rdd.partitions.length } deps } override def compute(s: Partition, tc: TaskContext): Iterator[S] = { val p = s.asInstanceOf[OriginUnionPartition] f(p.originIdx, p.originPart.index, parent[T](p.originIdx).iterator(p.originPart, tc)) } override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 90
Source File: ProtoParquetRDD.scala From sparksql-protobuf with Apache License 2.0 | 5 votes |
package com.github.saurfang.parquet.proto.spark import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat import com.google.protobuf.AbstractMessage import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.parquet.proto.ProtoReadSupport import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.{NewHadoopRDD, RDD} import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.reflect.ClassTag class ProtoParquetRDD[T <: AbstractMessage : ClassTag]( sc: SparkContext, input: String, protoClass: Class[T], @transient conf: Configuration ) extends RDD[T](sc, Nil) { def this(sc: SparkContext, input: String, protoClass: Class[T]) = { this(sc, input, protoClass, sc.hadoopConfiguration) } lazy private[this] val rdd = { val jconf = new JobConf(conf) FileInputFormat.setInputPaths(jconf, input) ProtoReadSupport.setProtobufClass(jconf, protoClass.getName) new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf) } @DeveloperApi override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2) override protected def getPartitions: Array[Partition] = rdd.getPartitions }
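A minimal sketch of the public constructor shown above. MyMessage is a hypothetical stand-in for a protobuf-generated class (extending com.google.protobuf.AbstractMessage), and the Parquet path is a placeholder; both must exist in a real project for this to compile and run:

import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD
import org.apache.spark.{SparkConf, SparkContext}

object ProtoParquetExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("proto-parquet").setMaster("local[2]"))
    // MyMessage: placeholder for a generated protobuf class; path: placeholder input.
    val messages = new ProtoParquetRDD(sc, "/tmp/messages.parquet", classOf[MyMessage])
    println(s"read ${messages.count()} protobuf records")
    sc.stop()
  }
}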
Example 91
Source File: Neo4jRDD.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.io.neo4j.external import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.{Partition, SparkContext, TaskContext} import org.opencypher.okapi.neo4j.io.Neo4jConfig private class Neo4jRDD( sc: SparkContext, val query: String, val neo4jConfig: Neo4jConfig, val parameters: Map[String, Any] = Map.empty, partitions: Partitions = Partitions()) extends RDD[Row](sc, Nil) { override def compute(partition: Partition, context: TaskContext): Iterator[Row] = { val neo4jPartition: Neo4jPartition = partition.asInstanceOf[Neo4jPartition] Executor.execute(neo4jConfig, query, parameters ++ neo4jPartition.window).sparkRows } override protected def getPartitions: Array[Partition] = { val p = partitions.effective() Range(0, p.partitions.toInt).map(idx => new Neo4jPartition(idx, p.skip(idx), p.limit(idx))).toArray } override def toString(): String = s"Neo4jRDD partitions $partitions $query using $parameters" }
Example 92
Source File: SlidingRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int) extends RDD[Array[T]](parent) { require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1), "Window size and step must be greater than 0, " + s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .drop(part.offset) .sliding(windowSize, step) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0)) } else { val w1 = windowSize - 1 // Get partition sizes and first w1 elements. val (sizes, heads) = parent.mapPartitions { iter => val w1Array = iter.take(w1).toArray Iterator.single((w1Array.length + iter.length, w1Array)) }.collect().unzip val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]] var i = 0 var cumSize = 0 var partitionIndex = 0 while (i < n) { val mod = cumSize % step val offset = if (mod == 0) 0 else step - mod val size = sizes(i) if (offset < size) { val tail = mutable.ListBuffer.empty[T] // Keep appending to the current tail until it has w1 elements. var j = i + 1 while (j < n && tail.length < w1) { tail ++= heads(j).take(w1 - tail.length) j += 1 } if (sizes(i) + tail.length >= offset + windowSize) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset) partitionIndex += 1 } } cumSize += size i += 1 } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
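SlidingRDD backs the sliding() method exposed on MLlib's RDDFunctions. A minimal sketch (not from the source above), showing windows that may cross partition boundaries:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.rdd.RDDFunctions._

object SlidingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sliding").setMaster("local[2]"))
    val series = sc.parallelize(1 to 10, 3)
    // sliding(3) yields overlapping windows Array(1,2,3), Array(2,3,4), ...,
    // stitching elements across partition boundaries as implemented above.
    val windows = series.sliding(3)
    windows.collect().foreach(w => println(w.mkString("[", ",", "]")))
    sc.stop()
  }
}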
Example 93
Source File: StateStoreRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration class StateStoreRDD[T: ClassTag, U: ClassTag]( dataRDD: RDD[T], storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U], checkpointLocation: String, operatorId: Long, storeVersion: Long, keySchema: StructType, valueSchema: StructType, sessionState: SessionState, @transient private val storeCoordinator: Option[StateStoreCoordinatorRef]) extends RDD[U](dataRDD) { private val storeConf = new StateStoreConf(sessionState.conf) // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it private val confBroadcast = dataRDD.context.broadcast( new SerializableConfiguration(sessionState.newHadoopConf())) override protected def getPartitions: Array[Partition] = dataRDD.partitions override def getPreferredLocations(partition: Partition): Seq[String] = { val storeId = StateStoreId(checkpointLocation, operatorId, partition.index) storeCoordinator.flatMap(_.getLocation(storeId)).toSeq } override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = { var store: StateStore = null val storeId = StateStoreId(checkpointLocation, operatorId, partition.index) store = StateStore.get( storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value) val inputIter = dataRDD.iterator(partition, ctxt) storeUpdateFunction(store, inputIter) } }
Example 94
Source File: WholeTextFileRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.{Text, Writable} import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.WholeTextFileInputFormat private[spark] class WholeTextFileRDD( sc : SparkContext, inputFormatClass: Class[_ <: WholeTextFileInputFormat], keyClass: Class[Text], valueClass: Class[Text], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
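WholeTextFileRDD is the RDD behind SparkContext.wholeTextFiles. A minimal usage sketch (the input directory is a placeholder):

import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFilesExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("whole-text-files").setMaster("local[2]"))
    // Each record is (file path, full file content), unlike textFile which splits by line.
    val files = sc.wholeTextFiles("/tmp/text-input", minPartitions = 2)
    files.mapValues(_.length)
      .collect()
      .foreach { case (path, chars) => println(s"$path -> $chars characters") }
    sc.stop()
  }
}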
Example 95
Source File: ZippedWithIndexRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1 // do not need to count the last partition ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] val parentIter = firstParent[T].iterator(split.prev, context) Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } }
Example 96
Source File: UnionRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
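UnionRDD is what SparkContext.union (and the ++ operator) builds when the inputs do not all share the same partitioner; each parent partition becomes one UnionPartition. A minimal sketch (not part of the original source):

import org.apache.spark.{SparkConf, SparkContext}

object UnionUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("union").setMaster("local[2]"))
    val a = sc.parallelize(1 to 3, 2)
    val b = sc.parallelize(4 to 6, 3)
    val u = sc.union(Seq(a, b)) // same result as a ++ b
    // One output partition per parent partition: 2 + 3 = 5, and no shuffle is performed.
    println(u.getNumPartitions)          // 5
    println(u.collect().mkString(", "))  // 1, 2, 3, 4, 5, 6
    sc.stop()
  }
}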
Example 97
Source File: PartitionwiseSampledRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
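PartitionwiseSampledRDD is created by RDD.sample, which derives one sampler seed per partition exactly as getPartitions does above. A minimal sketch of the public call (not part of the original source):

import org.apache.spark.{SparkConf, SparkContext}

object SampleUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sample").setMaster("local[2]"))
    val numbers = sc.parallelize(1 to 1000, 4)
    // Without replacement roughly 10% of the elements survive; the exact count varies with the seed.
    val sampled = numbers.sample(withReplacement = false, fraction = 0.1, seed = 42L)
    println(s"kept ${sampled.count()} of ${numbers.count()} elements")
    sc.stop()
  }
}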
Example 98
Source File: PartitionerAwareUnionRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 99
Source File: BinaryFileRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.StreamFileInputFormat private[spark] class BinaryFileRDD[T]( @transient private val sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(sc, jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
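BinaryFileRDD backs SparkContext.binaryFiles. A minimal usage sketch (not part of the original source), assuming a hypothetical /data/images directory:

import org.apache.spark.{SparkConf, SparkContext}

object BinaryFilesUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("binary-files").setMaster("local[2]"))
    // Each record is (file path, PortableDataStream); the stream is opened lazily on the executor.
    val files = sc.binaryFiles("/data/images", minPartitions = 4)
    files.map { case (path, stream) => (path, stream.toArray().length) }
      .collect()
      .foreach { case (path, bytes) => println(s"$path -> $bytes bytes") }
    sc.stop()
  }
}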
Example 100
Source File: PartitionPruningRDDSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
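Outside the test suite, PartitionPruningRDD.create is occasionally useful when only a known subset of partitions needs to be computed. A minimal sketch (not part of the original source) that keeps the even-numbered partitions:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.PartitionPruningRDD

object PartitionPruningUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("partition-pruning").setMaster("local[2]"))
    val data = sc.parallelize(1 to 100, 10)
    // Keep only even-numbered parent partitions; the pruned partitions are never computed.
    val pruned = PartitionPruningRDD.create(data, partitionIndex => partitionIndex % 2 == 0)
    println(pruned.getNumPartitions) // 5
    println(pruned.count())          // 50, only elements from the retained partitions
    sc.stop()
  }
}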
Example 101
Source File: SlidingRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{TaskContext, Partition} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T]) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int) extends RDD[Array[T]](parent) { require(windowSize > 1, s"Window size must be greater than 1, but got $windowSize.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .sliding(windowSize) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.size if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty)) } else { val n1 = n - 1 val w1 = windowSize - 1 // Get the first w1 items of each partition, starting from the second partition. val nextHeads = parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n, true) val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]() var i = 0 var partitionIndex = 0 while (i < n1) { var j = i val tail = mutable.ListBuffer[T]() // Keep appending to the current tail until appended a head of size w1. while (j < n1 && nextHeads(j).size < w1) { tail ++= nextHeads(j) j += 1 } if (j < n1) { tail ++= nextHeads(j) j += 1 } partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail) partitionIndex += 1 // Skip appended heads. i = j } // If the head of last partition has size w1, we also need to add this partition. if (nextHeads.last.size == w1) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(n1), Seq.empty) } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
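SlidingRDD is normally reached through the sliding method in org.apache.spark.mllib.rdd.RDDFunctions. A minimal sketch (not part of the original source), assuming the implicit conversion provided by the RDDFunctions companion object is available in this Spark/MLlib version:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.rdd.RDDFunctions._

object SlidingUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sliding").setMaster("local[2]"))
    val values = sc.parallelize(1 to 6, 3)
    // Windows of size 3 cross partition boundaries: (1,2,3), (2,3,4), (3,4,5), (4,5,6).
    values.sliding(3).map(_.mkString("(", ",", ")")).collect().foreach(println)
    sc.stop()
  }
}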
Example 102
Source File: KafkaRDDPartition.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import org.apache.spark.Partition private[kafka] class KafkaRDDPartition( val index: Int, val topic: String, val partition: Int, val fromOffset: Long, val untilOffset: Long, val host: String, val port: Int ) extends Partition
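Each KafkaRDDPartition pins one Kafka topic-partition offset range to one Spark partition. In user code the same mapping is usually expressed with OffsetRange and KafkaUtils.createRDD from the spark-streaming-kafka (0.8) artifact; the sketch below is not from the original source, the topic name and broker address are placeholders, and a reachable broker is required for the job to actually read data:

import kafka.serializer.StringDecoder
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.kafka.{KafkaUtils, OffsetRange}

object KafkaOffsetRangeUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("kafka-ranges").setMaster("local[2]"))
    // One OffsetRange per resulting partition: topic, Kafka partition, fromOffset, untilOffset.
    val ranges = Array(
      OffsetRange("events", 0, 0L, 100L),
      OffsetRange("events", 1, 0L, 50L))
    val kafkaParams = Map("metadata.broker.list" -> "broker1:9092") // placeholder broker address
    val rdd = KafkaUtils.createRDD[String, String, StringDecoder, StringDecoder](sc, kafkaParams, ranges)
    println(rdd.getNumPartitions) // 2, one per offset range
    sc.stop()
  }
}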
Example 103
Source File: SampledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
Example 104
Source File: ZippedWithIndexRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.length if (n == 0) { Array[Long]() } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1, // do not need to count the last partition allowLocal = false ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => (x._1, split.startIndex + x._2) } } }
Example 105
Source File: UnionRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient rdd: RDD[T], val parentRddIndex: Int, @transient parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 106
Source File: PartitionwiseSampledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 107
Source File: PartitionerAwareUnionRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map(index => { new PartitionerAwareUnionRDDPartition(rdds, index) }).toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => { val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 108
Source File: BinaryFileRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{ Configurable, Configuration } import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.spark.input.StreamFileInputFormat import org.apache.spark.{ Partition, SparkContext } private[spark] class BinaryFileRDD[T]( sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], @transient conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
Example 109
Source File: PartitionPruningRDDSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
Example 110
Source File: HBaseSimpleRDD.scala From spark-hbase-connector with Apache License 2.0 | 5 votes |
package it.nerdammer.spark.hbase import it.nerdammer.spark.hbase.conversion.FieldReader import org.apache.hadoop.hbase.CellUtil import org.apache.hadoop.hbase.client.Result import org.apache.hadoop.hbase.io.ImmutableBytesWritable import org.apache.hadoop.hbase.util.Bytes import org.apache.spark.rdd.{NewHadoopRDD, RDD} import org.apache.spark.{Partition, TaskContext} import scala.reflect.ClassTag class HBaseSimpleRDD[R: ClassTag](hadoopHBase: NewHadoopRDD[ImmutableBytesWritable, Result], builder: HBaseReaderBuilder[R], saltingLength: Int = 0) (implicit mapper: FieldReader[R], saltingProvider: SaltingProviderFactory[String]) extends RDD[R](hadoopHBase) { override def getPartitions: Array[Partition] = firstParent[(ImmutableBytesWritable, Result)].partitions override def compute(split: Partition, context: TaskContext) = { // val cleanConversion = sc.clean ---> next version firstParent[(ImmutableBytesWritable, Result)].iterator(split, context) .map(e => conversion(e._1, e._2)) } def conversion(key: ImmutableBytesWritable, row: Result) = { val columnNames = HBaseUtils.chosenColumns(builder.columns, mapper.columns) val columnNamesFC = HBaseUtils.columnsWithFamily(builder.columnFamily, columnNames) val columns = columnNamesFC .map(t => (Bytes.toBytes(t._1), Bytes.toBytes(t._2))) .map(t => if(row.containsColumn(t._1, t._2)) Some(CellUtil.cloneValue(row.getColumnLatestCell(t._1, t._2)).array) else None) .toList mapper.map(Some(key.get.drop(saltingLength)) :: columns) } }
Example 111
Source File: SlidingRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{TaskContext, Partition} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T]) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int) extends RDD[Array[T]](parent) { require(windowSize > 1, s"Window size must be greater than 1, but got $windowSize.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .sliding(windowSize) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.size if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty)) } else { val n1 = n - 1 val w1 = windowSize - 1 // Get the first w1 items of each partition, starting from the second partition. val nextHeads = parent.context.runJob(parent, (iter: Iterator[T]) => iter.take(w1).toArray, 1 until n) val partitions = mutable.ArrayBuffer[SlidingRDDPartition[T]]() var i = 0 var partitionIndex = 0 while (i < n1) { var j = i val tail = mutable.ListBuffer[T]() // Keep appending to the current tail until appended a head of size w1. while (j < n1 && nextHeads(j).size < w1) { tail ++= nextHeads(j) j += 1 } if (j < n1) { tail ++= nextHeads(j) j += 1 } partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail) partitionIndex += 1 // Skip appended heads. i = j } // If the head of last partition has size w1, we also need to add this partition. if (nextHeads.last.size == w1) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(n1), Seq.empty) } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
Example 112
Source File: JDBCRelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = { if (partitioning == null) return Array[Partition](JDBCPartition(null, 0)) val numPartitions = partitioning.numPartitions val column = partitioning.column if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0)) // Overflow and silliness can happen if you subtract then divide. // Here we get a little roundoff, but that's (hopefully) OK. val stride: Long = (partitioning.upperBound / numPartitions - partitioning.lowerBound / numPartitions) var i: Int = 0 var currentValue: Long = partitioning.lowerBound var ans = new ArrayBuffer[Partition]() while (i < numPartitions) { val lowerBound = if (i != 0) s"$column >= $currentValue" else null currentValue += stride val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null val whereClause = if (upperBound == null) { lowerBound } else if (lowerBound == null) { upperBound } else { s"$lowerBound AND $upperBound" } ans += JDBCPartition(whereClause, i) i = i + 1 } ans.toArray } } private[sql] case class JDBCRelation( url: String, table: String, parts: Array[Partition], properties: Properties = new Properties())(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedFilteredScan with InsertableRelation { override val needConversion: Boolean = false override val schema: StructType = JDBCRDD.resolveTable(url, table, properties) override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val driver: String = DriverRegistry.getDriverClassName(url) // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] JDBCRDD.scanTable( sqlContext.sparkContext, schema, driver, url, properties, table, requiredColumns, filters, parts).asInstanceOf[RDD[Row]] } override def insert(data: DataFrame, overwrite: Boolean): Unit = { data.write .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) .jdbc(url, table, properties) } }
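The columnPartition method above turns the lower/upper bounds into stride-based WHERE clauses, one per partition. A standalone re-implementation of that arithmetic (for illustration only, not the original code) shows what it produces for a hypothetical id column with lowerBound = 0, upperBound = 100 and 4 partitions:

object ColumnPartitionSketch {
  // Mirrors the stride logic of columnPartition: the first clause has no lower bound,
  // the last clause has no upper bound, and everything in between covers one stride.
  def whereClauses(column: String, lowerBound: Long, upperBound: Long, numPartitions: Int): Seq[String] = {
    val stride = upperBound / numPartitions - lowerBound / numPartitions
    var currentValue = lowerBound
    (0 until numPartitions).map { i =>
      val lower = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upper = if (i != numPartitions - 1) s"$column < $currentValue" else null
      (lower, upper) match {
        case (l, null) => l
        case (null, u) => u
        case (l, u)    => s"$l AND $u"
      }
    }
  }

  def main(args: Array[String]): Unit =
    // Prints: id < 25, id >= 25 AND id < 50, id >= 50 AND id < 75, id >= 75
    whereClauses("id", 0L, 100L, 4).foreach(println)
}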
Example 113
Source File: SampledRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
Example 114
Source File: ZippedWithIndexRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.length if (n == 0) { Array[Long]() } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1 // do not need to count the last partition ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] firstParent[T].iterator(split.prev, context).zipWithIndex.map { x => (x._1, split.startIndex + x._2) } } }
Example 115
Source File: MapPartitionsWithPreparationRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Partition, Partitioner, TaskContext} override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val prepared = if (preparedArguments.isEmpty) { preparePartition() } else { preparedArguments.remove(0) } val parentIterator = firstParent[T].iterator(partition, context) executePartition(context, partition.index, prepared, parentIterator) } }
Example 116
Source File: MapPartitionsRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} private[spark] class MapPartitionsRDD[U: ClassTag, T: ClassTag]( prev: RDD[T], f: (TaskContext, Int, Iterator[T]) => Iterator[U], // (TaskContext, partition index, iterator) preservesPartitioning: Boolean = false) // `prev` is the previously created parent RDD (e.g. a HadoopRDD); MapPartitionsRDD's constructor // passes it to the superclass constructor RDD[U](prev), which registers it as a one-to-one dependency extends RDD[U](prev) { override val partitioner = if (preservesPartitioning) firstParent[T].partitioner else None // firstParent returns the first parent RDD in the dependency list override def getPartitions: Array[Partition] = firstParent[T].partitions // reuse the parent RDD's partitions, obtained via firstParent override def compute(split: Partition, context: TaskContext): Iterator[U] = f(context, split.index, firstParent[T].iterator(split, context)) }
Example 117
Source File: UnionRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient rdd: RDD[T], val parentRddIndex: Int, @transient parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 118
Source File: PartitionwiseSampledRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], @transient preservesPartitioning: Boolean, @transient seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 119
Source File: CheckpointRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, SparkContext, TaskContext} private[spark] abstract class CheckpointRDD[T: ClassTag](@transient sc: SparkContext) extends RDD[T](sc, Nil) { // CheckpointRDD should not be checkpointed again override def doCheckpoint(): Unit = { } override def checkpoint(): Unit = { } // this.type is the type of the current object, so localCheckpoint simply returns this instance override def localCheckpoint(): this.type = this // Note: There is a bug in MiMa that complains about `AbstractMethodProblem`s in the // base [[org.apache.spark.rdd.RDD]] class if we do not override the following methods. // scalastyle:off protected override def getPartitions: Array[Partition] = ??? override def compute(p: Partition, tc: TaskContext): Iterator[T] = ??? // scalastyle:on }
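In user code a CheckpointRDD only appears after a checkpoint has been materialized; the public entry points are SparkContext.setCheckpointDir and RDD.checkpoint. A minimal sketch (not part of the original source), assuming a writable /tmp/checkpoints directory as a placeholder:

import org.apache.spark.{SparkConf, SparkContext}

object CheckpointUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("checkpoint").setMaster("local[2]"))
    sc.setCheckpointDir("/tmp/checkpoints") // placeholder directory
    val data = sc.parallelize(1 to 10, 2).map(_ * 2)
    data.checkpoint()            // only marks the RDD; nothing is written yet
    data.count()                 // the first action materializes the checkpoint files
    println(data.isCheckpointed) // true once the data has been saved
    sc.stop()
  }
}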
Example 120
Source File: BinaryFileRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{ Configurable, Configuration } import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.spark.input.StreamFileInputFormat import org.apache.spark.{ Partition, SparkContext } private[spark] class BinaryFileRDD[T]( sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], @transient conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
Example 121
Source File: PartitionPruningRDDSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { // Nil: no dependencies override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { // Nil: no dependencies val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
Example 122
Source File: HBasePartition.scala From Heracles with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.regionserver.RegionScanner import org.apache.spark.Partition import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.PartialPredicateOperations._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.hbase.types.HBaseBytesType import org.apache.spark.sql.types.Range import scala.annotation.meta.param private[hbase] class HBasePartition( val idx: Int, val mappedIndex: Int, start: Option[HBaseRawType] = None, end: Option[HBaseRawType] = None, val server: Option[String] = None, val filterPredicates: Option[Expression] = None, @(transient @param) relation: HBaseRelation = null, @(transient @param) val newScanner:RegionScanner = null) extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) with Partition with IndexMappable with Logging { override def index: Int = idx override def hashCode(): Int = idx @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) def computePredicate(relation: HBaseRelation): Option[Expression] = { val predicate = if (filterPredicates.isDefined && filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) { val oriPredicate = filterPredicates.get val predicateReferences = oriPredicate.references.toSeq val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) val row = new GenericInternalRow(predicateReferences.size) var rowIndex = 0 var i = 0 var range: Range[_] = null while (i < relation.keyColumns.size) { range = relation.generateRange(this, oriPredicate, i) if (range != null) { rowIndex = relation.rowIndex(predicateReferences, i) if (rowIndex >= 0) row.update(rowIndex, range) // if the non-last dimension range is not point, do not proceed to the next dims if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size else i = i + 1 } else i = relation.keyColumns.size } val pr = boundReference.partialReduce(row, predicateReferences) pr match { case (null, e: Expression) => Some(e) case (true, _) => None case (false, _) => Some(Literal(false)) } } else filterPredicates logInfo(predicate.toString) predicate } override def toString = { s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" } }
Example 123
Source File: HBasePartition.scala From Spark-SQL-on-HBase with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase import org.apache.hadoop.hbase.regionserver.RegionScanner import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.hbase.catalyst.expressions.PartialPredicateOperations._ import org.apache.spark.sql.hbase.types.{HBaseBytesType, Range} import org.apache.spark.{Logging, Partition} private[hbase] class HBasePartition( val idx: Int, val mappedIndex: Int, start: Option[HBaseRawType] = None, end: Option[HBaseRawType] = None, val server: Option[String] = None, val filterPredicates: Option[Expression] = None, @transient relation: HBaseRelation = null, @transient val newScanner:RegionScanner = null) extends Range[HBaseRawType](start, true, end, false, HBaseBytesType) with Partition with IndexMappable with Logging { override def index: Int = idx override def hashCode(): Int = idx @transient lazy val startNative: Seq[Any] = relation.nativeKeyConvert(start) @transient lazy val endNative: Seq[Any] = relation.nativeKeyConvert(end) def computePredicate(relation: HBaseRelation): Option[Expression] = { val predicate = if (filterPredicates.isDefined && filterPredicates.get.references.exists(_.exprId == relation.partitionKeys.head.exprId)) { val oriPredicate = filterPredicates.get val predicateReferences = oriPredicate.references.toSeq val boundReference = BindReferences.bindReference(oriPredicate, predicateReferences) val row = new GenericMutableRow(predicateReferences.size) var rowIndex = 0 var i = 0 var range: Range[_] = null while (i < relation.keyColumns.size) { range = relation.generateRange(this, oriPredicate, i) if (range != null) { rowIndex = relation.rowIndex(predicateReferences, i) if (rowIndex >= 0) row.update(rowIndex, range) // if the non-last dimension range is not point, do not proceed to the next dims if (i < relation.keyColumns.size - 1 && !range.isPoint) i = relation.keyColumns.size else i = i + 1 } else i = relation.keyColumns.size } val pr = boundReference.partialReduce(row, predicateReferences) pr match { case (null, e: Expression) => Some(e) case (true, _) => None case (false, _) => Some(Literal(false)) } } else filterPredicates logInfo(predicate.toString) predicate } override def toString = { s"HBasePartition: $idx, $mappedIndex, [$start, $end), $filterPredicates" } }
Example 124
Source File: ClassRDDPartitioner.scala From spark-orientdb-connector with Apache License 2.0 | 5 votes |
package com.metreta.spark.orientdb.connector.rdd.partitioner import scala.collection.JavaConversions.iterableAsScalaIterable import scala.collection.mutable.ArrayBuffer import org.apache.spark.Logging import org.apache.spark.Partition import com.metreta.spark.orientdb.connector.api.OrientDBConnector import com.orientechnologies.orient.core.metadata.schema.OClass import com.orientechnologies.orient.core.metadata.schema.OSchema import com.orientechnologies.orient.core.storage.OStorage import com.metreta.spark.orientdb.connector.SystemTables import scala.collection.JavaConversions.iterableAsScalaIterable def getPartitions(): Array[Partition] = { val db = connector.databaseDocumentTx() var partitions = new ArrayBuffer[OrientPartition] val schema: OSchema = connector.getSchema(db) var klass: OClass = schema.getClass(mClass) val storage: OStorage = connector.getStorage(db) klass.getClusterIds.zipWithIndex foreach { case (clusterId, index) => partitions = partitions.+=(OrientPartition( index, null, // <- Host Address ????? PartitionName(klass.getName, storage.getClusterById(clusterId).getName))) } partitions.toArray } }
Example 125
Source File: SlidingRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int) extends RDD[Array[T]](parent) { require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1), "Window size and step must be greater than 0, " + s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .drop(part.offset) .sliding(windowSize, step) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0)) } else { val w1 = windowSize - 1 // Get partition sizes and first w1 elements. val (sizes, heads) = parent.mapPartitions { iter => val w1Array = iter.take(w1).toArray Iterator.single((w1Array.length + iter.length, w1Array)) }.collect().unzip val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]] var i = 0 var cumSize = 0 var partitionIndex = 0 while (i < n) { val mod = cumSize % step val offset = if (mod == 0) 0 else step - mod val size = sizes(i) if (offset < size) { val tail = mutable.ListBuffer.empty[T] // Keep appending to the current tail until it has w1 elements. var j = i + 1 while (j < n && tail.length < w1) { tail ++= heads(j).take(w1 - tail.length) j += 1 } if (sizes(i) + tail.length >= offset + windowSize) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset) partitionIndex += 1 } } cumSize += size i += 1 } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
Example 126
Source File: DataSourceRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.v2.reader.DataReaderFactory class DataSourceRDDPartition[T : ClassTag](val index: Int, val readerFactory: DataReaderFactory[T]) extends Partition with Serializable class DataSourceRDD[T: ClassTag]( sc: SparkContext, @transient private val readerFactories: java.util.List[DataReaderFactory[T]]) extends RDD[T](sc, Nil) { override protected def getPartitions: Array[Partition] = { readerFactories.asScala.zipWithIndex.map { case (readerFactory, index) => new DataSourceRDDPartition(index, readerFactory) }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { val reader = split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.createDataReader() context.addTaskCompletionListener(_ => reader.close()) val iter = new Iterator[T] { private[this] var valuePrepared = false override def hasNext: Boolean = { if (!valuePrepared) { valuePrepared = reader.next() } valuePrepared } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } valuePrepared = false reader.get() } } new InterruptibleIterator(context, iter) } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[DataSourceRDDPartition[T]].readerFactory.preferredLocations() } }
Example 127
Source File: StateStoreRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import java.util.UUID import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType import org.apache.spark.util.SerializableConfiguration override def getPreferredLocations(partition: Partition): Seq[String] = { val stateStoreProviderId = StateStoreProviderId( StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId) storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq } override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = { var store: StateStore = null val storeProviderId = StateStoreProviderId( StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId) store = StateStore.get( storeProviderId, keySchema, valueSchema, indexOrdinal, storeVersion, storeConf, hadoopConfBroadcast.value.value) val inputIter = dataRDD.iterator(partition, ctxt) storeUpdateFunction(store, inputIter) } }
Example 128
Source File: WholeTextFileRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.{Text, Writable} import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.WholeTextFileInputFormat private[spark] class WholeTextFileRDD( sc : SparkContext, inputFormatClass: Class[_ <: WholeTextFileInputFormat], keyClass: Class[Text], valueClass: Class[Text], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val conf = getConf // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when // traversing a large number of directories and files. Parallelize it. conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS, Runtime.getRuntime.availableProcessors().toString) val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
Example 129
Source File: ZippedWithIndexRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long) extends Partition with Serializable { override val index: Int = prev.index } @transient private val startIndices: Array[Long] = { val n = prev.partitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(0L) } else { prev.context.runJob( prev, Utils.getIteratorSize _, 0 until n - 1 // do not need to count the last partition ).scanLeft(0L)(_ + _) } } override def getPartitions: Array[Partition] = { firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index))) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = { val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition] val parentIter = firstParent[T].iterator(split.prev, context) Utils.getIteratorZipWithIndex(parentIter, split.startIndex) } }
Example 130
Source File: UnionRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.ForkJoinTaskSupport import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 131
Source File: PartitionwiseSampledRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.Utils import org.apache.spark.util.random.RandomSampler private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
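A minimal usage sketch (not from the project above): RDD.sample wraps the parent in a PartitionwiseSampledRDD, deriving an independent seed per partition from the driver-side seed so the sample is reproducible on recomputation.

import org.apache.spark.{SparkConf, SparkContext}

object SampleDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sample-demo"))

    val data = sc.parallelize(1 to 1000, 4)

    // Each partition gets its own seed (random.nextLong() per PartitionwiseSampledRDDPartition),
    // so recomputing a lost partition reproduces the same sample.
    val sampled = data.sample(withReplacement = false, fraction = 0.1, seed = 42L)
    println(sampled.count()) // roughly 100

    sc.stop()
  }
}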
Example 132
Source File: PartitionerAwareUnionRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
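A minimal usage sketch (not from the project above): when all parents share the same partitioner, RDD.union picks PartitionerAwareUnionRDD instead of UnionRDD, so the result keeps the parents' partition count and partitioner.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

object PartitionerAwareUnionDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("pa-union-demo"))

    val p = new HashPartitioner(4)
    val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
    val b = sc.parallelize(Seq(1 -> "c", 3 -> "d")).partitionBy(p)

    // Same partitioner on both parents => PartitionerAwareUnionRDD: 4 output partitions,
    // partitioner preserved, and parent partitions merged per index.
    val u = a.union(b)
    println(u.partitions.length)     // 4
    println(u.partitioner.isDefined) // true

    sc.stop()
  }
}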
Example 133
Source File: BinaryFileRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.Writable import org.apache.hadoop.mapreduce._ import org.apache.hadoop.mapreduce.lib.input.FileInputFormat import org.apache.hadoop.mapreduce.task.JobContextImpl import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.StreamFileInputFormat private[spark] class BinaryFileRDD[T]( @transient private val sc: SparkContext, inputFormatClass: Class[_ <: StreamFileInputFormat[T]], keyClass: Class[String], valueClass: Class[T], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val conf = getConf // setMinPartitions below will call FileInputFormat.listStatus(), which can be quite slow when // traversing a large number of directories and files. Parallelize it. conf.setIfUnset(FileInputFormat.LIST_STATUS_NUM_THREADS, Runtime.getRuntime.availableProcessors().toString) val inputFormat = inputFormatClass.newInstance inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = new JobContextImpl(conf, jobId) inputFormat.setMinPartitions(sc, jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
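A minimal usage sketch (not from the project above): SparkContext.binaryFiles is the public entry point that builds a BinaryFileRDD; the path is a placeholder.

import org.apache.spark.{SparkConf, SparkContext}

object BinaryFilesDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("binary-files-demo"))

    // One record per file: (path, PortableDataStream). The directory below is a placeholder.
    val files = sc.binaryFiles("hdfs:///data/images", minPartitions = 4)
    files.map { case (path, stream) => (path, stream.toArray().length) }
      .collect()
      .foreach { case (path, size) => println(s"$path -> $size bytes") }

    sc.stop()
  }
}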
Example 134
Source File: FakeTask.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.Properties import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.executor.TaskMetrics class FakeTask( stageId: Int, partitionId: Int, prefLocs: Seq[TaskLocation] = Nil, serializedTaskMetrics: Array[Byte] = SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) extends Task[Int](stageId, 0, partitionId, new Properties, serializedTaskMetrics) { override def runTask(context: TaskContext): Int = 0 override def preferredLocations: Seq[TaskLocation] = prefLocs } object FakeTask { def createTaskSet(numTasks: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageAttemptId = 0, prefLocs: _*) } def createTaskSet(numTasks: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { createTaskSet(numTasks, stageId = 0, stageAttemptId, prefLocs: _*) } def createTaskSet(numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new FakeTask(stageId, i, if (prefLocs.size != 0) prefLocs(i) else Nil) } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } def createShuffleMapTaskSet( numTasks: Int, stageId: Int, stageAttemptId: Int, prefLocs: Seq[TaskLocation]*): TaskSet = { if (prefLocs.size != 0 && prefLocs.size != numTasks) { throw new IllegalArgumentException("Wrong number of task locations") } val tasks = Array.tabulate[Task[_]](numTasks) { i => new ShuffleMapTask(stageId, stageAttemptId, null, new Partition { override def index: Int = i }, prefLocs(i), new Properties, SparkEnv.get.closureSerializer.newInstance().serialize(TaskMetrics.registered).array()) } new TaskSet(tasks, stageId, stageAttemptId, priority = 0, null) } }
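A small sketch of how this test helper is typically used (assumed, not from the project above): inside a scheduler test that already has a live SparkContext/SparkEnv, createTaskSet builds a TaskSet whose tasks report the given preferred locations.

// Placed in org.apache.spark.scheduler so the package-private TaskSet type is visible,
// just like FakeTask itself. Requires an active SparkEnv (e.g. a LocalSparkContext in the test).
package org.apache.spark.scheduler

object FakeTaskUsage {
  def exampleTaskSet(): TaskSet = {
    // Three tasks in stage 0, attempt 0, each preferring a different host.
    FakeTask.createTaskSet(
      3,
      Seq(TaskLocation("host1")),
      Seq(TaskLocation("host2")),
      Seq(TaskLocation("host3")))
  }
}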
Example 135
Source File: PartitionPruningRDDSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext} class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext { test("Pruned Partitions inherit locality prefs correctly") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 1), new TestPartition(1, 1), new TestPartition(2, 1)) } def compute(split: Partition, context: TaskContext) = { Iterator() } } val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2) assert(prunedRDD.partitions.length == 1) val p = prunedRDD.partitions(0) assert(p.index == 0) assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2) } test("Pruned Partitions can be unioned ") { val rdd = new RDD[Int](sc, Nil) { override protected def getPartitions = { Array[Partition]( new TestPartition(0, 4), new TestPartition(1, 5), new TestPartition(2, 6)) } def compute(split: Partition, context: TaskContext) = { List(split.asInstanceOf[TestPartition].testValue).iterator } } val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0) val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2) val merged = prunedRDD1 ++ prunedRDD2 assert(merged.count() == 2) val take = merged.take(2) assert(take.apply(0) == 4) assert(take.apply(1) == 6) } } class TestPartition(i: Int, value: Int) extends Partition with Serializable { def index: Int = i def testValue: Int = this.value }
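Outside of tests, the same API can be used directly to skip partitions without a shuffle; a minimal sketch with placeholder data:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.PartitionPruningRDD

object PruningDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("pruning-demo"))

    val data = sc.parallelize(1 to 100, 10)

    // Keep only parent partitions 7, 8 and 9; the others are never computed.
    val pruned = PartitionPruningRDD.create(data, idx => idx >= 7)
    println(pruned.partitions.length) // 3

    sc.stop()
  }
}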
Example 136
Source File: StarryRDD.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.reflect.ClassTag class StarryRDD[T: ClassTag](sc: SparkContext, rddName: String, @transient private var data: Seq[T] ) extends RDD[T](sc, Nil) { def this (sc: SparkContext, data: Seq[T]) = { this (sc, getClass.getSimpleName, data) } setName(rddName) override def compute(split: Partition, context: TaskContext): Iterator[T] = { split.asInstanceOf[ParallelCollectionPartition[T]].iterator } def updateData(data: Seq[T]): Unit = { this.data = data this.markCheckpointed() } override protected def getPartitions: Array[Partition] = { Array(new ParallelCollectionPartition(id, 0, data)) } }
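A hypothetical usage sketch based only on the constructor and updateData above (RDD name and data are placeholders); markCheckpointed() is what forces the partition array to be rebuilt so the new data becomes visible.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.StarryRDD

object StarryRDDDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("starry-demo"))

    val rdd = new StarryRDD[Int](sc, "demo-rdd", Seq(1, 2, 3))
    println(rdd.collect().mkString(",")) // 1,2,3

    // Swap the backing data in place; the single partition is recreated on next use.
    rdd.updateData(Seq(4, 5, 6))
    println(rdd.collect().mkString(",")) // 4,5,6

    sc.stop()
  }
}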
Example 137
Source File: SparkPlanExecutor.scala From starry with Apache License 2.0 | 5 votes |
package com.github.passionke.starry import org.apache.spark.{Partition, StarryTaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.{ReuseSubquery, SparkPlan} object SparkPlanExecutor { def exec(plan: SparkPlan, sparkSession: SparkSession) = { val newPlan = Seq( ReuseSubquery(sparkSession.sessionState.conf)) .foldLeft(plan) { case (sp, rule) => rule.apply(sp) } doExec(newPlan) } def firstPartition(rdd: RDD[InternalRow]): Partition = { rdd.partitions.head } def doExec(sparkPlan: SparkPlan): List[InternalRow] = { val rdd = sparkPlan.execute().map(ite => ite.copy()) val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } def rddCompute(rdd: RDD[InternalRow]): List[InternalRow] = { val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } }
Example 138
Source File: SlidingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.rdd import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.{TaskContext, Partition} import org.apache.spark.rdd.RDD private[mllib] class SlidingRDDPartition[T](val idx: Int, val prev: Partition, val tail: Seq[T], val offset: Int) extends Partition with Serializable { override val index: Int = idx } private[mllib] class SlidingRDD[T: ClassTag](@transient val parent: RDD[T], val windowSize: Int, val step: Int) extends RDD[Array[T]](parent) { require(windowSize > 0 && step > 0 && !(windowSize == 1 && step == 1), "Window size and step must be greater than 0, " + s"and they cannot be both 1, but got windowSize = $windowSize and step = $step.") override def compute(split: Partition, context: TaskContext): Iterator[Array[T]] = { val part = split.asInstanceOf[SlidingRDDPartition[T]] (firstParent[T].iterator(part.prev, context) ++ part.tail) .drop(part.offset) .sliding(windowSize, step) .withPartial(false) .map(_.toArray) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SlidingRDDPartition[T]].prev) override def getPartitions: Array[Partition] = { val parentPartitions = parent.partitions val n = parentPartitions.length if (n == 0) { Array.empty } else if (n == 1) { Array(new SlidingRDDPartition[T](0, parentPartitions(0), Seq.empty, 0)) } else { val w1 = windowSize - 1 // Get partition sizes and first w1 elements. val (sizes, heads) = parent.mapPartitions { iter => val w1Array = iter.take(w1).toArray Iterator.single((w1Array.length + iter.length, w1Array)) }.collect().unzip val partitions = mutable.ArrayBuffer.empty[SlidingRDDPartition[T]] var i = 0 var cumSize = 0 var partitionIndex = 0 while (i < n) { val mod = cumSize % step val offset = if (mod == 0) 0 else step - mod val size = sizes(i) if (offset < size) { val tail = mutable.ListBuffer.empty[T] // Keep appending to the current tail until it has w1 elements. var j = i + 1 while (j < n && tail.length < w1) { tail ++= heads(j).take(w1 - tail.length) j += 1 } if (sizes(i) + tail.length >= offset + windowSize) { partitions += new SlidingRDDPartition[T](partitionIndex, parentPartitions(i), tail, offset) partitionIndex += 1 } } cumSize += size i += 1 } partitions.toArray } } // TODO: Override methods such as aggregate, which only requires one Spark job. }
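A minimal usage sketch (not from the project above): the public entry point is sliding() on mllib's RDDFunctions, which builds a SlidingRDD; windows that would cross a partition boundary are completed with head elements collected from later partitions.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.rdd.RDDFunctions._

object SlidingDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("sliding-demo"))

    // Window size 3, step 1; partial windows at the end are dropped.
    val windows = sc.parallelize(1 to 6, 2).sliding(3)
    windows.collect().foreach(w => println(w.mkString(","))) // 1,2,3  2,3,4  3,4,5  4,5,6

    sc.stop()
  }
}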
Example 139
Source File: JDBCRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.Partition
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}

private[sql] object JDBCRelation {

  // Given a partitioning spec (column, lowerBound, upperBound, numPartitions), build one
  // JDBCPartition per stride, each carrying a WHERE clause that bounds the partition column.
  def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
}

private[sql] case class JDBCRelation(
    url: String,
    table: String,
    parts: Array[Partition],
    properties: Properties = new Properties())(@transient val sqlContext: SQLContext)
  extends BaseRelation
  with PrunedFilteredScan
  with InsertableRelation {

  override val needConversion: Boolean = false

  override val schema: StructType = JDBCRDD.resolveTable(url, table, properties)

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    JDBCRDD.scanTable(
      sqlContext.sparkContext,
      schema,
      url,
      properties,
      table,
      requiredColumns,
      filters,
      parts).asInstanceOf[RDD[Row]]
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    data.write
      .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append)
      .jdbc(url, table, properties)
  }
}
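A minimal usage sketch (not from the project above) of the public read path that feeds columnPartition: the column/bounds/numPartitions arguments become a JDBCPartitioningInfo, and each generated WHERE clause becomes one JDBCPartition, i.e. one Spark partition. URL, table and credentials are placeholders.

import java.util.Properties
import org.apache.spark.sql.SQLContext

object JdbcPartitionedRead {
  def load(sqlContext: SQLContext): Unit = {
    val props = new Properties()
    props.setProperty("user", "spark")      // placeholder credentials
    props.setProperty("password", "secret")

    // (url, table, partition column, lowerBound, upperBound, numPartitions, properties)
    val df = sqlContext.read.jdbc(
      "jdbc:postgresql://db-host:5432/mydb", // placeholder URL
      "events",
      "id",
      0L,
      1000000L,
      8,
      props)

    println(df.rdd.partitions.length) // 8, one per generated WHERE clause
  }
}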
Example 140
Source File: SampledRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.commons.math3.distribution.PoissonDistribution import org.apache.spark.{Partition, TaskContext} @deprecated("Replaced by PartitionwiseSampledRDDPartition", "1.0.0") private[spark] class SampledRDDPartition(val prev: Partition, val seed: Int) extends Partition with Serializable { override val index: Int = prev.index } @deprecated("Replaced by PartitionwiseSampledRDD", "1.0.0") private[spark] class SampledRDD[T: ClassTag]( prev: RDD[T], withReplacement: Boolean, frac: Double, seed: Int) extends RDD[T](prev) { override def getPartitions: Array[Partition] = { val rg = new Random(seed) firstParent[T].partitions.map(x => new SampledRDDPartition(x, rg.nextInt)) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[SampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[T] = { val split = splitIn.asInstanceOf[SampledRDDPartition] if (withReplacement) { // For large datasets, the expected number of occurrences of each element in a sample with // replacement is Poisson(frac). We use that to get a count for each element. val poisson = new PoissonDistribution(frac) poisson.reseedRandomGenerator(split.seed) firstParent[T].iterator(split.prev, context).flatMap { element => val count = poisson.sample() if (count == 0) { Iterator.empty // Avoid object allocation when we return 0 items, which is quite often } else { Iterator.fill(count)(element) } } } else { // Sampling without replacement val rand = new Random(split.seed) firstParent[T].iterator(split.prev, context).filter(x => (rand.nextDouble <= frac)) } } }
Example 141
Source File: WholeTextFileRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.hadoop.conf.{Configurable, Configuration} import org.apache.hadoop.io.{Text, Writable} import org.apache.hadoop.mapreduce.InputSplit import org.apache.spark.{Partition, SparkContext} import org.apache.spark.input.WholeTextFileInputFormat private[spark] class WholeTextFileRDD( sc : SparkContext, inputFormatClass: Class[_ <: WholeTextFileInputFormat], keyClass: Class[Text], valueClass: Class[Text], conf: Configuration, minPartitions: Int) extends NewHadoopRDD[Text, Text](sc, inputFormatClass, keyClass, valueClass, conf) { override def getPartitions: Array[Partition] = { val inputFormat = inputFormatClass.newInstance val conf = getConf inputFormat match { case configurable: Configurable => configurable.setConf(conf) case _ => } val jobContext = newJobContext(conf, jobId) inputFormat.setMinPartitions(jobContext, minPartitions) val rawSplits = inputFormat.getSplits(jobContext).toArray val result = new Array[Partition](rawSplits.size) for (i <- 0 until rawSplits.size) { result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable]) } result } }
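A minimal usage sketch (not from the project above): SparkContext.wholeTextFiles is the public entry point that builds a WholeTextFileRDD, producing one (path, content) record per file; the path is a placeholder.

import org.apache.spark.{SparkConf, SparkContext}

object WholeTextFilesDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("whole-text-demo"))

    // One record per file, keyed by path, instead of one record per line.
    val files = sc.wholeTextFiles("hdfs:///data/docs", minPartitions = 4)
    files.mapValues(_.length)
      .collect()
      .foreach { case (path, chars) => println(s"$path -> $chars chars") }

    sc.stop()
  }
}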
Example 142
Source File: ZippedWithIndexRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils

private[spark] class ZippedWithIndexRDDPartition(val prev: Partition, val startIndex: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class ZippedWithIndexRDD[T: ClassTag](prev: RDD[T]) extends RDD[(T, Long)](prev) {

  // Start index of each partition: count every partition except the last, then take
  // the running sum of those counts.
  @transient private val startIndices: Array[Long] = {
    val n = prev.partitions.length
    if (n == 0) {
      Array[Long]()
    } else if (n == 1) {
      Array(0L)
    } else {
      prev.context.runJob(
        prev,
        Utils.getIteratorSize _,
        0 until n - 1 // do not need to count the last partition
      ).scanLeft(0L)(_ + _)
    }
  }

  override def getPartitions: Array[Partition] = {
    firstParent[T].partitions.map(x => new ZippedWithIndexRDDPartition(x, startIndices(x.index)))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[ZippedWithIndexRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[(T, Long)] = {
    val split = splitIn.asInstanceOf[ZippedWithIndexRDDPartition]
    firstParent[T].iterator(split.prev, context).zipWithIndex.map { x =>
      (x._1, split.startIndex + x._2)
    }
  }
}
Example 143
Source File: MemoryCheckpointRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.storage.RDDBlockId
import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext}

// We use a different class than LocalCheckpointRDD, but the same functionality,
// so that we can easily identify (e.g., pattern match) this class in the DAGScheduler.
class MemoryCheckpointRDD[T: ClassTag](sc: SparkContext, rddId: Int, numPartitions: Int)
  extends LocalCheckpointRDD[T](sc, rddId, numPartitions) {

  def this(rdd: RDD[T]) {
    this(rdd.context, rdd.id, rdd.partitions.size)
  }

  // Reaching compute() means the checkpointed block was lost: fail with a hint to use a
  // slower but more fault-tolerant checkpointing mode instead.
  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
    throw new SparkException(
      s"Checkpoint block ${RDDBlockId(rddId, partition.index)} not found! Either the executor " +
      s"that originally checkpointed this partition is no longer alive, or the original RDD is " +
      s"unpersisted. If this problem persists, you may consider using `rdd.checkpoint()` " +
      s"or `rdd.localCheckpoint()` instead, which are slower than memory checkpointing but " +
      s"more fault-tolerant.")
  }
}
Example 144
Source File: UnionRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.length).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 145
Source File: PartitionwiseSampledRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.util.Random import scala.reflect.ClassTag import org.apache.spark.{Partition, TaskContext} import org.apache.spark.util.random.RandomSampler import org.apache.spark.util.Utils private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long) extends Partition with Serializable { override val index: Int = prev.index } private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag]( prev: RDD[T], sampler: RandomSampler[T, U], preservesPartitioning: Boolean, @transient private val seed: Long = Utils.random.nextLong) extends RDD[U](prev) { @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None override def getPartitions: Array[Partition] = { val random = new Random(seed) firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong())) } override def getPreferredLocations(split: Partition): Seq[String] = firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev) override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = { val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition] val thisSampler = sampler.clone thisSampler.setSeed(split.seed) thisSampler.sample(firstParent[T].iterator(split.prev, context)) } }
Example 146
Source File: PartitionerAwareUnionRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map(index => { new PartitionerAwareUnionRDDPartition(rdds, index) }).toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => { val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 147
Source File: HazelcastRDD.scala From hazelcast-spark with Apache License 2.0 | 4 votes |
package com.hazelcast.spark.connector.rdd import com.hazelcast.client.HazelcastClientNotActiveException import com.hazelcast.client.cache.impl.ClientCacheProxy import com.hazelcast.client.proxy.ClientMapProxy import com.hazelcast.core.{HazelcastInstance, Partition => HazelcastPartition} import com.hazelcast.spark.connector.conf.SerializableConf import com.hazelcast.spark.connector.iterator.{CacheIterator, MapIterator} import com.hazelcast.spark.connector.util.ConnectionUtil.{closeHazelcastConnection, getHazelcastConnection} import com.hazelcast.spark.connector.util.HazelcastUtil._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.collection.JavaConversions._ import scala.util.Try class HazelcastRDD[K, V](@transient val sc: SparkContext, val hzName: String, val isCache: Boolean, val config: SerializableConf) extends RDD[(K, V)](sc, Seq.empty) { @transient lazy val hazelcastPartitions: scala.collection.mutable.Map[Int, String] = { val client: HazelcastInstance = getHazelcastConnection(config.serverAddresses, id, config) val partitions: scala.collection.mutable.Map[Int, String] = scala.collection.mutable.Map[Int, String]() client.getPartitionService.getPartitions.foreach { p => partitions.put(p.getPartitionId, p.getOwner.getAddress.getHost + ":" + p.getOwner.getAddress.getPort) } closeHazelcastConnection(config.serverAddresses, id) partitions } @DeveloperApi override def compute(split: Partition, context: TaskContext): Iterator[(K, V)] = { Try(computeInternal(split)).recover[Iterator[(K, V)]]({ case e: HazelcastClientNotActiveException ⇒ computeInternal(split) }).get } def computeInternal(split: Partition): Iterator[(K, V)] = { val partitionLocationInfo = split.asInstanceOf[PartitionLocationInfo] val client: HazelcastInstance = getHazelcastConnection(partitionLocationInfo.location, id, config) if (isCache) { val cache: ClientCacheProxy[K, V] = getClientCacheProxy(hzName, client) new CacheIterator[K, V](cache.iterator(config.readBatchSize, split.index, config.valueBatchingEnabled)) } else { val map: ClientMapProxy[K, V] = getClientMapProxy(hzName, client) new MapIterator[K, V](map.iterator(config.readBatchSize, split.index, config.valueBatchingEnabled)) } } override protected def getPartitions: Array[Partition] = { var array: Array[Partition] = Array[Partition]() for (i <- 0 until hazelcastPartitions.size) { array = array :+ new PartitionLocationInfo(i, hazelcastPartitions.get(i).get) } array } override protected def getPreferredLocations(split: Partition): Seq[String] = { Seq(hazelcastPartitions.get(split.index).get) } }
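A hedged usage sketch: as far as I recall, the connector exposes this RDD through implicit SparkContext functions (toSparkContextFunctions / fromHazelcastMap); treat the import, method names and the hazelcast.server.addresses property as assumptions to verify against the connector version in use.

import com.hazelcast.spark.connector.toSparkContextFunctions // assumed entry point
import org.apache.spark.{SparkConf, SparkContext}

object HazelcastReadDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("hazelcast-demo")
      .set("hazelcast.server.addresses", "127.0.0.1:5701") // assumed property name
    val sc = new SparkContext(conf)

    // Each Hazelcast partition becomes one Spark partition, preferring the owning member's host.
    val entries = sc.fromHazelcastMap[Int, String]("demo-map")
    println(entries.count())

    sc.stop()
  }
}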