org.apache.spark.OneToOneDependency Scala Examples
The following examples show how to use org.apache.spark.OneToOneDependency.
Each example is drawn from an open-source project; the project and source file are noted above its code.
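OneToOneDependency is the simplest narrow dependency: partition i of the child RDD depends on exactly partition i of the parent, so no shuffle is needed and the child can run in the same stage as its parent. Before the project examples, here is a minimal, hedged sketch (not taken from any of the projects below; PassThroughRDD is a made-up name) of a custom RDD that declares such a dependency on its parent:

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD

// Minimal sketch: an RDD that passes its parent's data through unchanged,
// declaring a one-to-one (narrow) dependency on it.
class PassThroughRDD[T: ClassTag](parent: RDD[T])
  extends RDD[T](parent.context, List(new OneToOneDependency(parent))) {

  // One child partition per parent partition, with matching indices.
  override def getPartitions: Array[Partition] = firstParent[T].partitions

  // Compute partition i by reading partition i of the parent.
  override def compute(split: Partition, context: TaskContext): Iterator[T] =
    firstParent[T].iterator(split, context)
}

The examples below all follow the same basic pattern: either pass a list of OneToOneDependency instances to the RDD superclass constructor, or return them from getDependencies.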
Example 1
Source File: EdgeRDDImpl.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.graphx.impl

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.{HashPartitioner, OneToOneDependency}
import org.apache.spark.graphx._
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] (
    @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])],
    val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY)
  extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) {

  override def setName(_name: String): this.type = {
    if (partitionsRDD.name != null) {
      partitionsRDD.setName(partitionsRDD.name + ", " + _name)
    } else {
      partitionsRDD.setName(_name)
    }
    this
  }
  setName("EdgeRDD")

  override def count(): Long = {
    partitionsRDD.map(_._2.size.toLong).reduce(_ + _)
  }

  override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] =
    mapEdgePartitions((pid, part) => part.map(f))

  override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse)

  def filter(
      epred: EdgeTriplet[VD, ED] => Boolean,
      vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = {
    mapEdgePartitions((pid, part) => part.filter(epred, vpred))
  }

  override def innerJoin[ED2: ClassTag, ED3: ClassTag]
      (other: EdgeRDD[ED2])
      (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = {
    val ed2Tag = classTag[ED2]
    val ed3Tag = classTag[ED3]
    this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) {
      (thisIter, otherIter) =>
        val (pid, thisEPart) = thisIter.next()
        val (_, otherEPart) = otherIter.next()
        Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag)))
    })
  }

  def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag](
      f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = {
    this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter =>
      if (iter.hasNext) {
        val (pid, ep) = iter.next()
        Iterator(Tuple2(pid, f(pid, ep)))
      } else {
        Iterator.empty
      }
    }, preservesPartitioning = true))
  }

  private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag](
      partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = {
    new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel)
  }

  override private[graphx] def withTargetStorageLevel(
      targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = {
    new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel)
  }
}
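Here the OneToOneDependency appears in the superclass constructor call: EdgeRDDImpl is a thin wrapper over an internal partitionsRDD of (PartitionID, EdgePartition) pairs, and partition i of the wrapper is computed from partition i of that internal RDD. A small, hedged check of that wiring (Spark shell style, assuming a local SparkContext named sc):

import org.apache.spark.OneToOneDependency
import org.apache.spark.graphx.{Edge, Graph}

// Hedged sketch: build a tiny graph and confirm the edge RDD's only
// dependency is a one-to-one dependency with aligned partition indices.
val edges = sc.parallelize(Seq(Edge(1L, 2L, "a"), Edge(2L, 3L, "b")), numSlices = 2)
val graph = Graph.fromEdges(edges, defaultValue = 0)

val dep = graph.edges.dependencies.head.asInstanceOf[OneToOneDependency[_]]
assert(dep.getParents(0) == List(0))  // child partition 0 reads parent partition 0
assert(graph.edges.partitions.length == dep.rdd.partitions.length)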
Example 2
Source File: InsertRDD.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector.datastream.writer

import scala.annotation.tailrec
import scala.reflect.ClassTag

import org.apache.spark.{ OneToOneDependency, NarrowDependency, Partition, TaskContext }
import org.apache.spark.rdd.RDD

import com.actian.spark_vector.datastream.{ DataStreamPartition, DataStreamPartitionAssignment, VectorEndpointConf }

// NOTE: the InsertRDD class declaration (and the members it references, such as
// `rdd`, `writeConf` and `getPreferredLocationsRec`) is not included in this excerpt.

  private val endPointsToParentPartitionsMap = {
    val affinities = rdd.partitions.map(getPreferredLocationsRec(rdd, _))
    val ret = DataStreamPartitionAssignment(affinities, writeConf.vectorEndpoints)
    logDebug(s"Computed endPointsToParentPartitionsMap and got: ${
      (0 until ret.length).map {
        case idx =>
          val vals = ret(idx)
          s"Datastream $idx -> RDD partitions ${vals.length}: ${vals.take(partitionsPerDataStreamToPrint).mkString(",")} ${if (vals.length > partitionsPerDataStreamToPrint) "..." else ""}"
      }
    }")
    ret.map(_.map(rdd.partitions(_).index))
  }

  override protected def getPartitions =
    (0 until writeConf.size).map(idx => DataStreamPartition(idx, rdd, endPointsToParentPartitionsMap(idx))).toArray

  override protected def getPreferredLocations(split: Partition) = {
    logDebug(s"getPreferredLocations is called for partition ${split.index} and we are returning ${writeConf.vectorEndpoints(split.index).host}")
    Seq(writeConf.vectorEndpoints(split.index).host)
  }

  override def compute(split: Partition, taskContext: TaskContext): Iterator[R] =
    split.asInstanceOf[DataStreamPartition].parents.toIterator.flatMap(firstParent[R].iterator(_, taskContext))

  override def getDependencies: Seq[NarrowDependency[_]] = Seq(new NarrowDependency(rdd) {
    def getParents(partitionId: Int) = endPointsToParentPartitionsMap(partitionId)
  })
}
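This example imports OneToOneDependency but ends up overriding getDependencies with a hand-rolled NarrowDependency, because one DataStreamPartition may read several parent partitions. OneToOneDependency is exactly the special case in which that mapping is the identity; a brief, hedged sketch of the equivalence:

import org.apache.spark.{NarrowDependency, OneToOneDependency}
import org.apache.spark.rdd.RDD

// Hedged sketch: OneToOneDependency is just a NarrowDependency whose
// partition mapping is the identity.
def identityDep[T](parent: RDD[T]): NarrowDependency[T] =
  new NarrowDependency[T](parent) {
    override def getParents(partitionId: Int): Seq[Int] = List(partitionId)
  }

// Equivalent, using the built-in class:
def oneToOne[T](parent: RDD[T]): NarrowDependency[T] = new OneToOneDependency(parent)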
Example 3
Source File: EdgeRDDImpl.scala From zen with Apache License 2.0
package org.apache.spark.graphx2.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx2._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 4
Source File: PartitionerAwareUnionRDD.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.reflect.ClassTag

import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}
import org.apache.spark.util.Utils

private[spark] class PartitionerAwareUnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]]
  ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  require(rdds.forall(_.partitioner.isDefined))
  require(rdds.flatMap(_.partitioner).toSet.size == 1,
    "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner))

  override val partitioner = rdds.head.partitioner

  override def getPartitions: Array[Partition] = {
    val numPartitions = partitioner.get.numPartitions
    (0 until numPartitions).map(index => {
      new PartitionerAwareUnionRDDPartition(rdds, index)
    }).toArray
  }

  // Get the location where most of the partitions of parent RDDs are located
  override def getPreferredLocations(s: Partition): Seq[String] = {
    logDebug("Finding preferred location for " + this + ", partition " + s.index)
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    val locations = rdds.zip(parentPartitions).flatMap {
      case (rdd, part) => {
        val parentLocations = currPrefLocs(rdd, part)
        logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations)
        parentLocations
      }
    }
    val location = if (locations.isEmpty) {
      None
    } else {
      // Find the location that maximum number of parent partitions prefer
      Some(locations.groupBy(x => x).maxBy(_._2.length)._1)
    }
    logDebug("Selected location for " + this + ", partition " + s.index + " = " + location)
    location.toSeq
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents
    rdds.zip(parentPartitions).iterator.flatMap {
      case (rdd, p) => rdd.iterator(p, context)
    }
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }

  // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones)
  private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = {
    rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host)
  }
}
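PartitionerAwareUnionRDD is private[spark], so user code normally reaches it through union: when every input RDD has the same partitioner, Spark (in this lineage, as far as I can tell) takes the partitioner-aware path, declares one OneToOneDependency per parent, and avoids a shuffle. A hedged sketch (shell style, assuming a SparkContext named sc):

import org.apache.spark.HashPartitioner

// Hedged sketch: two RDDs co-partitioned with the same partitioner union
// without a shuffle, and the union keeps that partitioner.
val p = new HashPartitioner(4)
val a = sc.parallelize(Seq(1 -> "a", 2 -> "b")).partitionBy(p)
val b = sc.parallelize(Seq(2 -> "c", 3 -> "d")).partitionBy(p)

val u = sc.union(Seq(a, b))
assert(u.partitioner == Some(p))               // partitioner-aware union path
assert(u.partitions.length == p.numPartitions) // one output partition per partitioner partition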
Example 5
Source File: EdgeRDDImpl.scala From BigDatalog with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 6
Source File: PartitionerAwareUnionRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 7
Source File: EdgeRDDImpl.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 8
Source File: EdgeRDDImpl.scala From spark1.52 with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 9
Source File: PartitionerAwareUnionRDD.scala From iolap with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map(index => { new PartitionerAwareUnionRDDPartition(rdds, index) }).toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => { val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 10
Source File: EdgeRDDImpl.scala From iolap with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 11
Source File: PartitionerAwareUnionRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 12
Source File: MultiWayZipPartitionsRDD.scala From hail with MIT License
package is.hail.sparkextras

import org.apache.spark.rdd.RDD
import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext}

import scala.reflect.ClassTag

object MultiWayZipPartitionsRDD {
  def apply[T: ClassTag, V: ClassTag](
    rdds: IndexedSeq[RDD[T]]
  )(f: (Array[Iterator[T]]) => Iterator[V]): MultiWayZipPartitionsRDD[T, V] = {
    new MultiWayZipPartitionsRDD(rdds.head.sparkContext, rdds, f)
  }
}

private case class MultiWayZipPartition(val index: Int, val partitions: IndexedSeq[Partition])
  extends Partition

class MultiWayZipPartitionsRDD[T: ClassTag, V: ClassTag](
  sc: SparkContext,
  var rdds: IndexedSeq[RDD[T]],
  var f: (Array[Iterator[T]]) => Iterator[V]
) extends RDD[V](sc, rdds.map(x => new OneToOneDependency(x))) {
  require(rdds.length > 0)
  private val numParts = rdds(0).partitions.length
  require(rdds.forall(rdd => rdd.partitions.length == numParts))

  override val partitioner = None

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](numParts) { i =>
      MultiWayZipPartition(i, rdds.map(rdd => rdd.partitions(i)))
    }
  }

  override def compute(s: Partition, tc: TaskContext) = {
    val partitions = s.asInstanceOf[MultiWayZipPartition].partitions
    val arr = Array.tabulate(rdds.length)(i => rdds(i).iterator(partitions(i), tc))
    f(arr)
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
    f = null
  }
}
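Each parent contributes one OneToOneDependency here, and partition i of the zipped RDD reads partition i of every parent, which is why the constructor requires equal partition counts. A hedged usage sketch (shell style, assuming sc and the class above on the classpath):

import is.hail.sparkextras.MultiWayZipPartitionsRDD

// Hedged sketch: zip three RDDs with identical partitioning and, for each
// partition, emit a single combined sum across all of them.
val rdds = IndexedSeq(
  sc.parallelize(1 to 10, numSlices = 2),
  sc.parallelize(11 to 20, numSlices = 2),
  sc.parallelize(21 to 30, numSlices = 2))

val zipped = MultiWayZipPartitionsRDD(rdds) { iters =>
  Iterator(iters.map(_.sum).sum)  // one Int per output partition
}
// zipped.collect() yields one partial sum per partition, two in total here.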
Example 13
Source File: CarbonRDD.scala From carbondata with Apache License 2.0
package org.apache.carbondata.spark.rdd

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.conf.Configuration
import org.apache.spark.{Dependency, OneToOneDependency, Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.util.SparkSQLUtil

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.metadata.schema.table.TableInfo
import org.apache.carbondata.core.util._

// NOTE: the abstract CarbonRDD base class that this extends is not shown in this
// excerpt; only the table-info-carrying subclass is included.

abstract class CarbonRDDWithTableInfo[T: ClassTag](
    @transient private val ss: SparkSession,
    @transient private var deps: Seq[Dependency[_]],
    serializedTableInfo: Array[Byte]) extends CarbonRDD[T](ss, deps) {

  def this(@transient sparkSession: SparkSession, @transient oneParent: RDD[_],
      serializedTableInfo: Array[Byte]) = {
    this(sparkSession, List(new OneToOneDependency(oneParent)), serializedTableInfo)
  }

  def getTableInfo: TableInfo = TableInfo.deserialize(serializedTableInfo)
}
Example 14
Source File: PartitionerAwareUnionRDD.scala From SparkCore with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.length > 0) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map(index => { new PartitionerAwareUnionRDDPartition(rdds, index) }).toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => { val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 15
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext
import org.apache.spark.serializer.Serializer

// NOTE: the SubtractedRDD class declaration (and its `serializer`, `rdd1`, `rdd2`
// and `part` members) is not included in this excerpt.

  def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = {
    this.serializer = Option(serializer)
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    Seq(rdd1, rdd2).map { rdd =>
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency(rdd, part, serializer)
      }
    }
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.size) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            new ShuffleCoGroupSplitDep(s.shuffleHandle)
          case _ =>
            new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match {
      case NarrowCoGroupSplitDep(rdd, _, itsSplit) =>
        rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op)
      case ShuffleCoGroupSplitDep(handle) =>
        val iter = SparkEnv.get.shuffleManager
          .getReader(handle, partition.index, partition.index + 1, context)
          .read()
        iter.foreach(op)
    }
    // the first dep is rdd1; add all values to the map
    integrate(partition.deps(0), t => getSeq(t._1) += t._2)
    // the second dep is rdd2; remove all of its keys
    integrate(partition.deps(1), t => map.remove(t._1))
    map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
}
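The interesting branch above is getDependencies: an input that is already partitioned by the target partitioner gets a cheap OneToOneDependency instead of a ShuffleDependency. A hedged sketch (shell style, assuming sc) of subtractByKey hitting the all-narrow path:

import org.apache.spark.{HashPartitioner, OneToOneDependency}

// Hedged sketch: pre-partition both sides with the same partitioner so that
// the SubtractedRDD built by subtractByKey uses only one-to-one dependencies.
val p = new HashPartitioner(4)
val left  = sc.parallelize(Seq(1 -> "a", 2 -> "b", 3 -> "c")).partitionBy(p)
val right = sc.parallelize(Seq(2 -> 0, 4 -> 0)).partitionBy(p)

val diff = left.subtractByKey(right, p)
assert(diff.dependencies.forall(_.isInstanceOf[OneToOneDependency[_]]))
// diff.collectAsMap() == Map(1 -> "a", 3 -> "c")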
Example 16
Source File: PartitionerAwareUnionRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 17
Source File: EdgeRDDImpl.scala From sparkoscope with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 18
Source File: EdgeRDDImpl.scala From graphx-algorithm with GNU General Public License v2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{OneToOneDependency, HashPartitioner, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import org.apache.spark.graphx._ class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }
Example 19
Source File: LeftJoin.scala From flint with Apache License 2.0
package com.twosigma.flint.rdd.function.join

import com.twosigma.flint.rdd.{ PartitionsIterator, PeekableIterator }
import org.apache.spark.{ NarrowDependency, OneToOneDependency }
import com.twosigma.flint.rdd.OrderedRDD

import scala.collection.immutable.TreeMap
import scala.reflect.ClassTag

import java.util.{ HashMap => JHashMap }

protected[flint] object LeftJoin {

  val skMapInitialSize = 1024

  def apply[K: ClassTag, SK, V, V2](
    leftRdd: OrderedRDD[K, V],
    rightRdd: OrderedRDD[K, V2],
    toleranceFn: K => K,
    leftSk: V => SK,
    rightSk: V2 => SK
  )(implicit ord: Ordering[K]): OrderedRDD[K, (V, Option[(K, V2)])] = {
    // A map from left partition index to left range split and right partitions.
    val leftIndexToJoinSplits = TreeMap(RangeMergeJoin.leftJoinSplits(
      toleranceFn, leftRdd.rangeSplits, rightRdd.rangeSplits
    ).map { case (split, parts) => (split.partition.index, (split, parts)) }: _*)

    val leftDep = new OneToOneDependency(leftRdd)
    val rightDep = new NarrowDependency(rightRdd) {
      override def getParents(partitionId: Int) =
        leftIndexToJoinSplits(partitionId)._2.map(_.index)
    }

    // A map from left partition index to right partitions
    val rightPartitions = leftRdd.sc.broadcast(leftIndexToJoinSplits.map {
      case (idx, joinSplit) => (idx, joinSplit._2)
    })

    val joinedSplits = leftIndexToJoinSplits.map { case (_, (split, _)) => split }.toArray

    // We don't need the left dependency as we will just load it on demand here
    new OrderedRDD[K, (V, Option[(K, V2)])](leftRdd.sc, joinedSplits, Seq(leftDep, rightDep))(
      (part, context) => {
        val parts = rightPartitions.value(part.index)
        val rightIter = PeekableIterator(PartitionsIterator(rightRdd, parts, context))
        val lastSeen = new JHashMap[SK, (K, V2)](skMapInitialSize)
        leftRdd.iterator(part, context).map {
          case (k, v) =>
            // Catch-up the iterator for the right table to match the left key. In the
            // process, we'll have the last-seen row for each SK in the right table.
            val sk = leftSk(v)
            catchUp(k, rightSk, rightIter, lastSeen)
            val lastSeenRight = lastSeen.get(sk)
            if (lastSeenRight != null && ord.gteq(lastSeenRight._1, toleranceFn(k))) {
              (k, (v, Some(lastSeenRight)))
            } else {
              (k, (v, None))
            }
        }
      }
    )
  }

  @annotation.tailrec
  private[rdd] def catchUp[K, SK, V](
    cur: K,
    skFn: V => SK,
    iter: PeekableIterator[(K, V)],
    lastSeen: JHashMap[SK, (K, V)]
  )(implicit ord: Ordering[K]) {
    val peek = iter.peek
    if (peek.nonEmpty && ord.lteq(peek.get._1, cur)) {
      val (k, v) = iter.next
      val sk = skFn(v)
      lastSeen.put(sk, (k, v))
      catchUp(cur, skFn, iter, lastSeen)
    }
  }
}
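The dependency wiring is the noteworthy part of this example: the left side is partition-aligned with the output, so a plain OneToOneDependency suffices, while each output partition may read several right-side partitions, which calls for a custom NarrowDependency. A stripped-down, hedged sketch of that shape (joinDependencies and rightParents are hypothetical names, not part of flint):

import org.apache.spark.{Dependency, NarrowDependency, OneToOneDependency}
import org.apache.spark.rdd.RDD

// Hedged sketch: mixed narrow dependencies for a range-style join.
// `rightParents` maps an output partition index to the right-side partitions it reads.
def joinDependencies[L, R](
    left: RDD[L],
    right: RDD[R],
    rightParents: Map[Int, Seq[Int]]): Seq[Dependency[_]] = Seq(
  new OneToOneDependency(left),
  new NarrowDependency(right) {
    override def getParents(partitionId: Int): Seq[Int] = rightParents(partitionId)
  })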
Example 20
Source File: PartitionerAwareUnionRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.reflect.ClassTag import org.apache.spark.{OneToOneDependency, Partition, SparkContext, TaskContext} import org.apache.spark.util.Utils private[spark] class PartitionerAwareUnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]] ) extends RDD[T](sc, rdds.map(x => new OneToOneDependency(x))) { require(rdds.nonEmpty) require(rdds.forall(_.partitioner.isDefined)) require(rdds.flatMap(_.partitioner).toSet.size == 1, "Parent RDDs have different partitioners: " + rdds.flatMap(_.partitioner)) override val partitioner = rdds.head.partitioner override def getPartitions: Array[Partition] = { val numPartitions = partitioner.get.numPartitions (0 until numPartitions).map { index => new PartitionerAwareUnionRDDPartition(rdds, index) }.toArray } // Get the location where most of the partitions of parent RDDs are located override def getPreferredLocations(s: Partition): Seq[String] = { logDebug("Finding preferred location for " + this + ", partition " + s.index) val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents val locations = rdds.zip(parentPartitions).flatMap { case (rdd, part) => val parentLocations = currPrefLocs(rdd, part) logDebug("Location of " + rdd + " partition " + part.index + " = " + parentLocations) parentLocations } val location = if (locations.isEmpty) { None } else { // Find the location that maximum number of parent partitions prefer Some(locations.groupBy(x => x).maxBy(_._2.length)._1) } logDebug("Selected location for " + this + ", partition " + s.index + " = " + location) location.toSeq } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val parentPartitions = s.asInstanceOf[PartitionerAwareUnionRDDPartition].parents rdds.zip(parentPartitions).iterator.flatMap { case (rdd, p) => rdd.iterator(p, context) } } override def clearDependencies() { super.clearDependencies() rdds = null } // Get the *current* preferred locations from the DAGScheduler (as opposed to the static ones) private def currPrefLocs(rdd: RDD[_], part: Partition): Seq[String] = { rdd.context.getPreferredLocs(rdd, part.index).map(tl => tl.host) } }
Example 21
Source File: SubtractedRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext

private[spark] class SubtractedRDD[K: ClassTag, V: ClassTag, W: ClassTag](
    @transient var rdd1: RDD[_ <: Product2[K, V]],
    @transient var rdd2: RDD[_ <: Product2[K, W]],
    part: Partitioner)
  extends RDD[(K, V)](rdd1.context, Nil) {

  override def getDependencies: Seq[Dependency[_]] = {
    def rddDependency[T1: ClassTag, T2: ClassTag](rdd: RDD[_ <: Product2[T1, T2]])
      : Dependency[_] = {
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency[T1, T2, Any](rdd, part)
      }
    }
    Seq(rddDependency[K, V](rdd1), rddDependency[K, W](rdd2))
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.length) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            None
          case _ =>
            Some(new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i)))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(depNum: Int, op: Product2[K, V] => Unit): Unit = {
      dependencies(depNum) match {
        case oneToOneDependency: OneToOneDependency[_] =>
          val dependencyPartition = partition.narrowDeps(depNum).get.split
          oneToOneDependency.rdd.iterator(dependencyPartition, context)
            .asInstanceOf[Iterator[Product2[K, V]]].foreach(op)

        case shuffleDependency: ShuffleDependency[_, _, _] =>
          val iter = SparkEnv.get.shuffleManager
            .getReader(
              shuffleDependency.shuffleHandle, partition.index, partition.index + 1, context)
            .read()
          iter.foreach(op)
      }
    }

    // the first dep is rdd1; add all values to the map
    integrate(0, t => getSeq(t._1) += t._2)
    // the second dep is rdd2; remove all of its keys
    integrate(1, t => map.remove(t._1))
    map.asScala.iterator.map(t => t._2.iterator.map((t._1, _))).flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
}
Example 22
Source File: EdgeRDDImpl.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.graphx.impl import scala.reflect.{classTag, ClassTag} import org.apache.spark.{HashPartitioner, OneToOneDependency} import org.apache.spark.graphx._ import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel class EdgeRDDImpl[ED: ClassTag, VD: ClassTag] private[graphx] ( @transient override val partitionsRDD: RDD[(PartitionID, EdgePartition[ED, VD])], val targetStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY) extends EdgeRDD[ED](partitionsRDD.context, List(new OneToOneDependency(partitionsRDD))) { override def setName(_name: String): this.type = { if (partitionsRDD.name != null) { partitionsRDD.setName(partitionsRDD.name + ", " + _name) } else { partitionsRDD.setName(_name) } this } setName("EdgeRDD") override def count(): Long = { partitionsRDD.map(_._2.size.toLong).reduce(_ + _) } override def mapValues[ED2: ClassTag](f: Edge[ED] => ED2): EdgeRDDImpl[ED2, VD] = mapEdgePartitions((pid, part) => part.map(f)) override def reverse: EdgeRDDImpl[ED, VD] = mapEdgePartitions((pid, part) => part.reverse) def filter( epred: EdgeTriplet[VD, ED] => Boolean, vpred: (VertexId, VD) => Boolean): EdgeRDDImpl[ED, VD] = { mapEdgePartitions((pid, part) => part.filter(epred, vpred)) } override def innerJoin[ED2: ClassTag, ED3: ClassTag] (other: EdgeRDD[ED2]) (f: (VertexId, VertexId, ED, ED2) => ED3): EdgeRDDImpl[ED3, VD] = { val ed2Tag = classTag[ED2] val ed3Tag = classTag[ED3] this.withPartitionsRDD[ED3, VD](partitionsRDD.zipPartitions(other.partitionsRDD, true) { (thisIter, otherIter) => val (pid, thisEPart) = thisIter.next() val (_, otherEPart) = otherIter.next() Iterator(Tuple2(pid, thisEPart.innerJoin(otherEPart)(f)(ed2Tag, ed3Tag))) }) } def mapEdgePartitions[ED2: ClassTag, VD2: ClassTag]( f: (PartitionID, EdgePartition[ED, VD]) => EdgePartition[ED2, VD2]): EdgeRDDImpl[ED2, VD2] = { this.withPartitionsRDD[ED2, VD2](partitionsRDD.mapPartitions({ iter => if (iter.hasNext) { val (pid, ep) = iter.next() Iterator(Tuple2(pid, f(pid, ep))) } else { Iterator.empty } }, preservesPartitioning = true)) } private[graphx] def withPartitionsRDD[ED2: ClassTag, VD2: ClassTag]( partitionsRDD: RDD[(PartitionID, EdgePartition[ED2, VD2])]): EdgeRDDImpl[ED2, VD2] = { new EdgeRDDImpl(partitionsRDD, this.targetStorageLevel) } override private[graphx] def withTargetStorageLevel( targetStorageLevel: StorageLevel): EdgeRDDImpl[ED, VD] = { new EdgeRDDImpl(this.partitionsRDD, targetStorageLevel) } }