org.apache.spark.api.java.JavaPairRDD Scala Examples
The following examples show how to use org.apache.spark.api.java.JavaPairRDD.
The project and source file for each example are noted above its code.
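Before the project-specific examples, here is a minimal, hedged sketch (not taken from any of the projects below) of how a plain Scala pair RDD can be wrapped as a JavaPairRDD and driven through the Java-friendly API; the application name, master URL, and sample data are illustrative.

import scala.collection.JavaConverters._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{Function2 => JFunction2}

object JavaPairRDDIntroSketch {
  def main(args: Array[String]): Unit = {
    // Local SparkContext purely for demonstration.
    val sc = new SparkContext(new SparkConf().setAppName("JavaPairRDDIntroSketch").setMaster("local[*]"))

    // Wrap a Scala RDD of tuples in the Java-facing JavaPairRDD facade.
    val pairs: JavaPairRDD[String, Int] =
      JavaPairRDD.fromRDD(sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3))))

    // Pair operations on the Java API take org.apache.spark.api.java.function interfaces.
    val sums = pairs.reduceByKey(new JFunction2[Int, Int, Int] {
      override def call(x: Int, y: Int): Int = x + y
    })

    sums.collect().asScala.foreach(println)  // e.g. (a,4) and (b,2)
    sc.stop()
  }
}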
Example 1
Source File: RRDD.scala, from drizzle-spark (Apache License 2.0)
package org.apache.spark.api.r

import java.util.{Map => JMap}

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext}
import org.apache.spark.api.python.PythonRDD
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD

private abstract class BaseRRDD[T: ClassTag, U: ClassTag](
    parent: RDD[T],
    numPartitions: Int,
    func: Array[Byte],
    deserializer: String,
    serializer: String,
    packageNames: Array[Byte],
    broadcastVars: Array[Broadcast[Object]])
  extends RDD[U](parent) with Logging {

  override def getPartitions: Array[Partition] = parent.partitions

  override def compute(partition: Partition, context: TaskContext): Iterator[U] = {
    val runner = new RRunner[U](
      func, deserializer, serializer, packageNames, broadcastVars, numPartitions)

    // The parent may be also an RRDD, so we should launch it first.
    val parentIterator = firstParent[T].iterator(partition, context)
    runner.compute(parentIterator, partition.index)
  }
}

// Note: this is an excerpt. In the full source, createRDDFromFile is defined inside an
// enclosing object whose declaration is elided here; the final brace closes that object.
def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = {
  PythonRDD.readRDDFromFile(jsc, fileName, parallelism)
}
}
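The excerpt ends with createRDDFromFile, which produces a JavaRDD[Array[Byte]]. As a hedged, illustrative bridge back to this page's topic, the sketch below (names are hypothetical, not part of Spark) shows how such a byte-record RDD could be re-keyed into a JavaPairRDD, for example by record length.

import org.apache.spark.api.java.{JavaPairRDD, JavaRDD}

object ByteRecordsToPairsSketch {
  // Hypothetical helper: key each serialized record by its length so the result
  // can be used through the JavaPairRDD API (e.g. groupByKey, countByKey).
  def keyByLength(records: JavaRDD[Array[Byte]]): JavaPairRDD[Int, Array[Byte]] =
    JavaPairRDD.fromRDD(records.rdd.map(bytes => (bytes.length, bytes)))
}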
Example 2
Source File: GroupSorted.scala, from spark-sorted (Apache License 2.0)
package com.tresata.spark.sorted.api.java

import java.util.{ Comparator, Iterator => JIterator }

import scala.reflect.ClassTag
import scala.collection.JavaConverters._

import org.apache.spark.{ Partitioner, HashPartitioner }
import org.apache.spark.Partitioner.defaultPartitioner
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{ Function => JFunction, Function2 => JFunction2, FlatMapFunction => JFlatMapFunction }

import com.tresata.spark.sorted.{ GroupSorted => SGroupSorted }

object GroupSorted {
  private case class ComparatorOrdering[T](comparator: Comparator[T]) extends Ordering[T] {
    def compare(x: T, y: T) = comparator.compare(x, y)
  }

  private def comparatorToOrdering[T](comparator: Comparator[T]): Ordering[T] = new ComparatorOrdering(comparator)

  private def fakeClassTag[T]: ClassTag[T] = ClassTag.AnyRef.asInstanceOf[ClassTag[T]]

  private implicit def ordering[K]: Ordering[K] = comparatorToOrdering(NaturalComparator.get[K])

  private def groupSort[K, V](javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]): SGroupSorted[K, V] = {
    implicit def kClassTag: ClassTag[K] = javaPairRDD.kClassTag
    implicit def vClassTag: ClassTag[V] = javaPairRDD.vClassTag
    val valueOrdering = Option(valueComparator).map(comparatorToOrdering)
    SGroupSorted(javaPairRDD.rdd, partitioner, valueOrdering)
  }
}

class GroupSorted[K, V] private (sGroupSorted: SGroupSorted[K, V])
  extends JavaPairRDD[K, V](sGroupSorted)(GroupSorted.fakeClassTag[K], GroupSorted.fakeClassTag[V]) {

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner, valueComparator: Comparator[V]) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, valueComparator))

  def this(javaPairRDD: JavaPairRDD[K, V], partitioner: Partitioner) =
    this(GroupSorted.groupSort(javaPairRDD, partitioner, null))

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int, valueComparator: Comparator[V]) =
    this(javaPairRDD, if (numPartitions > 0) new HashPartitioner(numPartitions) else defaultPartitioner(javaPairRDD.rdd), valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V], numPartitions: Int) = this(javaPairRDD, numPartitions, null)

  def this(javaPairRDD: JavaPairRDD[K, V], valueComparator: Comparator[V]) = this(javaPairRDD, -1, valueComparator)

  def this(javaPairRDD: JavaPairRDD[K, V]) = this(javaPairRDD, -1, null)

  import GroupSorted._

  override def flatMapValues[W](f: JFlatMapFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.flatMapValues(v => f.call(v).asScala))
  }

  override def mapValues[W](f: JFunction[V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapValues(v => f.call(v)))
  }

  def mapKeyValuesToValues[W](f: JFunction[Tuple2[K, V], W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapKeyValuesToValues(kv => f.call(kv)))
  }

  def mapStreamByKey[W](f: JFunction[JIterator[V], JIterator[W]]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.mapStreamByKey(it => f.call(it.asJava).asScala))
  }

  def foldLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.foldLeftByKey(w)((w, v) => f.call(w, v)))
  }

  def reduceLeftByKey[W >: V](f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.reduceLeftByKey(f.call))
  }

  def scanLeftByKey[W](w: W, f: JFunction2[W, V, W]): GroupSorted[K, W] = {
    implicit def wClassTag: ClassTag[W] = fakeClassTag[W]
    new GroupSorted[K, W](sGroupSorted.scanLeftByKey(w)((w, v) => f.call(w, v)))
  }
}
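A hedged usage sketch for the class above, showing how a JavaPairRDD can be group-sorted and folded per key from Scala. The SparkContext setup and data are illustrative, and this assumes the spark-sorted artifact is on the classpath; with no Comparator supplied, values are grouped by key but not sorted within a key.

import scala.collection.JavaConverters._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.api.java.JavaPairRDD
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import com.tresata.spark.sorted.api.java.GroupSorted

object GroupSortedUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GroupSortedUsageSketch").setMaster("local[*]"))

    val pairs: JavaPairRDD[String, Int] =
      JavaPairRDD.fromRDD(sc.parallelize(Seq(("a", 3), ("a", 1), ("b", 2))))

    // Repartition into 2 partitions, grouped by key (no value ordering requested).
    val grouped = new GroupSorted[String, Int](pairs, 2)

    // Fold the values of each key, starting from 0.
    val sums = grouped.foldLeftByKey(0, new JFunction2[Int, Int, Int] {
      override def call(acc: Int, v: Int): Int = acc + v
    })

    sums.collect().asScala.foreach(println)  // e.g. (a,4) and (b,2)
    sc.stop()
  }
}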
Example 3
Source File: RRDD.scala, from sparkoscope (Apache License 2.0)
The code in this example is identical, line for line, to the RRDD.scala excerpt shown in Example 1.
Example 4
Source File: RRDD.scala, from multi-tenancy-spark (Apache License 2.0)
The code here is again identical to the RRDD.scala excerpt shown in Example 1.
Example 5
Source File: InfinispanJavaRDD.scala, from infinispan-spark (Apache License 2.0)
package org.infinispan.spark.rdd

import org.apache.spark.api.java.{JavaPairRDD, JavaSparkContext}
import org.infinispan.query.dsl.Query
import org.infinispan.spark._
import org.infinispan.spark.config.ConnectorConfiguration

import scala.annotation.varargs
import scala.reflect.ClassTag

object InfinispanJavaRDD {

  def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration): InfinispanJavaRDD[K, V] = {
    createInfinispanRDD(jsc, config, new PerServerSplitter)
  }

  def createInfinispanRDD[K, V](jsc: JavaSparkContext, config: ConnectorConfiguration, splitter: Splitter): InfinispanJavaRDD[K, V] = {
    val infinispanRDD = new InfinispanRDD[K, V](jsc.sc, config, splitter)
    implicit val keyClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[K]]
    implicit val valueClassTag = ClassTag.AnyRef.asInstanceOf[ClassTag[V]]
    new InfinispanJavaRDD[K, V](infinispanRDD)
  }

  def write[K, V](pairRDD: JavaPairRDD[K, V], config: ConnectorConfiguration) = pairRDD.rdd.writeToInfinispan(config)
}

class InfinispanJavaRDD[K, V](rdd: InfinispanRDD[K, V])
                             (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) with CacheManagementAware {

  def filterByQuery[R](q: Query): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByQuery[R](q)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  def filterByQuery[R](q: String): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByQuery[R](q)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  @varargs def filterByCustom[R](filterFactory: String, params: AnyRef*): JavaPairRDD[K, R] = {
    val filteredRDD = rdd.filterByCustom[R](filterFactory, params: _*)
    implicit val converted = ClassTag.AnyRef.asInstanceOf[ClassTag[R]]
    JavaPairRDD.fromRDD[K, R](filteredRDD)
  }

  override def count() = rdd.count()

  override def cacheAdmin(): CacheAdmin = rdd.cacheAdmin()
}
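A hedged usage sketch for the Java-friendly connector entry points above. The server address and cache name are placeholders, and the ConnectorConfiguration setters shown (setServerList, setCacheName) are assumed from the connector's fluent configuration style rather than taken from this excerpt.

import org.apache.spark.SparkConf
import org.apache.spark.api.java.JavaSparkContext
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.rdd.InfinispanJavaRDD

object InfinispanJavaRDDUsageSketch {
  def main(args: Array[String]): Unit = {
    val jsc = new JavaSparkContext(new SparkConf().setAppName("InfinispanJavaRDDUsageSketch").setMaster("local[*]"))

    // Placeholder connection settings; adjust to a real Infinispan server and cache.
    val config = new ConnectorConfiguration()
      .setServerList("127.0.0.1:11222")
      .setCacheName("exampleCache")

    // Expose the cache as a JavaPairRDD[K, V] and run an ordinary pair-RDD action on it.
    val cacheRDD = InfinispanJavaRDD.createInfinispanRDD[String, String](jsc, config)
    println(s"cache entries: ${cacheRDD.count()}")

    // Writing back: InfinispanJavaRDD.write(somePairRDD, config) persists a JavaPairRDD to the cache.
    jsc.stop()
  }
}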