org.apache.spark.rdd.PairRDDFunctions Scala Examples
The following examples show how to use org.apache.spark.rdd.PairRDDFunctions.
Each example notes its original project, source file, and license.
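Before the examples, here is a minimal self-contained sketch (the PairRDDQuickStart name is illustrative, not taken from any of the projects below) of why PairRDDFunctions matters: methods such as reduceByKey, join, and combineByKey become available on any RDD of key/value tuples through Spark's implicit conversion.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object PairRDDQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PairRDDQuickStart").setMaster("local[*]"))

    // An RDD of (key, value) tuples; the implicit conversion rddToPairRDDFunctions
    // (defined on the RDD companion object since Spark 1.3) adds the PairRDDFunctions API.
    val pairs: RDD[(String, Int)] = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // reduceByKey comes from PairRDDFunctions, not from RDD itself.
    pairs.reduceByKey(_ + _).collect().foreach(println) // prints (a,4) and (b,2)

    sc.stop()
  }
}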
Example 1
Source File: TestValueTransformations.scala From spark-dev with GNU General Public License v3.0
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.PairRDDFunctions

case class Customer(ID: Int, name: String)
case class Item(ID: Int, name: String, price: Float)
case class Order(ID: Int, item: Item, quantity: Int, var discount: Float)
case class CustomerOrders(cust: Customer, order: Order, offer: Boolean)

object TestValueTransformations {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob"))

    val rdd = sc.parallelize(
      List(
        CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false),
        CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false),
        CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true)))

    // The definition of orderValuePerCustomer was missing from the extracted listing;
    // the pair RDD below (customer ID -> order value) is a plausible reconstruction.
    val orderValuePerCustomer = rdd.map(co => (co.cust.ID, co.order.item.price * co.order.quantity))

    println(">>> List of customers availing offers")
    orderValuePerCustomer.foreach(println)

    println(">>> Total order value for customer ID = 1 is "
      + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString())
  }
}
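The app name TestCombineByKeyJob hints at combineByKey. As an illustrative sketch only (this snippet is not part of the original file), the reduceByKey(_ + _) call above could equally be written with PairRDDFunctions.combineByKey, which spells out the three aggregation steps explicitly:

// Equivalent of orderValuePerCustomer.reduceByKey(_ + _), written with combineByKey.
val totalPerCustomer = orderValuePerCustomer.combineByKey(
  (value: Float) => value,                   // createCombiner: seed the sum with the first order value
  (acc: Float, value: Float) => acc + value, // mergeValue: fold further values within a partition
  (acc1: Float, acc2: Float) => acc1 + acc2) // mergeCombiners: merge partial sums across partitions

totalPerCustomer.collect().foreach { case (custId, total) =>
  println(s"Customer $custId total order value: $total")
}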
Example 2
Source File: TransformedDStream.scala From iolap with Apache License 2.0
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.streaming.{Duration, Time}

import scala.reflect.ClassTag

private[streaming] class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq
    Some(transformFunc(parentRDDs, validTime))
  }
}
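TransformedDStream is internal to Spark Streaming; user code reaches it through StreamingContext.transform. The sketch below is a hypothetical driver (the TransformExampleApp name, hosts, and ports are placeholders) showing how a multi-stream transform function receives one RDD per parent stream for each batch and can apply PairRDDFunctions methods such as join to them:

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object TransformExampleApp {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TransformExample"), Seconds(10))

    // Two example input streams; socketTextStream is just a placeholder source.
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" ")).map((_, 1))
    val more  = ssc.socketTextStream("localhost", 9998).flatMap(_.split(" ")).map((_, 1))

    // ssc.transform builds a TransformedDStream under the hood: the supplied function
    // is handed one RDD per parent stream at every batch interval.
    val joined = ssc.transform(Seq(words, more), (rdds: Seq[RDD[_]], time: Time) => {
      val left  = rdds(0).asInstanceOf[RDD[(String, Int)]]
      val right = rdds(1).asInstanceOf[RDD[(String, Int)]]
      left.join(right) // join comes from PairRDDFunctions
    })

    joined.print()
    ssc.start()
    ssc.awaitTermination()
  }
}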
Example 3
Source File: TransformedDStream.scala From spark1.52 with Apache License 2.0
The code is identical to the TransformedDStream implementation shown in Example 2 above.
Example 4
Source File: ControlFilesCreator.scala From spark-benchmarks with Apache License 2.0
package com.bbva.spark.benchmarks.dfsio

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object ControlFilesCreator {

  val BaseFileName = "test_io_"

  def createFiles(controlDirPath: String, numFiles: Int, fileSize: Long)(implicit sc: SparkContext): Unit = {
    sc.parallelize(0 until numFiles, numFiles).map(getFileName).map { fileName =>
      val controlFilePath = new Path(controlDirPath, s"in_file_$fileName")
      (controlFilePath.toString, new LongWritable(fileSize))
    }.saveAsSequenceFileByKey(controlDirPath)
  }

  implicit class RichRDD[T](val self: RDD[T]) extends AnyVal {
    def saveAsSequenceFileByKey[K, V](path: String)(implicit ev: RDD[T] => PairRDDFunctions[K, V]): Unit =
      self.saveAsHadoopFile(path, classOf[Text], classOf[LongWritable], classOf[RDDMultipleSequenceFileOutputFormat])
  }

  private def getFileName(fileIndex: Int): String = BaseFileName + fileIndex

  class RDDMultipleSequenceFileOutputFormat extends MultipleSequenceFileOutputFormat[Any, Any] {
    override def generateActualKey(key: Any, value: Any): Any =
      new Text(key.toString.split("/").last)

    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
      new Path(key.toString).toString
  }
}
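Note how saveAsSequenceFileByKey takes an implicit ev: RDD[T] => PairRDDFunctions[K, V]: the call only compiles when T is a key/value tuple, because Spark's standard rddToPairRDDFunctions conversion is what supplies that evidence. A hypothetical driver (ControlFilesDriver, the path, and the sizes below are illustrative, not taken from the benchmark) could invoke it like this:

import org.apache.spark.{SparkConf, SparkContext}

object ControlFilesDriver {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("ControlFilesDriver"))

    // createFiles builds an RDD[(String, LongWritable)], so the evidence parameter on
    // saveAsSequenceFileByKey resolves to Spark's built-in pair-RDD conversion.
    ControlFilesCreator.createFiles("/benchmarks/DFSIO/io_control", numFiles = 4, fileSize = 128L * 1024 * 1024)

    sc.stop()
  }
}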
Example 5
Source File: RatingDataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.random._
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object RatingDataGenerator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numUsers: Int = 100
    var numProducts: Int = 100
    var sparsity: Double = 0.05
    var implicitPrefs: Boolean = false

    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numUsers = args(1).toInt
      numProducts = args(2).toInt
      sparsity = args(3).toDouble
      implicitPrefs = args(4).toBoolean

      println(s"Output Path: $outputPath")
      println(s"Num of Users: $numUsers")
      println(s"Num of Products: $numProducts")
      println(s"Sparsity: $sparsity")
      println(s"Implicit Prefs: $implicitPrefs")
    } else {
      System.err.println(
        "Usage: RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>"
      )
      System.exit(1)
    }

    // Generate dense Gaussian rows, then zero out entries to reach the requested sparsity.
    val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
    val rng = new java.util.Random()
    val data = rawData.map { v =>
      val a = Array.fill[Double](v.size)(0.0)
      v.foreachActive { (i, vi) =>
        if (rng.nextDouble <= sparsity) {
          a(i) = vi
        }
      }
      Vectors.dense(a).toSparse
    }

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}
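Although PairRDDFunctions is imported, the generator ultimately writes plain RDD[Vector] rows with saveAsObjectFile. As a quick hypothetical sanity check (assuming a SparkContext sc and the same outputPath), the output can be read back with SparkContext.objectFile:

import org.apache.spark.mllib.linalg.Vector

// Read the generated sparse vectors back and count the user rows.
val generated = sc.objectFile[Vector](outputPath)
println(s"Read back ${generated.count()} generated user rows")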