org.apache.spark.rdd.PairRDDFunctions Scala Examples

The following examples show how to use org.apache.spark.rdd.PairRDDFunctions. Each example is taken from an open-source project; the source file, project, and license are noted above each code listing.
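PairRDDFunctions is rarely instantiated by hand: Spark adds its methods (reduceByKey, combineByKey, join, lookup, saveAsHadoopFile, and so on) to any RDD[(K, V)] through an implicit conversion. A minimal, self-contained sketch of both routes (the object name and sample data here are purely illustrative, not from any of the projects below):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.PairRDDFunctions

object PairRDDQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PairRDDQuickStart").setMaster("local[*]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // Implicit route: reduceByKey is defined on PairRDDFunctions and becomes
    // available on an RDD of pairs through Spark's implicit conversion.
    pairs.reduceByKey(_ + _).collect().foreach(println) // (a,4), (b,2)

    // Explicit route: wrap the pair RDD directly.
    new PairRDDFunctions(pairs).countByKey().foreach(println) // (a,2), (b,1)

    sc.stop()
  }
}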
Example 1
Source File: TestValueTransformations.scala    From spark-dev   with GNU General Public License v3.0
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.PairRDDFunctions

case class Customer(ID: Int, name: String)
case class Item(ID: Int, name: String, price: Float)
case class Order(ID: Int, item: Item, quantity: Int, var discount: Float)
case class CustomerOrders(cust: Customer, order: Order, offer: Boolean)

object TestValueTransformations {
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob"))
		val rdd = sc.parallelize(
			List(
				CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false),
				CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false),
				CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true)))

		println(">>> List of customers availing offers")
		
		orderValuePerCustomer.foreach(println)

		println(">>> Total order value for customer ID = 1 is " + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString())

	}
} 
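The app name in Example 1 hints that the original exercise also covered combineByKey. As a hedged follow-on (a sketch, not part of the spark-dev source), the average order value per customer can be computed from the same orderValuePerCustomer pair RDD:

		// combineByKey accumulates a (running sum, count) pair per customer ID.
		val avgOrderValue = orderValuePerCustomer.combineByKey(
			(v: Float) => (v, 1),                                             // createCombiner: first value seen for a key
			(acc: (Float, Int), v: Float) => (acc._1 + v, acc._2 + 1),        // mergeValue: fold into a partition-local accumulator
			(a: (Float, Int), b: (Float, Int)) => (a._1 + b._1, a._2 + b._2)  // mergeCombiners: merge accumulators across partitions
		).mapValues { case (sum, count) => sum / count }

		avgOrderValue.foreach(println) // prints (1,25.0) and (2,40.0) for the data above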
Example 2
Source File: TransformedDStream.scala    From iolap   with Apache License 2.0
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.streaming.{Duration, Time}
import scala.reflect.ClassTag

private[streaming]
class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq
    Some(transformFunc(parentRDDs, validTime))
  }
} 
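TransformedDStream is internal to Spark Streaming (note the private[streaming] modifier); user code reaches it through DStream.transform and transformWith, which wrap the supplied RDD-to-RDD function in a TransformedDStream. A minimal usage sketch, assuming a local socket text stream (the host, port, and word-count logic are illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object TransformDemo {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("TransformDemo").setMaster("local[2]"), Seconds(1))
    val lines = ssc.socketTextStream("localhost", 9999)

    // transform applies an arbitrary RDD-to-RDD function to every batch; the
    // resulting DStream is backed by a TransformedDStream over `lines`.
    val counts = lines.transform { rdd =>
      rdd.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)
    }
    counts.print()

    ssc.start()
    ssc.awaitTermination()
  }
}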
Example 3
Source File: ControlFilesCreator.scala    From spark-benchmarks   with Apache License 2.0
package com.bbva.spark.benchmarks.dfsio

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object ControlFilesCreator {

  val BaseFileName = "test_io_"

  def createFiles(controlDirPath: String, numFiles: Int, fileSize: Long)(implicit sc: SparkContext): Unit = {
    sc.parallelize(0 until numFiles, numFiles).map(getFileName).map { fileName =>
      val controlFilePath = new Path(controlDirPath, s"in_file_$fileName")
      (controlFilePath.toString, new LongWritable(fileSize))
    }.saveAsSequenceFileByKey(controlDirPath)
  }

  implicit class RichRDD[T](val self: RDD[T]) extends AnyVal {
    // The implicit evidence `ev` requires T to be a pair type (K, V); it converts
    // the RDD to PairRDDFunctions so that saveAsHadoopFile resolves.
    def saveAsSequenceFileByKey[K, V](path: String)(implicit ev: RDD[T] => PairRDDFunctions[K, V]): Unit =
      self.saveAsHadoopFile(path, classOf[Text], classOf[LongWritable], classOf[RDDMultipleSequenceFileOutputFormat])
  }

  private def getFileName(fileIndex: Int): String = BaseFileName + fileIndex

  class RDDMultipleSequenceFileOutputFormat extends MultipleSequenceFileOutputFormat[Any, Any] {

    // Write only the last path segment (the file name) as the key in each sequence file
    override def generateActualKey(key: Any, value: Any): Any = new Text(key.toString.split("/").last)

    // Route each record to an output file named after its full key path
    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
      new Path(key.toString).toString

  }
} 
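A sketch of how createFiles might be invoked (the directory, file count, and size here are illustrative, not from spark-benchmarks). The SparkContext is passed implicitly, matching the method's signature:

import org.apache.spark.{SparkConf, SparkContext}

object ControlFilesDemo {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("ControlFilesDemo"))
    // Writes one control sequence file per planned test file under the control
    // directory, each record mapping the file name to its target size in bytes.
    ControlFilesCreator.createFiles("/benchmarks/DFSIO/io_control", numFiles = 4, fileSize = 128L * 1024 * 1024)
    sc.stop()
  }
}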
Example 4
Source File: RatingDataGenerator.scala    From Swallow   with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object RatingDataGenerator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numUsers: Int = 100
    var numProducts: Int = 100
    var sparsity: Double = 0.05
    var implicitPrefs: Boolean = false
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numUsers = args(1).toInt
      numProducts = args(2).toInt
      sparsity = args(3).toDouble
      implicitPrefs = args(4).toBoolean

      println(s"Output Path: $outputPath")
      println(s"Num of Users: $numUsers")
      println(s"Num of Products: $numProducts")
      println(s"Sparsity: $sparsity")
      println(s"Implicit Prefs: $implicitPrefs")
    } else {
      System.err.println(
        "Usage: RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>"
      )
      System.exit(1)
    }

    // Generate dense Gaussian vectors, then keep each entry with probability
    // `sparsity` and store the result as a sparse vector.
    val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
    val rng = new java.util.Random()
    val data = rawData.map { v =>
      val a = Array.fill[Double](v.size)(0.0)
      v.foreachActive { (i, vi) =>
        if (rng.nextDouble <= sparsity) {
          a(i) = vi
        }
      }
      Vectors.dense(a).toSparse
    }

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}
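Since the generator persists the RDD with saveAsObjectFile, a separate job can read the data back with SparkContext.objectFile. A brief sketch (the reader object is illustrative; the path must match the generator's OUTPUT_PATH argument):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector

object RatingDataReader {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("RatingDataReader"))
    // Each element is one user's row of ratings, stored as a sparse vector.
    val generated = sc.objectFile[Vector](args(0))
    println(s"Read ${generated.count()} user rows")
    sc.stop()
  }
}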