org.apache.spark.rdd.PairRDDFunctions Scala Examples
The following examples show how to use org.apache.spark.rdd.PairRDDFunctions.
Each example notes its original project, source file, and license.
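Before the examples, here is a minimal self-contained sketch (the PairRDDQuickStart name is illustrative, not taken from any of the projects below) of why PairRDDFunctions matters: methods such as reduceByKey, join, and combineByKey become available on any RDD of key/value tuples through Spark's implicit conversion.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object PairRDDQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PairRDDQuickStart").setMaster("local[*]"))

    // An RDD of (key, value) tuples; the implicit conversion rddToPairRDDFunctions
    // (defined on the RDD companion object since Spark 1.3) adds the PairRDDFunctions API.
    val pairs: RDD[(String, Int)] = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // reduceByKey comes from PairRDDFunctions, not from RDD itself.
    pairs.reduceByKey(_ + _).collect().foreach(println) // prints (a,4) and (b,2)

    sc.stop()
  }
}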
Example 1
Source File: TestValueTransformations.scala From spark-dev with GNU General Public License v3.0
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.PairRDDFunctions

case class Customer(ID: Int, name: String)
case class Item(ID: Int, name: String, price: Float)
case class Order(ID: Int, item: Item, quantity: Int, var discount: Float)
case class CustomerOrders(cust: Customer, order: Order, offer: Boolean)

object TestValueTransformations {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestCombineByKeyJob"))

    val rdd = sc.parallelize(
      List(
        CustomerOrders(Customer(1, "A"), Order(1, Item(1, "item_1", 20), 2, 0), false),
        CustomerOrders(Customer(1, "A"), Order(2, Item(2, "item_2", 10), 1, 0), false),
        CustomerOrders(Customer(2, "B"), Order(1, Item(1, "item_1", 20), 2, 0), true)))

    // The definition of orderValuePerCustomer was missing from the extracted listing;
    // the pair RDD below (customer ID -> order value) is a plausible reconstruction.
    val orderValuePerCustomer = rdd.map(co => (co.cust.ID, co.order.item.price * co.order.quantity))

    println(">>> List of customers availing offers")
    orderValuePerCustomer.foreach(println)

    println(">>> Total order value for customer ID = 1 is "
      + orderValuePerCustomer.reduceByKey(_ + _).lookup(1).toString())
  }
}
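The app name TestCombineByKeyJob hints at combineByKey. As an illustrative sketch only (this snippet is not part of the original file), the reduceByKey(_ + _) call above could equally be written with PairRDDFunctions.combineByKey, which spells out the three aggregation steps explicitly:

// Equivalent of orderValuePerCustomer.reduceByKey(_ + _), written with combineByKey.
val totalPerCustomer = orderValuePerCustomer.combineByKey(
  (value: Float) => value,                   // createCombiner: seed the sum with the first order value
  (acc: Float, value: Float) => acc + value, // mergeValue: fold further values within a partition
  (acc1: Float, acc2: Float) => acc1 + acc2) // mergeCombiners: merge partial sums across partitions

totalPerCustomer.collect().foreach { case (custId, total) =>
  println(s"Customer $custId total order value: $total")
}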
Example 2
Source File: TransformedDStream.scala From iolap with Apache License 2.0
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.{PairRDDFunctions, RDD}
import org.apache.spark.streaming.{Duration, Time}

import scala.reflect.ClassTag

private[streaming] class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq
    Some(transformFunc(parentRDDs, validTime))
  }
}
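TransformedDStream is internal to Spark Streaming; user code reaches it through StreamingContext.transform. The sketch below is a hypothetical driver (the TransformExampleApp name, hosts, and ports are placeholders) showing how a multi-stream transform function receives one RDD per parent stream for each batch and can apply PairRDDFunctions methods such as join to them:

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object TransformExampleApp {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("TransformExample"), Seconds(10))

    // Two example input streams; socketTextStream is just a placeholder source.
    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" ")).map((_, 1))
    val more  = ssc.socketTextStream("localhost", 9998).flatMap(_.split(" ")).map((_, 1))

    // ssc.transform builds a TransformedDStream under the hood: the supplied function
    // is handed one RDD per parent stream at every batch interval.
    val joined = ssc.transform(Seq(words, more), (rdds: Seq[RDD[_]], time: Time) => {
      val left  = rdds(0).asInstanceOf[RDD[(String, Int)]]
      val right = rdds(1).asInstanceOf[RDD[(String, Int)]]
      left.join(right) // join comes from PairRDDFunctions
    })

    joined.print()
    ssc.start()
    ssc.awaitTermination()
  }
}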
Example 3
Source File: TransformedDStream.scala From spark1.52 with Apache License 2.0
The code is identical to the TransformedDStream implementation shown in Example 2 above.
Example 4
Source File: ControlFilesCreator.scala From spark-benchmarks with Apache License 2.0
package com.bbva.spark.benchmarks.dfsio

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.lib.MultipleSequenceFileOutputFormat
import org.apache.spark.SparkContext
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object ControlFilesCreator {

  val BaseFileName = "test_io_"

  def createFiles(controlDirPath: String, numFiles: Int, fileSize: Long)(implicit sc: SparkContext): Unit = {
    sc.parallelize(0 until numFiles, numFiles).map(getFileName).map { fileName =>
      val controlFilePath = new Path(controlDirPath, s"in_file_$fileName")
      (controlFilePath.toString, new LongWritable(fileSize))
    }.saveAsSequenceFileByKey(controlDirPath)
  }

  implicit class RichRDD[T](val self: RDD[T]) extends AnyVal {
    def saveAsSequenceFileByKey[K, V](path: String)(implicit ev: RDD[T] => PairRDDFunctions[K, V]): Unit =
      self.saveAsHadoopFile(path, classOf[Text], classOf[LongWritable], classOf[RDDMultipleSequenceFileOutputFormat])
  }

  private def getFileName(fileIndex: Int): String = BaseFileName + fileIndex

  class RDDMultipleSequenceFileOutputFormat extends MultipleSequenceFileOutputFormat[Any, Any] {
    override def generateActualKey(key: Any, value: Any): Any =
      new Text(key.toString.split("/").last)

    override def generateFileNameForKeyValue(key: Any, value: Any, name: String): String =
      new Path(key.toString).toString
  }
}
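Note how saveAsSequenceFileByKey takes an implicit ev: RDD[T] => PairRDDFunctions[K, V]: the call only compiles when T is a key/value tuple, because Spark's standard rddToPairRDDFunctions conversion is what supplies that evidence. A hypothetical driver (ControlFilesDriver, the path, and the sizes below are illustrative, not taken from the benchmark) could invoke it like this:

import org.apache.spark.{SparkConf, SparkContext}

object ControlFilesDriver {
  def main(args: Array[String]): Unit = {
    implicit val sc: SparkContext = new SparkContext(new SparkConf().setAppName("ControlFilesDriver"))

    // createFiles builds an RDD[(String, LongWritable)], so the evidence parameter on
    // saveAsSequenceFileByKey resolves to Spark's built-in pair-RDD conversion.
    ControlFilesCreator.createFiles("/benchmarks/DFSIO/io_control", numFiles = 4, fileSize = 128L * 1024 * 1024)

    sc.stop()
  }
}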
Example 5
Source File: RatingDataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.random._
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.{PairRDDFunctions, RDD}

object RatingDataGenerator {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("RatingDataGeneration")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numUsers: Int = 100
    var numProducts: Int = 100
    var sparsity: Double = 0.05
    var implicitPrefs: Boolean = false

    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 5) {
      outputPath = args(0)
      numUsers = args(1).toInt
      numProducts = args(2).toInt
      sparsity = args(3).toDouble
      implicitPrefs = args(4).toBoolean

      println(s"Output Path: $outputPath")
      println(s"Num of Users: $numUsers")
      println(s"Num of Products: $numProducts")
      println(s"Sparsity: $sparsity")
      println(s"Implicit Prefs: $implicitPrefs")
    } else {
      System.err.println(
        "Usage: RatingDataGenerator <OUTPUT_PATH> <NUM_USERS> <NUM_PRODUCTS> <SPARSITY> <IMPLICITPREFS>"
      )
      System.exit(1)
    }

    // Generate dense Gaussian rows, then zero out entries to reach the requested sparsity.
    val rawData: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, numUsers, numProducts, numPartitions)
    val rng = new java.util.Random()
    val data = rawData.map { v =>
      val a = Array.fill[Double](v.size)(0.0)
      v.foreachActive { (i, vi) =>
        if (rng.nextDouble <= sparsity) {
          a(i) = vi
        }
      }
      Vectors.dense(a).toSparse
    }

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}
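Although PairRDDFunctions is imported, the generator ultimately writes plain RDD[Vector] rows with saveAsObjectFile. As a quick hypothetical sanity check (assuming a SparkContext sc and the same outputPath), the output can be read back with SparkContext.objectFile:

import org.apache.spark.mllib.linalg.Vector

// Read the generated sparse vectors back and count the user rows.
val generated = sc.objectFile[Vector](outputPath)
println(s"Read back ${generated.count()} generated user rows")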