org.apache.spark.rdd.RDD.rddToPairRDDFunctions Scala Examples

The following examples show how to use org.apache.spark.rdd.RDD.rddToPairRDDFunctions, the implicit conversion that wraps an RDD of key-value pairs in PairRDDFunctions and thereby makes operations such as reduceByKey, groupByKey, and join available on it. The originating project, source file, and license are listed above each example.
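Before the project examples, here is a minimal self-contained sketch of what the conversion does: a plain RDD[(String, Int)] has no reduceByKey or join of its own; rddToPairRDDFunctions wraps it in PairRDDFunctions, which provides them. The object name, local[*] master, and sample data below are placeholders chosen for this illustration and are not taken from any of the projects; note also that since Spark 1.3 the conversion is picked up automatically from the RDD companion object, so the explicit import is mainly there for clarity.

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object RddToPairRDDFunctionsDemo {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setAppName("RddToPairRDDFunctionsDemo")
      .setMaster("local[*]")
    val sc = new SparkContext(conf)

    // RDD[(String, Int)] itself has no reduceByKey; the imported implicit
    // conversion wraps it in PairRDDFunctions, which provides it.
    val counts: RDD[(String, Int)] = sc
      .parallelize(Seq("spark", "rdd", "spark", "pair", "rdd", "spark"))
      .map(word => (word, 1))
      .reduceByKey(_ + _)

    // join is another PairRDDFunctions operation enabled by the same conversion.
    val categories = sc.parallelize(Seq(("spark", "engine"), ("rdd", "api")))
    counts.join(categories).collect().foreach(println)

    sc.stop()
  }
}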
Example 1
Source File: ClusterModel.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.visualisation.model

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple


  //The enclosing class declaration, which provides `clusters`, `goldstandard`
  //and the two-argument `recall`/`precision` helpers used below, is omitted
  //from this excerpt.
  lazy val clusterSizeDistribution: Array[(Int, Int)] = {
    val distr = clusters.map(set => (set.size, 1))
    val distr2 = distr.reduceByKey(_ + _)
    // distr should be small so using collect is fine right here
    distr2.sortByKey().collect()
  }
  
  lazy val duplicateCount: Int = {
    clusterSizeDistribution.map(p => {
      val clusterSize = p._1
      val clusterCount = p._2
      (clusterSize - 1) * clusterCount 
    }).sum
  }
  
  lazy val duplicatePairCount: Int = {
    clusterSizeDistribution.map(p => {
      val clusterSize = p._1
      val clusterCount = p._2
      numberOfPossiblePairs(clusterSize) * clusterCount
    }).sum
  }

  lazy val clusterCount = {
    clusters.count
  }

  lazy val recall: Double = {
    recall(goldstandard, getDuplicatePairs)
  }

  lazy val precision: Double = {
    precision(goldstandard, getDuplicatePairs)
  }

  private lazy val getDuplicatePairs: RDD[SymPair[Tuple]] = {
    clusters.map(_.toSeq).cartesianBlocksWithoutIdentity()
  }
  
  private def numberOfPossiblePairs(elementCount: Int): Int = {
    (elementCount * (elementCount - 1)) / 2
  }

} 
Example 2
Source File: PipeReaderGoldstandardClusterOutput.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import java.util.regex.PatternSyntaxException

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair


class PipeReaderGoldstandardClusterOutput(
  separator: Char = ',',
  clusterIdIndex: Int = 0,
  tupleIdIndex: Int = 1,
  idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[Seq[Long]]] {

  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = {
    // parse tuple ids
    val clusterIdTupleIdRdd = inputRdd.map(line => {
      val parts = line.split(separator)
      val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]",""))
      val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]",""))
      (clusterId, tupleId)
    })
    clusterIdTupleIdRdd.groupByKey().map(_._2.toSeq)
  }

}

object PipeReaderGoldstandardClusterOutput {
  
  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic) = {
    new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
  }

} 
Example 3
Source File: PipeReaderGoldstandardIdsCluster.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.convert.PipeConvertClusterToPair



object PipeReaderGoldstandardIdsCluster {
  
  def apply(
      separator: Char = ',',
      clusterIdIndex: Int = 0,
      tupleIdIndex: Int = 1,
      idConverter: IdConverter = IdConverterBasic) = {
    PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
    .append(PipeConvertClusterToPair())
  }

} 
Example 4
Source File: PipeBlockerStandard.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable


  //The declaration of class PipeBlockerStandard, which brings the implicit
  //`bkvBuilder: BlockingKeyBuilder` into scope, is omitted from this excerpt.
  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = input.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey
    keyBlocks.map(_._2.toSeq).filter(_.size > 1)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name
  override val name = "StandardBlocker"
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)

}

object PipeBlockerStandard {

  def apply(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerStandard()
  }

} 
Example 5
Source File: PipeBlockerSuffixArray.scala    From sddf   with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable


  //Only the filterBlocks helper of PipeBlockerSuffixArray is shown here; the
  //class declaration and the remaining members, including `maximumBlockSize`,
  //are omitted from this excerpt.
  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
      false
    } else if (tupleCount < 2) {
      false
    } else {
      true
    }
  }
}

object PipeBlockerSuffixArray {

  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
  }

} 
Example 6
Source File: TestJoins.scala    From spark-dev   with GNU General Public License v3.0
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import scala.Iterator



object TestJoins {
	def main(args: Array[String]): Unit = {
		val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

		val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
		val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache

		inspectRDD(x)
		inspectRDD(y)

		println(">>> joining x with y")
		val joinRDD = x.join(y).cache
		joinRDD.collect().foreach(println)
		inspectRDD(joinRDD)

		println(">>> left outer join of x with y")
		val leftJoin = x.leftOuterJoin(y).cache
		leftJoin.collect().foreach(println)
		inspectRDD(leftJoin)

		println(">>> right outer join of x with y")
		val rightJoin = x.rightOuterJoin(y).cache
		rightJoin.collect().foreach(println)
		inspectRDD(rightJoin)
	}
	
	def inspectRDD[T](rdd: RDD[T]): Unit = {
		
		println(">>> Partition length...")
		rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)
		
		println(">>> Partition data...")
		rdd.foreachPartition(f => f.foreach(println))
	}
} 
Example 7
Source File: L6-14HBase.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object HBaseSinkApp {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .foreachRDD(rdd => {
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        hbaseConf.set("hbase.master", hbaseMaster)
        val jobConf = new Configuration(hbaseConf)
        jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)
        rdd.map(rec => {
          val put = new Put(rec._1.getBytes)
          put.addColumn(columnFamilyName.getBytes, columnName.getBytes, Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
          (rec._1, put)
        }).saveAsNewAPIHadoopDataset(jobConf)
      })

    ssc.start()
    ssc.awaitTermination()
  }
} 
Example 8
Source File: L9-11CollabFilteringPreprocessing.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object CollabFilteringPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => ((r._1, r._2), 1))
      .reduceByKey(_ + _)
      .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2)
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
} 
Example 9
Source File: L9-12CollabFiltering.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CollabFilteringApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, iPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match {
      case Array(subject, activity, freq) =>
        Rating(subject.toInt, activity.toInt, freq.toDouble)
    })

    val rank = 10
    val numIterations = 10
    val lambda = 0.01
    ratingStream.foreachRDD(ratingRDD => {
      val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7))
      val model = ALS.train(testTrain(1), rank, numIterations, lambda)
      val test = testTrain(0).map {
        case Rating(subject, activity, freq) =>
          (subject, activity)
      }
      val prediction = model.predict(test)
      prediction.take(5).map(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 10
Source File: L9-13FPMiningPreprocessing.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object FPMiningPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }
    val Seq(appName, iPath, oPath) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text], sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString), splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => (r._1, r._2))
      .distinct()
      .groupByKey()
      .map(r => r._2.mkString(" "))
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
} 
Example 11
Source File: Consensus.scala    From spark-tutorial   with Apache License 2.0
package se.uu.farmbio.tutorial

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object Consensus {

  def main(args: Array[String]) = {

    //Start the Spark context
    val conf = new SparkConf()
      .setAppName("Consensus")
      .setMaster("local")
    val sc = new SparkContext(conf)

    //Read DNA sequences 
    val dnaRDD = sc.textFile("dna.txt")

    
    //The transformations that build consensusRDD were omitted from this
    //excerpt; the steps below are a sketch of the usual approach, inferred
    //from the surviving fragment: count how often each base occurs at each
    //position, then keep the most frequent base per position.
    val consensusRDD = dnaRDD
      .flatMap(sequence => sequence.zipWithIndex.map {
        case (base, position) => ((position, base), 1) //key each base by its position
      })
      .reduceByKey(_ + _) //count occurrences of each base at each position
      .map { case ((position, base), count) => (position, (base, count)) }
      .reduceByKey((pair1, pair2) => //keep the most frequent (base, count) pair
        if (pair1._2 >= pair2._2) pair1 else pair2)
      .map { case (key, maxCountPair) => //drop the count, keeping only the base
        (key, maxCountPair._1)
      }

    //Now we can format and save the consensus
    consensusRDD.sortBy { //first we sort by position (key)
      case (key, mostFrequent) =>
        key
    }.map { //now we can get rid of the position, which is no longer needed
      case (position, mostFrequent) =>
        mostFrequent
    }.saveAsTextFile("dna.consensus.txt") //finally we save to a text file

    //Stop the Spark context
    sc.stop

  }

} 
Example 12
Source File: Quickstart.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.sparkGuide

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
object Quickstart {
  def main(args: Array[String]) {

    val conf = new SparkConf().setMaster("local").setAppName("Spark Exercise:Average Age Calculator")
    val sc = new SparkContext(conf)

    val textFile = sc.textFile("README.md")
    val count = textFile.count()
    val first = textFile.first()
    val linesWithSpark = textFile.filter(line => line.contains("Spark"))
    linesWithSpark.foreach(println)

    val filterCount = textFile.filter(line => line.contains("Spark")).count()
    textFile.map(line => line.split(" ").size).foreach(println)
    val max = textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b)
    val Mathmax = textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))
    //reduceByKey sums the values of all elements that share the same key
    val wordCounts = textFile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_+_)
    //collect returns the contents of the RDD to the driver as a Scala array
    wordCounts.collect()
    linesWithSpark.cache()
    linesWithSpark.count()
    linesWithSpark.count()

  }

}