org.apache.spark.rdd.RDD.rddToPairRDDFunctions Scala Examples
The following examples show how to use org.apache.spark.rdd.RDD.rddToPairRDDFunctions.
You can go to the original project or source file by following the links above each example.
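Before the examples, here is a minimal sketch of what the conversion provides. rddToPairRDDFunctions is the implicit conversion that wraps an RDD of key/value tuples in PairRDDFunctions, which is where operations such as reduceByKey, groupByKey, and join are defined. In older Spark releases (before 1.3) it had to be brought into scope explicitly, typically via import org.apache.spark.SparkContext._; in later releases it is resolved automatically from the RDD companion object, so the explicit import seen throughout the examples below is largely a stylistic or legacy choice. The object name and sample data in this sketch are made up for illustration.

import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.rdd.RDD
// Explicit import of the implicit conversion, matching the style of the examples below.
// On Spark 1.3+ this import is optional, since the implicit is found on the RDD companion object.
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object PairFunctionsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PairFunctionsSketch").setMaster("local[*]"))

    // A plain RDD of (key, value) tuples...
    val pairs: RDD[(String, Int)] = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // ...gains pair-RDD operations such as reduceByKey through the implicit conversion.
    val summed = pairs.reduceByKey(_ + _)
    summed.collect().foreach(println) // prints (a,4) and (b,2)

    sc.stop()
  }
}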
Example 1
Source File: ClusterModel.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.visualisation.model

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

  lazy val clusterSizeDistribution: Array[(Int, Int)] = {
    val distr = clusters.map(set => (set.size, 1))
    val distr2 = distr.reduceByKey(_ + _)
    // distr should be small so using collect is fine right here
    distr2.sortByKey().collect()
  }

  lazy val duplicateCount: Int = {
    clusterSizeDistribution.map(p => {
      val clusterSize = p._1
      val clusterCount = p._2
      (clusterSize - 1) * clusterCount
    }).sum
  }

  lazy val duplicatePairCount: Int = {
    clusterSizeDistribution.map(p => {
      val clusterSize = p._1
      val clusterCount = p._2
      numberOfPossiblePairs(clusterSize) * clusterCount
    }).sum
  }

  lazy val clusterCount = {
    clusters.count
  }

  lazy val recall: Double = {
    recall(goldstandard, getDuplicatePairs)
  }

  lazy val precision: Double = {
    precision(goldstandard, getDuplicatePairs)
  }

  private lazy val getDuplicatePairs: RDD[SymPair[Tuple]] = {
    clusters.map(_.toSeq).cartesianBlocksWithoutIdentity()
  }

  private def numberOfPossiblePairs(elementCount: Int): Int = {
    (elementCount * (elementCount - 1)) / 2
  }
}
Example 2
Source File: PipeReaderGoldstandardClusterOutput.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import java.util.regex.PatternSyntaxException

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair

class PipeReaderGoldstandardClusterOutput(
    separator: Char = ',',
    clusterIdIndex: Int = 0,
    tupleIdIndex: Int = 1,
    idConverter: IdConverter = IdConverterBasic)
  extends PipeElement[RDD[String], RDD[Seq[Long]]] {

  override def step(inputRdd: RDD[String])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Long]] = {
    // parse tuple ids
    val clusterIdTupleIdRdd = inputRdd.map(line => {
      val parts = line.split(separator)
      val tupleId = idConverter.convert(parts(tupleIdIndex).replaceAll("[^0-9]", ""))
      val clusterId = idConverter.convert(parts(clusterIdIndex).replaceAll("[^0-9]", ""))
      (clusterId, tupleId)
    })
    clusterIdTupleIdRdd.groupByKey().map(_._2.toSeq)
  }

}

object PipeReaderGoldstandardClusterOutput {

  def apply(
    separator: Char = ',',
    clusterIdIndex: Int = 0,
    tupleIdIndex: Int = 1,
    idConverter: IdConverter = IdConverterBasic) = {
    new PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
  }

}
Example 3
Source File: PipeReaderGoldstandardIdsCluster.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.reading.goldstandard

import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.SddfContext.rddToRdd
import de.unihamburg.vsis.sddf.pipe.PipeElement
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.IdConverter
import de.unihamburg.vsis.sddf.reading.IdConverterBasic
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.convert.PipeConvertClusterToPair

object PipeReaderGoldstandardIdsCluster {

  def apply(
    separator: Char = ',',
    clusterIdIndex: Int = 0,
    tupleIdIndex: Int = 1,
    idConverter: IdConverter = IdConverterBasic) = {
    PipeReaderGoldstandardClusterOutput(separator, clusterIdIndex, tupleIdIndex, idConverter)
      .append(PipeConvertClusterToPair())
  }

}
Example 4
Source File: PipeBlockerStandard.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

  def step(input: RDD[Tuple])(implicit pipeContext: AbstractPipeContext): RDD[Seq[Tuple]] = {
    val bkvTuplePairs: RDD[(String, Tuple)] = input.map(t => (bkvBuilder.buildBlockingKey(t), t))
    val keyBlocks: RDD[(String, Iterable[Tuple])] = bkvTuplePairs.groupByKey
    keyBlocks.map(_._2.toSeq).filter(_.size > 1)
  }

  @transient override val _analysable = new AlgoAnalysable
  _analysable.algo = this
  _analysable.name = this.name

  override val name = "StandardBlocker"
  override val paramMap = Map("BlockingKeyBuilder" -> bkvBuilder)

}

object PipeBlockerStandard {

  def apply(implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerStandard()
  }

}
Example 5
Source File: PipeBlockerSuffixArray.scala From sddf with GNU General Public License v3.0
package de.unihamburg.vsis.sddf.indexing.blocking

import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import de.unihamburg.vsis.sddf.Parameterized
import de.unihamburg.vsis.sddf.indexing.blocking.keygeneration.BlockingKeyBuilder
import de.unihamburg.vsis.sddf.logging.Logging
import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext
import de.unihamburg.vsis.sddf.reading.Tuple
import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable

  def filterBlocks(suffixTuplePair: (String, Seq[Tuple])): Boolean = {
    val tupleCount = suffixTuplePair._2.length
    if (tupleCount > maximumBlockSize) {
      false
    } else if (tupleCount < 2) {
      false
    } else {
      true
    }
  }

}

object PipeBlockerSuffixArray {

  def apply(minimumSuffixLength: Int = 6, maximumBlockSize: Int = 12)(
    implicit bkvBuilder: BlockingKeyBuilder) = {
    new PipeBlockerSuffixArray(minimumSuffixLength, maximumBlockSize)
  }

}
Example 6
Source File: TestJoins.scala From spark-dev with GNU General Public License v3.0
package examples

import org.apache.spark.{ SparkConf, SparkContext, HashPartitioner }
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import scala.Iterator

object TestJoins {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("TestJoinJob"))

    val x = sc.parallelize(List((1, 2), (1, 3), (2, 3), (2, 4))).partitionBy(new HashPartitioner(2)).cache
    val y = sc.parallelize(List((2, 5), (2, 6))).partitionBy(new HashPartitioner(2)).cache

    inspectRDD(x)
    inspectRDD(y)

    println(">>> joining x with y")
    val joinRDD = x.join(y).cache
    joinRDD.collect().foreach(println)
    inspectRDD(joinRDD)

    println(">>> left outer join of x with y")
    val leftJoin = x.leftOuterJoin(y).cache
    leftJoin.collect().foreach(println)
    inspectRDD(leftJoin)

    println(">>> right outer join of x with y")
    val rightJoin = x.rightOuterJoin(y).cache
    rightJoin.collect().foreach(println)
    inspectRDD(rightJoin)
  }

  def inspectRDD[T](rdd: RDD[T]): Unit = {
    println(">>> Partition length...")
    rdd.mapPartitions(f => Iterator(f.length), true).foreach(println)

    println(">>> Partition data...")
    rdd.foreachPartition(f => f.foreach(println))
  }
}
Example 7
Source File: L6-14HBase.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.io.Text
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions
import org.json4s.DefaultFormats
import org.json4s.jvalue2extractable
import org.json4s.jvalue2monadic
import org.json4s.native.JsonMethods.parse
import org.json4s.string2JsonInput

object HBaseSinkApp {

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: HBaseSinkApp <appname> <hbaseMaster> <tableName> <columnFamilyName> <columnName>")
      System.exit(1)
    }

    val Seq(appName, hbaseMaster, tableName, columnFamilyName, columnName) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val batchInterval = 10
    val windowSize = 20
    val slideInterval = 10

    val ssc = new StreamingContext(conf, Seconds(batchInterval))

    HttpUtils.createStream(ssc, url = "https://query.yahooapis.com/v1/public/yql?q=select%20*%20from%20yahoo.finance.quotes%20where%20symbol%20in%20(%22IBM,GOOG,MSFT,AAPL,FB,ORCL,YHOO,TWTR,LNKD,INTC%22)%0A%09%09&format=json&diagnostics=true&env=http%3A%2F%2Fdatatables.org%2Falltables.env",
      interval = batchInterval)
      .flatMap(rec => {
        implicit val formats = DefaultFormats
        val query = parse(rec) \ "query"
        ((query \ "results" \ "quote").children)
          .map(rec => ((rec \ "symbol").extract[String], (rec \ "LastTradePriceOnly").extract[String].toFloat))
      })
      .reduceByKeyAndWindow((x: Float, y: Float) => (x + y), Seconds(windowSize), Seconds(slideInterval))
      .foreachRDD(rdd => {
        val hbaseConf = HBaseConfiguration.create()
        hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, tableName)
        hbaseConf.set("hbase.master", hbaseMaster)
        val jobConf = new Configuration(hbaseConf)
        jobConf.set("mapreduce.job.outputformat.class", classOf[TableOutputFormat[Text]].getName)
        rdd.map(rec => {
          val put = new Put(rec._1.getBytes)
          put.addColumn(columnFamilyName.getBytes, columnName.getBytes,
            Bytes.toBytes(rec._2 / (windowSize / batchInterval)))
          (rec._1, put)
        }).saveAsNewAPIHadoopDataset(jobConf)
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 8
Source File: L9-11CollabFilteringPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object CollabFilteringPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }

    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString),
          splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => ((r._1, r._2), 1))
      .reduceByKey(_ + _)
      .map(r => r._1._1.replace("subject", "") + delim + r._1._2 + delim + r._2)
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
}
Example 9
Source File: L9-12CollabFiltering.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.recommendation.ALS
import org.apache.spark.mllib.recommendation.Rating
import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CollabFilteringApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: CollabFilteringApp <appname> <batchInterval> <iPath>")
      System.exit(1)
    }

    val Seq(appName, batchInterval, iPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val ratingStream = ssc.textFileStream(iPath).map(_.split(" ") match {
      case Array(subject, activity, freq) =>
        Rating(subject.toInt, activity.toInt, freq.toDouble)
    })

    val rank = 10
    val numIterations = 10
    val lambda = 0.01
    ratingStream.foreachRDD(ratingRDD => {
      val testTrain = ratingRDD.randomSplit(Array(0.3, 0.7))
      val model = ALS.train(testTrain(1), rank, numIterations, lambda)
      val test = testTrain(0).map {
        case Rating(subject, activity, freq) => (subject, activity)
      }
      val prediction = model.predict(test)
      prediction.take(5).map(println)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 10
Source File: L9-13FPMiningPreprocessing.scala From prosparkstreaming with Apache License 2.0
package org.apress.prospark

import org.apache.hadoop.io.LongWritable
import org.apache.hadoop.io.Text
import org.apache.hadoop.mapred.FileSplit
import org.apache.hadoop.mapred.TextInputFormat
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.HadoopRDD
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

import com.google.common.io.Files

object FPMiningPreprocessingApp {

  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: FPMiningPreprocessingApp <appname> <inputpath> <outputpath>")
      System.exit(1)
    }

    val Seq(appName, iPath, oPath) = args.toSeq
    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val delim = " "

    val sc = new SparkContext(conf)
    sc.hadoopFile(iPath, classOf[TextInputFormat], classOf[LongWritable], classOf[Text],
      sc.defaultMinPartitions)
      .asInstanceOf[HadoopRDD[LongWritable, Text]]
      .mapPartitionsWithInputSplit((iSplit, iter) =>
        iter.map(splitAndLine => (Files.getNameWithoutExtension(iSplit.asInstanceOf[FileSplit].getPath.toString),
          splitAndLine._2.toString.split(" ")(1))))
      .filter(r => r._2 != "0")
      .map(r => (r._1, r._2))
      .distinct()
      .groupByKey()
      .map(r => r._2.mkString(" "))
      .sample(false, 0.7)
      .coalesce(1)
      .saveAsTextFile(oPath)
  }
}
Example 11
Source File: Consensus.scala From spark-tutorial with Apache License 2.0
package se.uu.farmbio.tutorial

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object Consensus {

  def main(args: Array[String]) = {

    //Start the Spark context
    val conf = new SparkConf()
      .setAppName("Consensus")
      .setMaster("local")
    val sc = new SparkContext(conf)

    //Read DNA sequences
    val dnaRDD = sc.textFile("dna.txt")

      (key, maxCountPair._1)
    }

    //Now we can format and save the consensus
    consensusRDD.sortBy { //first we sort by position (key)
      case (key, mostFrequent) => key
    }.map { //now we can get rid of the position, which is no longer needed
      case (position, mostFrequent) => mostFrequent
    }.saveAsTextFile("dna.consensus.txt") //finally we save to a text file

    //Stop the Spark context
    sc.stop
  }

}
Example 12
Source File: Quickstart.scala From spark1.52 with Apache License 2.0
package org.apache.spark.examples.sparkGuide

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToPairRDDFunctions

object Quickstart {

  def main(args: Array[String]) {
    val conf = new SparkConf().setMaster("local").setAppName("Spark Exercise:Average Age Calculator")
    val sc = new SparkContext(conf)

    val textFile = sc.textFile("README.md")
    val count = textFile.count()
    val first = textFile.first()

    val linesWithSpark = textFile.filter(line => line.contains("Spark"))
    linesWithSpark.foreach { x => println _ }

    val filterCount = textFile.filter(line => line.contains("Spark")).count()
    textFile.map(line => line.split(" ").size).foreach { x => println _ }

    val max = textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b)
    val Mathmax = textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))

    // reduceByKey sums the values of all elements that share the same key
    val wordCounts = textFile.flatMap(line => line.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)
    // collect returns the RDD's contents to the driver as a Scala array
    wordCounts.collect()

    linesWithSpark.cache()
    linesWithSpark.count()
    linesWithSpark.count()
  }
}