org.apache.spark.RangePartitioner Scala Examples
The following examples show how to use org.apache.spark.RangePartitioner.
Each example notes its source file, the project it comes from, and that project's license.
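Before the individual examples, a minimal self-contained sketch of the pattern most of them share: build a RangePartitioner from a pair RDD (it samples the keys to choose range boundaries) and apply it with partitionBy. The object name and data below are illustrative only, not taken from any of the projects that follow.

import org.apache.spark.{RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession

object RangePartitionerSketch {
  def main(args: Array[String]): Unit = {
    // Local SparkContext purely for illustration.
    val sc: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

    val pairs = sc.parallelize(Seq(("a", 1), ("m", 2), ("z", 3), ("k", 4)))

    // RangePartitioner samples the keys of `pairs` to choose range boundaries,
    // so each partition receives a contiguous, roughly equal slice of the key range.
    val partitioner = new RangePartitioner(2, pairs)

    val ranged = pairs.partitionBy(partitioner)
    println(ranged.partitioner) // Some(RangePartitioner)
  }
}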
Example 1
Source File: OrderedRDDFunctions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

// Excerpt from OrderedRDDFunctions: restrict the RDD to keys in [lower, upper].
// If the RDD is already range-partitioned, only the partitions that can contain
// such keys are kept (partition pruning); otherwise every partition is scanned.
def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) =>
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
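This method backs RDD.filterByRange, which becomes available on key-value RDDs with an ordered key through an implicit conversion. A brief usage sketch (illustrative data, assuming an existing SparkContext named sc) showing why the partition pruning matters once the RDD has been sorted, and hence range-partitioned, by sortByKey:

// Assumes an existing SparkContext `sc`; the data is illustrative only.
val kv = sc.parallelize(Seq((1, "a"), (5, "b"), (9, "c"), (42, "d")))

// sortByKey range-partitions the RDD, so the RangePartitioner branch above applies
// and filterByRange only touches partitions that can contain keys in [2, 10].
val sorted = kv.sortByKey()
val inRange = sorted.filterByRange(2, 10)

inRange.collect().foreach(println) // (5,b), (9,c)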
Example 2
Source File: CCDriver.scala From connected-component with MIT License
package com.kwartile.lib.cc

import org.apache.spark.{RangePartitioner, SparkConf, SparkContext}

      // Excerpt from the driver: cc2 is the pair RDD of connected-component
      // candidates built earlier in this method.
      val rangePartitioner = new RangePartitioner(cc2.getNumPartitions, cc2)
      val connectedComponents = cc2.reduceByKey(rangePartitioner, (a, b) => {b ::: a})
      //connectedComponents.mapPartitionsWithIndex((index, iter) => {
      //  iter.toList.map(x => (index, x._1, x._2.size)).iterator
      //  }).collect.foreach(println)
      println("connected components")
      connectedComponents
        .map(x => (x._2.length).toString + " " + x._1 + " " + x._2.sorted.mkString(" "))
        .saveAsTextFile(cliqueFile + "_cc_out")
    } else {
      println("Max iteration reached. Could not converge")
    }
  }
}
Example 3
Source File: UsePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.{HashPartitioner, RangePartitioner, SparkContext}
import org.apache.spark.sql.SparkSession
import org.scalatest.FunSuite
import org.scalatest.Matchers._

class UsePartitioner extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use different partitioners") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.userId)

    //when, then
    val partitioner = keyed.partitioner
    assert(partitioner.isEmpty)

    val hashPartitioner = keyed.partitionBy(new HashPartitioner(100))
    println(hashPartitioner)
    assert(hashPartitioner.partitioner.isDefined)

    val rangePartitioner = keyed.partitionBy(new RangePartitioner(100, keyed))
    println(rangePartitioner)
    assert(rangePartitioner.partitioner.isDefined)
  }
}
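The test only asserts that a partitioner is defined. To make the difference between the two partitioners visible, a small hedged follow-up (reusing keyed and the imports from the test above; output formatting is illustrative) that prints which partition each key lands in:

    // Reuses `keyed` (RDD[(String, UserTransaction)]) from the test above.
    val byHash = keyed.partitionBy(new HashPartitioner(3))
    val byRange = keyed.partitionBy(new RangePartitioner(3, keyed))

    // HashPartitioner scatters keys by hashCode; RangePartitioner keeps each
    // partition to a contiguous slice of the sorted key range.
    byHash.mapPartitionsWithIndex((i, it) => it.map { case (k, _) => (i, k) })
      .collect().foreach(println)
    byRange.mapPartitionsWithIndex((i, it) => it.map { case (k, _) => (i, k) })
      .collect().foreach(println)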
Example 4
Source File: CustomRangePartitioner.scala From Hands-On-Big-Data-Analytics-with-PySpark with MIT License
package com.tomekl007.chapter_5

import com.tomekl007.UserTransaction
import org.apache.spark.sql.SparkSession
import org.apache.spark.{HashPartitioner, Partitioner, RangePartitioner, SparkContext}
import org.scalatest.FunSuite

class CustomRangePartitionerTest extends FunSuite {
  val spark: SparkContext = SparkSession.builder().master("local[2]").getOrCreate().sparkContext

  test("should use custom range partitioner") {
    //given
    val keysWithValuesList = Array(
      UserTransaction("A", 100),
      UserTransaction("B", 4),
      UserTransaction("A", 100001),
      UserTransaction("B", 10),
      UserTransaction("C", 10)
    )
    val data = spark.parallelize(keysWithValuesList)
    val keyed = data.keyBy(_.amount)

    //when, then
    val partitioned = keyed.partitionBy(
      new CustomRangePartitioner(List((0, 100), (100, 10000), (10000, 1000000))))

    //then
    partitioned.collect().toList
  }
}

class CustomRangePartitioner(ranges: List[(Int, Int)]) extends Partitioner {
  override def numPartitions: Int = ranges.size

  override def getPartition(key: Any): Int = {
    if (!key.isInstanceOf[Int]) {
      throw new IllegalArgumentException("partitioner works only for Int type")
    }
    val keyInt = key.asInstanceOf[Int]
    val index = ranges.lastIndexWhere(v => keyInt >= v._1 && keyInt <= v._2)
    println(s"for key: $key return $index")
    index
  }
}
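For comparison, the built-in RangePartitioner produces a similar layout without hand-picked boundaries, because it samples the keys of the RDD it is given. A hedged sketch reusing keyed and the imports from the test above:

    // Reuses `keyed` (RDD[(Int, UserTransaction)] keyed by amount) from the test above.
    // Three partitions, with boundaries derived by sampling the amounts instead of the
    // fixed (0,100), (100,10000), (10000,1000000) ranges used by CustomRangePartitioner.
    val sampledRanges = keyed.partitionBy(new RangePartitioner(3, keyed))
    sampledRanges.collect().toList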
Example 5
Source File: OrderedRDDFunctions.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) =>
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
Example 6
Source File: StratifiedRepartition.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasLabelCol, Wrappable}
import org.apache.spark.RangePartitioner
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.HasSeed
import org.apache.spark.ml.util._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

  // Excerpt from the StratifiedRepartition transformer (class declaration omitted by the snippet).
  override def transform(dataset: Dataset[_]): DataFrame = {
    // Count unique values in label column
    val distinctLabelCounts = dataset.select(getLabelCol).groupBy(getLabelCol).count().collect()
    val labelToCount = distinctLabelCounts.map(row => (row.getInt(0), row.getLong(1)))
    val labelToFraction = getMode match {
      case SPConstants.Equal => getEqualLabelCount(labelToCount, dataset)
      case SPConstants.Mixed =>
        val equalLabelToCount = getEqualLabelCount(labelToCount, dataset)
        val normalizedRatio = equalLabelToCount.map { case (label, count) => count }.sum / labelToCount.length
        labelToCount.map { case (label, count) => (label, count / normalizedRatio) }.toMap
      case SPConstants.Original => labelToCount.map { case (label, count) => (label, 1.0) }.toMap
      case _ => throw new Exception(s"Unknown mode specified to StratifiedRepartition: $getMode")
    }
    val labelColIndex = dataset.schema.fieldIndex(getLabelCol)
    val spdata = dataset.toDF().rdd.keyBy(row => row.getInt(labelColIndex))
      .sampleByKeyExact(true, labelToFraction, getSeed)
      .mapPartitions(keyToRow => keyToRow.zipWithIndex.map { case ((key, row), index) => (index, row) })
    val rangePartitioner = new RangePartitioner(dataset.rdd.getNumPartitions, spdata)
    val rspdata = spdata.partitionBy(rangePartitioner)
      .mapPartitions(keyToRow => keyToRow.map { case (key, row) => row }).persist()
    dataset.sqlContext.createDataFrame(rspdata, dataset.schema)
  }

  private def getEqualLabelCount(labelToCount: Array[(Int, Long)], dataset: Dataset[_]): Map[Int, Double] = {
    val maxLabelCount = Math.max(labelToCount.map { case (label, count) => count }.max, dataset.rdd.getNumPartitions)
    labelToCount.map { case (label, count) => (label, maxLabelCount.toDouble / count) }.toMap
  }

  def transformSchema(schema: StructType): StructType = schema

  def copy(extra: ParamMap): DropColumns = defaultCopy(extra)
}
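The RangePartitioner usage in transform above follows a pattern worth isolating: after stratified sampling, every row is re-keyed by its index within its partition, and range-partitioning on that index spreads the rows roughly evenly across the target partitions. A minimal hedged sketch of the same pattern on a plain RDD (names and data are illustrative, assuming an existing SparkContext sc):

import org.apache.spark.RangePartitioner

// `rows` stands in for the sampled rows in transform() above.
val rows = sc.parallelize(Seq("a", "b", "c", "d", "e", "f"), 2)

// Key each row by its index within its partition, as transform() does.
val indexed = rows.mapPartitions(_.zipWithIndex.map { case (row, index) => (index, row) })

// Range-partitioning on that index then balances rows across the target partitions.
val balanced = indexed.partitionBy(new RangePartitioner(rows.getNumPartitions, indexed))
  .mapPartitions(_.map { case (_, row) => row })

// Print how many rows each partition ended up with.
println(balanced.glom().map(_.length).collect().mkString(","))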
Example 7
Source File: OrderedRDDFunctions.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) =>
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
Example 8
Source File: OrderedRDDFunctions.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) => {
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    }
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
Example 9
Source File: OrderedRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) =>
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
Example 10
Source File: OrderedRDDFunctions.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
  def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

  val rddToFilter: RDD[P] = self.partitioner match {
    case Some(rp: RangePartitioner[K, V]) => {
      val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
        case (l, u) => Math.min(l, u) to Math.max(l, u)
      }
      PartitionPruningRDD.create(self, partitionIndicies.contains)
    }
    case _ =>
      self
  }

  rddToFilter.filter { case (k, v) => inRange(k) }
}
Example 11
Source File: PartitionBy.scala From learning-spark with Apache License 2.0
package com.javachen.spark.examples.rdd

import org.apache.spark.{RangePartitioner, HashPartitioner, SparkContext}

object PartitionBy {
  def main(args: Array[String]) {
    val sc = new SparkContext("local", "ReduceByKeyToDriver Test")
    val data1 = Array[(String, Int)](("K", 1), ("T", 2), ("T", 3), ("W", 4), ("W", 5), ("W", 6))
    val pairs = sc.parallelize(data1, 3)
    //val result = pairs.reduce((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    //val result = pairs.fold(("K0",10))((A, B) => (A._1 + "#" + B._1, A._2 + B._2))
    var result = pairs.partitionBy(new RangePartitioner(2, pairs, true))
    // Note: the HashPartitioner result overwrites the RangePartitioner result before printing.
    result = pairs.partitionBy(new HashPartitioner(2))
    result.foreach(println)
  }
}
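Example 11 passes a third constructor argument, the ascending flag (true above). A hedged sketch (reusing pairs and the imports from the example; illustrative only) of the descending variant, plus a way to inspect where keys end up:

    // Reuses `pairs` from the example above.
    // ascending = false reverses the order of the ranges, so the "largest" keys
    // go to partition 0.
    val descending = pairs.partitionBy(new RangePartitioner(2, pairs, false))
    descending
      .mapPartitionsWithIndex((index, iter) => iter.map { case (k, v) => s"partition=$index key=$k" })
      .collect()
      .foreach(println)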