org.apache.spark.sql.catalyst.plans.physical.HashPartitioning Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.plans.physical.HashPartitioning.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: PartitioningSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning}

/**
 * Verifies that [[HashPartitioning]] treats the order of its hash expressions as significant.
 */
class PartitioningSuite extends SparkFunSuite {

  test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") {
    val hashExprs = Seq(Literal(2), Literal(3))

    // Same set of hash expressions and same partition count; only the expression order differs.
    val forward = HashPartitioning(hashExprs, 100)
    val reversed = HashPartitioning(hashExprs.reverse, 100)

    // The ordering difference alone makes the two partitionings unequal ...
    assert(forward != reversed)

    // ... even though each of them satisfies the same clustered distribution.
    val clustered = ClusteredDistribution(hashExprs)
    assert(forward.satisfies(clustered))
    assert(reversed.satisfies(clustered))

    // Projects the partitioning's expressions over an empty row and hashes the projected row.
    def projectedRowHash(partitioning: HashPartitioning): Int = {
      val projection = new InterpretedMutableProjection(partitioning.expressions, Seq.empty)
      projection(InternalRow.empty).hashCode()
    }

    // The same input row hashes differently under the two orderings.
    assert(projectedRowHash(forward) != projectedRowHash(reversed))

    // Hence neither partitioning is compatible with, or guarantees, the other.
    assert(!forward.compatibleWith(reversed))
    assert(!reversed.compatibleWith(forward))
    assert(!forward.guarantees(reversed))
    assert(!reversed.guarantees(forward))

    // Sanity check that these methods do not simply always return false: a partitioning
    // must still equal, guarantee and be compatible with itself.
    assert(forward === forward)
    assert(forward.guarantees(forward))
    assert(forward.compatibleWith(forward))
  }
}
Example 2
Source File: ExchangeSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, IdentityBroadcastMode, SinglePartition}
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange}
import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Tests for shuffle/broadcast exchange operators: answer correctness through a shuffle,
 * broadcast-mode compatibility, and `sameResult` comparisons between exchanges.
 */
class ExchangeSuite extends SparkPlanTest with SharedSQLContext {
  import testImplicits._

  test("shuffling UnsafeRows in exchange") {
    val tuples = (1 to 1000).map(Tuple1.apply)
    checkAnswer(
      tuples.toDF(),
      child => ShuffleExchange(SinglePartition, child),
      tuples.map(Row.fromTuple)
    )
  }

  test("compatible BroadcastMode") {
    val identityMode = IdentityBroadcastMode
    val hashedOnLong = HashedRelationBroadcastMode(Seq(Literal(1L)))
    val hashedOnString = HashedRelationBroadcastMode(Seq(Literal("s")))

    assert(identityMode.compatibleWith(identityMode))
    assert(!identityMode.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(identityMode))
    assert(hashedOnLong.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(hashedOnString))
    assert(hashedOnString.compatibleWith(hashedOnString))
  }

  test("BroadcastExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val identityExchange = BroadcastExchangeExec(IdentityBroadcastMode, plan)
    val hashedExchange = BroadcastExchangeExec(HashedRelationBroadcastMode(output), plan)
    val aliasedMode = HashedRelationBroadcastMode(Seq(Alias(output.head, "id2")()))
    val aliasedExchange = BroadcastExchangeExec(aliasedMode, plan)
    val reusedExchange = ReusedExchangeExec(output, aliasedExchange)

    // Every exchange has the same result as itself.
    assert(identityExchange.sameResult(identityExchange))
    assert(hashedExchange.sameResult(hashedExchange))
    assert(aliasedExchange.sameResult(aliasedExchange))
    assert(reusedExchange.sameResult(reusedExchange))

    // Exchanges built with different broadcast modes are not interchangeable.
    assert(!identityExchange.sameResult(hashedExchange))
    assert(!hashedExchange.sameResult(aliasedExchange))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!aliasedExchange.sameResult(reusedExchange))
    assert(reusedExchange.sameResult(aliasedExchange))
  }

  test("ShuffleExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val onePartition = HashPartitioning(output, 1)
    val shuffleA = ShuffleExchange(onePartition, plan)
    val shuffleB = ShuffleExchange(onePartition, plan)
    val twoPartitions = HashPartitioning(output, 2)
    val shuffleTwoParts = ShuffleExchange(twoPartitions, plan)
    val duplicatedKeys = HashPartitioning(output ++ output, 2)
    val shuffleDupKeys = ShuffleExchange(duplicatedKeys, plan)
    val reusedShuffle = ReusedExchangeExec(output, shuffleDupKeys)

    // Every exchange has the same result as itself.
    assert(shuffleA.sameResult(shuffleA))
    assert(shuffleB.sameResult(shuffleB))
    assert(shuffleTwoParts.sameResult(shuffleTwoParts))
    assert(shuffleDupKeys.sameResult(shuffleDupKeys))
    assert(reusedShuffle.sameResult(reusedShuffle))

    // Same partitioning over the same child: same result.
    assert(shuffleA.sameResult(shuffleB))

    // A different partition count or a different expression list: not the same result.
    assert(!shuffleB.sameResult(shuffleTwoParts))
    assert(!shuffleTwoParts.sameResult(shuffleDupKeys))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!shuffleDupKeys.sameResult(reusedShuffle))
    assert(reusedShuffle.sameResult(shuffleDupKeys))
  }
}
Example 3
Source File: PartitioningSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning}

/**
 * Verifies that [[HashPartitioning]] treats the order of its hash expressions as significant.
 */
class PartitioningSuite extends SparkFunSuite {

  test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") {
    val hashExprs = Seq(Literal(2), Literal(3))

    // Same set of hash expressions and same partition count; only the expression order differs.
    val forward = HashPartitioning(hashExprs, 100)
    val reversed = HashPartitioning(hashExprs.reverse, 100)

    // The ordering difference alone makes the two partitionings unequal ...
    assert(forward != reversed)

    // ... even though each of them satisfies the same clustered distribution.
    val clustered = ClusteredDistribution(hashExprs)
    assert(forward.satisfies(clustered))
    assert(reversed.satisfies(clustered))

    // Projects the partitioning's expressions over an empty row and hashes the projected row.
    def projectedRowHash(partitioning: HashPartitioning): Int = {
      val projection = new InterpretedMutableProjection(partitioning.expressions, Seq.empty)
      projection(InternalRow.empty).hashCode()
    }

    // The same input row hashes differently under the two orderings.
    assert(projectedRowHash(forward) != projectedRowHash(reversed))

    // Hence neither partitioning is compatible with, or guarantees, the other.
    assert(!forward.compatibleWith(reversed))
    assert(!reversed.compatibleWith(forward))
    assert(!forward.guarantees(reversed))
    assert(!reversed.guarantees(forward))

    // Sanity check that these methods do not simply always return false: a partitioning
    // must still equal, guarantee and be compatible with itself.
    assert(forward === forward)
    assert(forward.guarantees(forward))
    assert(forward.compatibleWith(forward))
  }
}
Example 4
Source File: ExchangeSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, IdentityBroadcastMode, SinglePartition}
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange}
import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Tests for shuffle/broadcast exchange operators: answer correctness through a shuffle,
 * broadcast-mode compatibility, and `sameResult` comparisons between exchanges.
 */
class ExchangeSuite extends SparkPlanTest with SharedSQLContext {
  import testImplicits._

  test("shuffling UnsafeRows in exchange") {
    val tuples = (1 to 1000).map(Tuple1.apply)
    checkAnswer(
      tuples.toDF(),
      child => ShuffleExchange(SinglePartition, child),
      tuples.map(Row.fromTuple)
    )
  }

  test("compatible BroadcastMode") {
    val identityMode = IdentityBroadcastMode
    val hashedOnLong = HashedRelationBroadcastMode(Seq(Literal(1L)))
    val hashedOnString = HashedRelationBroadcastMode(Seq(Literal("s")))

    assert(identityMode.compatibleWith(identityMode))
    assert(!identityMode.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(identityMode))
    assert(hashedOnLong.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(hashedOnString))
    assert(hashedOnString.compatibleWith(hashedOnString))
  }

  test("BroadcastExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val identityExchange = BroadcastExchangeExec(IdentityBroadcastMode, plan)
    val hashedExchange = BroadcastExchangeExec(HashedRelationBroadcastMode(output), plan)
    val aliasedMode = HashedRelationBroadcastMode(Seq(Alias(output.head, "id2")()))
    val aliasedExchange = BroadcastExchangeExec(aliasedMode, plan)
    val reusedExchange = ReusedExchangeExec(output, aliasedExchange)

    // Every exchange has the same result as itself.
    assert(identityExchange.sameResult(identityExchange))
    assert(hashedExchange.sameResult(hashedExchange))
    assert(aliasedExchange.sameResult(aliasedExchange))
    assert(reusedExchange.sameResult(reusedExchange))

    // Exchanges built with different broadcast modes are not interchangeable.
    assert(!identityExchange.sameResult(hashedExchange))
    assert(!hashedExchange.sameResult(aliasedExchange))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!aliasedExchange.sameResult(reusedExchange))
    assert(reusedExchange.sameResult(aliasedExchange))
  }

  test("ShuffleExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val onePartition = HashPartitioning(output, 1)
    val shuffleA = ShuffleExchange(onePartition, plan)
    val shuffleB = ShuffleExchange(onePartition, plan)
    val twoPartitions = HashPartitioning(output, 2)
    val shuffleTwoParts = ShuffleExchange(twoPartitions, plan)
    val duplicatedKeys = HashPartitioning(output ++ output, 2)
    val shuffleDupKeys = ShuffleExchange(duplicatedKeys, plan)
    val reusedShuffle = ReusedExchangeExec(output, shuffleDupKeys)

    // Every exchange has the same result as itself.
    assert(shuffleA.sameResult(shuffleA))
    assert(shuffleB.sameResult(shuffleB))
    assert(shuffleTwoParts.sameResult(shuffleTwoParts))
    assert(shuffleDupKeys.sameResult(shuffleDupKeys))
    assert(reusedShuffle.sameResult(reusedShuffle))

    // Same partitioning over the same child: same result.
    assert(shuffleA.sameResult(shuffleB))

    // A different partition count or a different expression list: not the same result.
    assert(!shuffleB.sameResult(shuffleTwoParts))
    assert(!shuffleTwoParts.sameResult(shuffleDupKeys))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!shuffleDupKeys.sameResult(reusedShuffle))
    assert(reusedShuffle.sameResult(shuffleDupKeys))
  }
}
Example 5
Source File: PartitioningSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning}

/**
 * Verifies that [[HashPartitioning]] treats the order of its hash expressions as significant.
 */
class PartitioningSuite extends SparkFunSuite {

  test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") {
    val hashExprs = Seq(Literal(2), Literal(3))

    // Same set of hash expressions and same partition count; only the expression order differs.
    val forward = HashPartitioning(hashExprs, 100)
    val reversed = HashPartitioning(hashExprs.reverse, 100)

    // The ordering difference alone makes the two partitionings unequal ...
    assert(forward != reversed)

    // ... even though each of them satisfies the same clustered distribution.
    val clustered = ClusteredDistribution(hashExprs)
    assert(forward.satisfies(clustered))
    assert(reversed.satisfies(clustered))

    // Projects the partitioning's expressions over an empty row and hashes the projected row.
    def projectedRowHash(partitioning: HashPartitioning): Int = {
      val projection = new InterpretedMutableProjection(partitioning.expressions, Seq.empty)
      projection(InternalRow.empty).hashCode()
    }

    // The same input row hashes differently under the two orderings.
    assert(projectedRowHash(forward) != projectedRowHash(reversed))

    // Hence neither partitioning is compatible with, or guarantees, the other.
    assert(!forward.compatibleWith(reversed))
    assert(!reversed.compatibleWith(forward))
    assert(!forward.guarantees(reversed))
    assert(!reversed.guarantees(forward))

    // Sanity check that these methods do not simply always return false: a partitioning
    // must still equal, guarantee and be compatible with itself.
    assert(forward === forward)
    assert(forward.guarantees(forward))
    assert(forward.compatibleWith(forward))
  }
}
Example 6
Source File: ExchangeSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Alias, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, IdentityBroadcastMode, SinglePartition}
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ReusedExchangeExec, ShuffleExchange}
import org.apache.spark.sql.execution.joins.HashedRelationBroadcastMode
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Tests for shuffle/broadcast exchange operators: answer correctness through a shuffle,
 * broadcast-mode compatibility, and `sameResult` comparisons between exchanges.
 * In this variant `ReusedExchangeExec` is constructed with `sparkContext.sparkUser` as a
 * third argument.
 */
class ExchangeSuite extends SparkPlanTest with SharedSQLContext {
  import testImplicits._

  test("shuffling UnsafeRows in exchange") {
    val tuples = (1 to 1000).map(Tuple1.apply)
    checkAnswer(
      tuples.toDF(),
      child => ShuffleExchange(SinglePartition, child),
      tuples.map(Row.fromTuple)
    )
  }

  test("compatible BroadcastMode") {
    val identityMode = IdentityBroadcastMode
    val hashedOnLong = HashedRelationBroadcastMode(Seq(Literal(1L)))
    val hashedOnString = HashedRelationBroadcastMode(Seq(Literal("s")))

    assert(identityMode.compatibleWith(identityMode))
    assert(!identityMode.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(identityMode))
    assert(hashedOnLong.compatibleWith(hashedOnLong))
    assert(!hashedOnLong.compatibleWith(hashedOnString))
    assert(hashedOnString.compatibleWith(hashedOnString))
  }

  test("BroadcastExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val identityExchange = BroadcastExchangeExec(IdentityBroadcastMode, plan)
    val hashedExchange = BroadcastExchangeExec(HashedRelationBroadcastMode(output), plan)
    val aliasedMode = HashedRelationBroadcastMode(Seq(Alias(output.head, "id2")()))
    val aliasedExchange = BroadcastExchangeExec(aliasedMode, plan)
    val reusedExchange = ReusedExchangeExec(output, aliasedExchange, sparkContext.sparkUser)

    // Every exchange has the same result as itself.
    assert(identityExchange.sameResult(identityExchange))
    assert(hashedExchange.sameResult(hashedExchange))
    assert(aliasedExchange.sameResult(aliasedExchange))
    assert(reusedExchange.sameResult(reusedExchange))

    // Exchanges built with different broadcast modes are not interchangeable.
    assert(!identityExchange.sameResult(hashedExchange))
    assert(!hashedExchange.sameResult(aliasedExchange))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!aliasedExchange.sameResult(reusedExchange))
    assert(reusedExchange.sameResult(aliasedExchange))
  }

  test("ShuffleExchange same result") {
    val plan = spark.range(10).queryExecution.executedPlan
    val output = plan.output
    assert(plan.sameResult(plan))

    val onePartition = HashPartitioning(output, 1)
    val shuffleA = ShuffleExchange(onePartition, plan)
    val shuffleB = ShuffleExchange(onePartition, plan)
    val twoPartitions = HashPartitioning(output, 2)
    val shuffleTwoParts = ShuffleExchange(twoPartitions, plan)
    val duplicatedKeys = HashPartitioning(output ++ output, 2)
    val shuffleDupKeys = ShuffleExchange(duplicatedKeys, plan)
    val reusedShuffle = ReusedExchangeExec(output, shuffleDupKeys, sparkContext.sparkUser)

    // Every exchange has the same result as itself.
    assert(shuffleA.sameResult(shuffleA))
    assert(shuffleB.sameResult(shuffleB))
    assert(shuffleTwoParts.sameResult(shuffleTwoParts))
    assert(shuffleDupKeys.sameResult(shuffleDupKeys))
    assert(reusedShuffle.sameResult(reusedShuffle))

    // Same partitioning over the same child: same result.
    assert(shuffleA.sameResult(shuffleB))

    // A different partition count or a different expression list: not the same result.
    assert(!shuffleB.sameResult(shuffleTwoParts))
    assert(!shuffleTwoParts.sameResult(shuffleDupKeys))

    // The comparison with a reused exchange is asserted to hold in one direction only.
    assert(!shuffleDupKeys.sameResult(reusedShuffle))
    assert(reusedShuffle.sameResult(shuffleDupKeys))
  }
}
Example 7
Source File: Exchange.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, Expression, SortOrder}
import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

/**
 * Physical-plan rule that looks for exchanges reporting `sameResult` against an exchange seen
 * earlier in the bottom-up traversal and replaces each later one with a [[ReusedExchangeExec]]
 * pointing at the earlier instance.
 *
 * @param conf session configuration; the rule is a no-op unless `exchangeReuseEnabled` is set
 */
case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  /**
   * @param plan the physical plan to rewrite
   * @return `plan` unchanged when exchange reuse is disabled; otherwise the plan with duplicate
   *         exchanges replaced by [[ReusedExchangeExec]] nodes
   */
  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      plan
    } else {
      // Bucket the exchanges seen so far by schema so that `sameResult` is only evaluated
      // against candidates sharing a schema, avoiding O(N*N) sameResult calls.
      val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
      plan.transformUp {
        case exchange: Exchange =>
          // Exchanges with the same result usually also have the same schema (column names).
          val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
          sameSchema.find(candidate => exchange.sameResult(candidate)) match {
            case Some(existing) =>
              // Keep this exchange's own output: the plans above it need those attributes
              // to resolve.
              ReusedExchangeExec(exchange.output, existing)
            case None =>
              // First exchange producing this result: remember it as a reuse candidate.
              sameSchema += exchange
              exchange
          }
      }
    }
  }
}
Example 8
Source File: PartitioningSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{InterpretedMutableProjection, Literal}
import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, HashPartitioning}

/**
 * Verifies that [[HashPartitioning]] treats the order of its hash expressions as significant.
 */
class PartitioningSuite extends SparkFunSuite {

  test("HashPartitioning compatibility should be sensitive to expression ordering (SPARK-9785)") {
    val hashExprs = Seq(Literal(2), Literal(3))

    // Same set of hash expressions and same partition count; only the expression order differs.
    val forward = HashPartitioning(hashExprs, 100)
    val reversed = HashPartitioning(hashExprs.reverse, 100)

    // The ordering difference alone makes the two partitionings unequal ...
    assert(forward != reversed)

    // ... even though each of them satisfies the same clustered distribution.
    val clustered = ClusteredDistribution(hashExprs)
    assert(forward.satisfies(clustered))
    assert(reversed.satisfies(clustered))

    // Projects the partitioning's expressions over an empty row and hashes the projected row.
    def projectedRowHash(partitioning: HashPartitioning): Int = {
      val projection = new InterpretedMutableProjection(partitioning.expressions, Seq.empty)
      projection(InternalRow.empty).hashCode()
    }

    // The same input row hashes differently under the two orderings.
    assert(projectedRowHash(forward) != projectedRowHash(reversed))

    // Hence neither partitioning is compatible with, or guarantees, the other.
    assert(!forward.compatibleWith(reversed))
    assert(!reversed.compatibleWith(forward))
    assert(!forward.guarantees(reversed))
    assert(!reversed.guarantees(forward))

    // Sanity check that these methods do not simply always return false: a partitioning
    // must still equal, guarantee and be compatible with itself.
    assert(forward === forward)
    assert(forward.guarantees(forward))
    assert(forward.compatibleWith(forward))
  }
}