org.apache.spark.sql.catalyst.expressions.UnsafeRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.UnsafeRow.
Each example links to the original project and source file, so you can view the surrounding code in context.
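Before the examples, here is a minimal sketch (not taken from any project below, and dependent on the Spark version) of how an UnsafeRow is typically obtained from a generic InternalRow via UnsafeProjection; the schema and values are illustrative only.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(StructField("id", LongType), StructField("name", StringType)))
val toUnsafe = UnsafeProjection.create(schema)
// The projection reuses its output buffer, so copy() the row before storing it.
val row: UnsafeRow = toUnsafe(InternalRow(1L, UTF8String.fromString("a"))).copy()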
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: RPCContinuousShuffleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle

import scala.concurrent.Future
import scala.concurrent.duration.Duration

import org.apache.spark.Partitioner
import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.util.ThreadUtils

class RPCContinuousShuffleWriter(
    writerId: Int,
    outputPartitioner: Partitioner,
    endpoints: Array[RpcEndpointRef]) extends ContinuousShuffleWriter {

  if (outputPartitioner.numPartitions != 1) {
    throw new IllegalArgumentException("multiple readers not yet supported")
  }

  if (outputPartitioner.numPartitions != endpoints.length) {
    throw new IllegalArgumentException(s"partitioner size ${outputPartitioner.numPartitions} did " +
      s"not match endpoint count ${endpoints.length}")
  }

  def write(epoch: Iterator[UnsafeRow]): Unit = {
    while (epoch.hasNext) {
      val row = epoch.next()
      endpoints(outputPartitioner.getPartition(row)).askSync[Unit](ReceiverRow(writerId, row))
    }

    val futures = endpoints.map(_.ask[Unit](ReceiverEpochMarker(writerId))).toSeq
    implicit val ec = ThreadUtils.sameThread
    ThreadUtils.awaitResult(Future.sequence(futures), Duration.Inf)
  }
}
Example 3
Source File: IntervalTreeJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.genApp

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}

@DeveloperApi
case class IntervalTreeJoin(left: SparkPlan,
                            right: SparkPlan,
                            condition: Seq[Expression],
                            context: SparkSession) extends BinaryExecNode {
  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)),
    List(condition(2), condition(3)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy())
    })
    val v2 = right.execute()
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (l: InternalRow, r: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow],
            r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    } else {
      val v3 = IntervalTreeJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (r: InternalRow, l: InternalRow) => {
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow],
            r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] //resultProj(joinedRow(l, r)) joiner.joiner
        }
      }
    }
  }
}
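Note that this example builds a fresh generated joiner for every output row; the more common pattern (used by Example 4 below) creates it once per partition and reuses it. A minimal sketch of that pattern, with made-up schemas and a placeholder function wrapping the row pair:

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

// Sketch: create the generated joiner once (ideally once per partition) and reuse it per row pair.
val leftSchema = StructType(Seq(StructField("start", IntegerType), StructField("end", IntegerType)))
val rightSchema = StructType(Seq(StructField("value", IntegerType)))
val joiner = GenerateUnsafeRowJoiner.create(leftSchema, rightSchema)

// leftRow and rightRow are assumed to be UnsafeRows matching the schemas above.
def joinPair(leftRow: UnsafeRow, rightRow: UnsafeRow): UnsafeRow = joiner.join(leftRow, rightRow)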
Example 4
Source File: IntervalTreeJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.genApp

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Expression, InterpretedProjection, UnsafeRow}
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.biodatageeks.sequila.rangejoins.genApp.Interval

@DeveloperApi
case class IntervalTreeJoinChromosome(left: SparkPlan,
                                      right: SparkPlan,
                                      condition: Seq[Expression],
                                      context: SparkSession) extends BinaryExecNode {
  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1), condition(4)),
    List(condition(2), condition(3), condition(5)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      ((v1Key.getString(2), new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy())
    })
    val v2 = right.execute()
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      ((v2Key.getString(2), new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.mapPartitions(p => {
        val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
        p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow]))
      })
    } else {
      val v3 = IntervalTreeJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.mapPartitions(p => {
        val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema)
        p.map(r => joiner.join(r._2.asInstanceOf[UnsafeRow], r._1.asInstanceOf[UnsafeRow]))
      })
    }
  }
}
Example 5
Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    rawIter.next()
  }
}

class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, 1)
    rowWriter.write(0, rawIter.next())

    uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize())
    uRow
  }
}

class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val numFields = set.schemaInfo.arity
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, numFields)

    val value = rawIter.nextLong()
    if (numFields == 2) {
      rowWriter.write(0, (value >> 32).toInt)
      rowWriter.write(1, value.toInt)
    } else {
      rowWriter.write(0, value)
    }
    uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize())
    uRow
  }
}

object HashSetRowIterator {
  def create(set: HashSet): Iterator[InternalRow] = {
    set match {
      //case set: UnsafeFixedWidthSet => set.iterator().asScala
      case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set)
      case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set)
      case set: ObjectHashSet => new ObjectHashSetRowIterator(set)
    }
  }
}
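These iterators reuse a single UnsafeRow backed by a shared BufferHolder, so callers that need to retain rows must copy them. A minimal sketch, assuming a populated IntKeysHashSet named intSet:

// Sketch: materializing the iterator's output requires copying, because `uRow` is reused per call.
val materialized: Array[InternalRow] =
  new IntKeysHashSetRowIterator(intSet).map(_.copy()).toArray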
Example 6
Source File: AggregateSetRDDMinMaxPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd

import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo
import edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates.{AggregateSetRDDPartition, KeyValueToInternalRowIterator, MonotonicAggregate}
import edu.ucla.cs.wis.bigdatalog.spark.storage.map.UnsafeFixedWidthMonotonicAggregationMap
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class AggregateSetRDDMinMaxPartition(aggregateStore: UnsafeFixedWidthMonotonicAggregationMap,
                                     schemaInfo: SchemaInfo,
                                     monotonicAggregate: MonotonicAggregate)
  extends AggregateSetRDDPartition(aggregateStore, schemaInfo, monotonicAggregate) with Serializable {

  def size: Int = aggregateStore.numElements()

  def iterator: Iterator[InternalRow] = {
    KeyValueToInternalRowIterator(aggregateStore.iterator(), monotonicAggregate.generateResultProjection())
  }

  // update() merges the results produced during the iteration into this partition
  // during update():
  //  - the underlying aggregateSetRDDPartition storage is updated.
  //  - a 2nd aggregateSetRDDPartition is produced indicating the rows that changed during the merge
  // This is similar to regular aggregation, just that we re-use the same hashmap each iteration
  def update(iter: Iterator[InternalRow],
             monotonicAggregate: MonotonicAggregate): (AggregateSetRDDPartition, SetRDDPartition[InternalRow]) = {

    val start = System.currentTimeMillis()
    val before = aggregateStore.numElements()

    // this is going to perform the aggregation and return an iterator over the output
    val maIter = monotonicAggregate.getAggregationIterator(iter, aggregateStore)

    logInfo("Update deltaSPrime set size before %s after %s, delta set size %s took %s ms"
      .format(before, aggregateStore.numElements(), maIter.deltaSet.size, System.currentTimeMillis() - start))

    val hashMapIter = new JavaHashMapIterator(maIter.deltaSet, monotonicAggregate.generateResultProjection())

    (new AggregateSetRDDMinMaxPartition(aggregateStore, schemaInfo, monotonicAggregate),
      SetRDDHashSetPartition(hashMapIter, schemaInfo))
  }
}

class JavaHashMapIterator(hashMap: java.util.HashMap[UnsafeRow, UnsafeRow],
                          resultProjection: (UnsafeRow, UnsafeRow) => UnsafeRow) extends Iterator[InternalRow] {
  val iterator = hashMap.entrySet().iterator()

  override def hasNext: Boolean = iterator.hasNext

  override def next: InternalRow = {
    val entry = iterator.next()
    resultProjection(entry.getKey, entry.getValue)
  }
}
Example 7
Source File: KinesisRecordToUnsafeRowConverter.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.unsafe.types.UTF8String

private[kinesis] class KinesisRecordToUnsafeRowConverter {
  private val rowWriter = new UnsafeRowWriter(5)

  def toUnsafeRow(record: Record, streamName: String): UnsafeRow = {
    rowWriter.reset()
    rowWriter.write(0, record.getData.array())
    rowWriter.write(1, UTF8String.fromString(streamName))
    rowWriter.write(2, UTF8String.fromString(record.getPartitionKey))
    rowWriter.write(3, UTF8String.fromString(record.getSequenceNumber))
    rowWriter.write(4, DateTimeUtils.fromJavaTimestamp(
      new java.sql.Timestamp(record.getApproximateArrivalTimestamp.getTime)))
    rowWriter.getRow
  }
}
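A usage sketch for the converter above. The record values are made up, and because the class is private[kinesis], the call site would have to live in the org.apache.spark.sql.kinesis package.

import java.nio.ByteBuffer
import com.amazonaws.services.kinesis.model.Record

val record = new Record()
  .withData(ByteBuffer.wrap("payload".getBytes("UTF-8")))
  .withPartitionKey("pk-1")
  .withSequenceNumber("49590338271490256608559692538361571095921575989136588898")
  .withApproximateArrivalTimestamp(new java.util.Date())

val converter = new KinesisRecordToUnsafeRowConverter
// The writer inside the converter is reused, so copy() the row if it is buffered downstream.
val row = converter.toUnsafeRow(record, "my-stream").copy()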
Example 8
Source File: CartesianProductExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.CompletionIterator

class UnsafeCartesianRDD(
    left: RDD[UnsafeRow],
    right: RDD[UnsafeRow],
    numFieldsOfRight: Int,
    inMemoryBufferThreshold: Int,
    spillThreshold: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold)

    val partition = split.asInstanceOf[CartesianPartition]
    rdd2.iterator(partition.s2, context).foreach(rowArray.add)

    // Create an iterator from rowArray
    def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator()

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, rowArray.clear())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(
      leftResults,
      rightResults,
      right.output.size,
      sqlContext.conf.cartesianProductExecBufferInMemoryThreshold,
      sqlContext.conf.cartesianProductExecBufferSpillThreshold)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 9
Source File: ObjectAggregationMap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate

import java.{util => ju}

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.internal.config
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate}
import org.apache.spark.sql.execution.UnsafeKVExternalSorter
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

  def dumpToExternalSorter(
      groupingAttributes: Seq[Attribute],
      aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = {
    val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes)
    val sorter = new UnsafeKVExternalSorter(
      StructType.fromAttributes(groupingAttributes),
      StructType.fromAttributes(aggBufferAttributes),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      TaskContext.get().taskMemoryManager().pageSizeBytes,
      SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD),
      null
    )

    val mapIterator = iterator
    val unsafeAggBufferProjection =
      UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray)

    while (mapIterator.hasNext) {
      val entry = mapIterator.next()
      aggregateFunctions.foreach {
        case agg: TypedImperativeAggregate[_] =>
          agg.serializeAggregateBufferInPlace(entry.aggregationBuffer)
        case _ =>
      }

      sorter.insertKV(
        entry.groupingKey,
        unsafeAggBufferProjection(entry.aggregationBuffer)
      )
    }

    hashMap.clear()
    sorter
  }

  def clear(): Unit = {
    hashMap.clear()
  }
}

// Stores the grouping key and aggregation buffer
class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 10
Source File: BufferHolderSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class BufferHolderSuite extends SparkFunSuite {

  test("SPARK-16071 Check the size limit to avoid integer overflow") {
    var e = intercept[UnsupportedOperationException] {
      new BufferHolder(new UnsafeRow(Int.MaxValue / 8))
    }
    assert(e.getMessage.contains("too many fields"))

    val holder = new BufferHolder(new UnsafeRow(1000))
    holder.reset()
    holder.grow(1000)
    e = intercept[UnsupportedOperationException] {
      holder.grow(Integer.MAX_VALUE)
    }
    assert(e.getMessage.contains("exceeds size limitation"))
  }
}
Example 11
Source File: BufferHolderSparkSubmitSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.scalatest.{BeforeAndAfterEach, Matchers}

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.unsafe.array.ByteArrayMethods
import org.apache.spark.util.ResetSystemProperties

// A test for growing the buffer holder to nearly 2GB. Due to the heap size limitation of the Spark
// unit tests JVM, the actual test code runs as a submit job.
class BufferHolderSparkSubmitSuite
  extends SparkFunSuite
    with Matchers
    with BeforeAndAfterEach
    with ResetSystemProperties {

  test("SPARK-22222: Buffer holder should be able to allocate memory larger than 1GB") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    val argsForSparkSubmit = Seq(
      "--class", BufferHolderSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--name", "SPARK-22222",
      "--master", "local-cluster[2,1,1024]",
      "--driver-memory", "4g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--conf", "spark.driver.extraJavaOptions=-ea",
      unusedJar.toString)
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

object BufferHolderSparkSubmitSuite {
  def main(args: Array[String]): Unit = {
    val ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH

    val holder = new BufferHolder(new UnsafeRow(1000))

    holder.reset()
    holder.grow(roundToWord(ARRAY_MAX / 2))

    holder.reset()
    holder.grow(roundToWord(ARRAY_MAX / 2 + 8))

    holder.reset()
    holder.grow(roundToWord(Integer.MAX_VALUE / 2))

    holder.reset()
    holder.grow(roundToWord(Integer.MAX_VALUE))
  }

  private def roundToWord(len: Int): Int = {
    ByteArrayMethods.roundNumberOfBytesToNearestWord(len)
  }
}
Example 12
Source File: CartesianProductExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.hadoop.security.UserGroupInformation

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  private[this] val user = UserGroupInformation.getCurrentUser.getShortUserName

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get(user).blockManager,
      SparkEnv.get(user).serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get(user).memoryManager.pageSizeBytes,
      SparkEnv.get(user).conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 13
Source File: BufferHolderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class BufferHolderSuite extends SparkFunSuite {

  test("SPARK-16071 Check the size limit to avoid integer overflow") {
    var e = intercept[UnsupportedOperationException] {
      new BufferHolder(new UnsafeRow(Int.MaxValue / 8))
    }
    assert(e.getMessage.contains("too many fields"))

    val holder = new BufferHolder(new UnsafeRow(1000))
    holder.reset()
    holder.grow(1000)
    e = intercept[UnsupportedOperationException] {
      holder.grow(Integer.MAX_VALUE)
    }
    assert(e.getMessage.contains("exceeds size limitation"))
  }
}
Example 14
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 15
Source File: BufferHolderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class BufferHolderSuite extends SparkFunSuite {

  test("SPARK-16071 Check the size limit to avoid integer overflow") {
    var e = intercept[UnsupportedOperationException] {
      new BufferHolder(new UnsafeRow(Int.MaxValue / 8))
    }
    assert(e.getMessage.contains("too many fields"))

    val holder = new BufferHolder(new UnsafeRow(1000))
    holder.reset()
    holder.grow(1000)
    e = intercept[UnsupportedOperationException] {
      holder.grow(Integer.MAX_VALUE)
    }
    assert(e.getMessage.contains("exceeds size limitation"))
  }
}
Example 16
Source File: Statistics.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.commons.codec.binary.Base64

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._

case class ColumnStat(statRow: InternalRow) {

  def forNumeric[T <: AtomicType](dataType: T): NumericColumnStat[T] = {
    NumericColumnStat(statRow, dataType)
  }
  def forString: StringColumnStat = StringColumnStat(statRow)
  def forBinary: BinaryColumnStat = BinaryColumnStat(statRow)
  def forBoolean: BooleanColumnStat = BooleanColumnStat(statRow)

  override def toString: String = {
    // use Base64 for encoding
    Base64.encodeBase64String(statRow.asInstanceOf[UnsafeRow].getBytes)
  }
}

object ColumnStat {
  def apply(numFields: Int, str: String): ColumnStat = {
    // use Base64 for decoding
    val bytes = Base64.decodeBase64(str)
    val unsafeRow = new UnsafeRow(numFields)
    unsafeRow.pointTo(bytes, bytes.length)
    ColumnStat(unsafeRow)
  }
}

case class NumericColumnStat[T <: AtomicType](statRow: InternalRow, dataType: T) {
  // The indices here must be consistent with `ColumnStatStruct.numericColumnStat`.
  val numNulls: Long = statRow.getLong(0)
  val max: T#InternalType = statRow.get(1, dataType).asInstanceOf[T#InternalType]
  val min: T#InternalType = statRow.get(2, dataType).asInstanceOf[T#InternalType]
  val ndv: Long = statRow.getLong(3)
}

case class StringColumnStat(statRow: InternalRow) {
  // The indices here must be consistent with `ColumnStatStruct.stringColumnStat`.
  val numNulls: Long = statRow.getLong(0)
  val avgColLen: Double = statRow.getDouble(1)
  val maxColLen: Long = statRow.getInt(2)
  val ndv: Long = statRow.getLong(3)
}

case class BinaryColumnStat(statRow: InternalRow) {
  // The indices here must be consistent with `ColumnStatStruct.binaryColumnStat`.
  val numNulls: Long = statRow.getLong(0)
  val avgColLen: Double = statRow.getDouble(1)
  val maxColLen: Long = statRow.getInt(2)
}

case class BooleanColumnStat(statRow: InternalRow) {
  // The indices here must be consistent with `ColumnStatStruct.booleanColumnStat`.
  val numNulls: Long = statRow.getLong(0)
  val numTrues: Long = statRow.getLong(1)
  val numFalses: Long = statRow.getLong(2)
}
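A round-trip sketch for the Base64 helpers above. Here stats is assumed to be an existing ColumnStat whose backing UnsafeRow was written with the four-field numeric layout.

import org.apache.spark.sql.types.IntegerType

// Sketch: encode the backing UnsafeRow as Base64, then rebuild an equivalent ColumnStat from it.
val encoded: String = stats.toString             // Base64 of the UnsafeRow bytes
val decoded: ColumnStat = ColumnStat(4, encoded) // numFields must match the encoded row
val numeric = decoded.forNumeric(IntegerType)
// numeric.numNulls, numeric.max, numeric.min and numeric.ndv mirror the original statistics.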
Example 17
Source File: RPCContinuousShuffleReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle

import java.util.concurrent._
import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.internal.Logging
import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint}
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.util.NextIterator

      override def getNext(): UnsafeRow = {
        var nextRow: UnsafeRow = null
        while (!finished && nextRow == null) {
          completion.poll(epochIntervalMs, TimeUnit.MILLISECONDS) match {
            case null =>
              // Try again if the poll didn't wait long enough to get a real result.
              // But we should be getting at least an epoch marker every checkpoint interval.
              val writerIdsUncommitted = writerEpochMarkersReceived.zipWithIndex.collect {
                case (flag, idx) if !flag => idx
              }
              logWarning(
                s"Completion service failed to make progress after $epochIntervalMs ms. Waiting " +
                  s"for writers ${writerIdsUncommitted.mkString(",")} to send epoch markers.")

            // The completion service guarantees this future will be available immediately.
            case future => future.get() match {
              case ReceiverRow(writerId, r) =>
                // Start reading the next element in the queue we just took from.
                completion.submit(completionTask(writerId))
                nextRow = r
              case ReceiverEpochMarker(writerId) =>
                // Don't read any more from this queue. If all the writers have sent epoch markers,
                // the epoch is over; otherwise we need to loop again to poll from the remaining
                // writers.
                writerEpochMarkersReceived(writerId) = true
                if (writerEpochMarkersReceived.forall(_ == true)) {
                  finished = true
                }
            }
          }
        }

        nextRow
      }

      override def close(): Unit = {
        executor.shutdownNow()
      }
    }
  }
}
Example 18
Source File: ContinuousShuffleReadRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle

import java.util.UUID

import org.apache.spark.{Partition, SparkContext, SparkEnv, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.rpc.RpcAddress
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.NextIterator

case class ContinuousShuffleReadPartition(
    index: Int,
    endpointName: String,
    queueSize: Int,
    numShuffleWriters: Int,
    epochIntervalMs: Long)
  extends Partition {

  // Initialized only on the executor, and only once even as we call compute() multiple times.
  lazy val (reader: ContinuousShuffleReader, endpoint) = {
    val env = SparkEnv.get.rpcEnv
    val receiver = new RPCContinuousShuffleReader(
      queueSize, numShuffleWriters, epochIntervalMs, env)
    val endpoint = env.setupEndpoint(endpointName, receiver)

    TaskContext.get().addTaskCompletionListener[Unit] { ctx =>
      env.stop(endpoint)
    }
    (receiver, endpoint)
  }
}

class ContinuousShuffleReadRDD(
    sc: SparkContext,
    numPartitions: Int,
    queueSize: Int = 1024,
    numShuffleWriters: Int = 1,
    epochIntervalMs: Long = 1000,
    val endpointNames: Seq[String] = Seq(s"RPCContinuousShuffleReader-${UUID.randomUUID()}"))
  extends RDD[UnsafeRow](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    (0 until numPartitions).map { partIndex =>
      ContinuousShuffleReadPartition(
        partIndex, endpointNames(partIndex), queueSize, numShuffleWriters, epochIntervalMs)
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[UnsafeRow] = {
    split.asInstanceOf[ContinuousShuffleReadPartition].reader.read()
  }
}
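A construction sketch for the RDD above, where sc is assumed to be an existing SparkContext; with the default single endpoint name, numPartitions must be 1 unless endpointNames is supplied explicitly.

// Sketch: one reader partition backed by an RPC endpoint with a bounded queue.
val readerRDD = new ContinuousShuffleReadRDD(sc, numPartitions = 1, queueSize = 256)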
Example 19
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous

import java.util.UUID

import org.apache.spark.{HashPartitioner, SparkEnv}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow}
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD}

case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan {
  override def output: Seq[Attribute] = child.output

  override def children: Seq[SparkPlan] = child :: Nil

  override def outputPartitioning: Partitioning = SinglePartition

  override def doExecute(): RDD[InternalRow] = {
    assert(numPartitions == 1)
    new ContinuousCoalesceRDD(
      sparkContext,
      numPartitions,
      conf.continuousStreamingExecutorQueueSize,
      sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong,
      child.execute())
  }
}
Example 20
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator

case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append,
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
      getStateInfo,
      keySchema,
      valueSchema,
      indexOrdinal = None,
      sqlContext.sessionState,
      Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val updatesStartTimeNs = System.nanoTime

      val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
      var cumulativeRowCount = preBatchRowCount

      val result = iter.filter { r =>
        val x = cumulativeRowCount < streamLimit
        if (x) {
          cumulativeRowCount += 1
        }
        x
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  private def getValueRow(value: Long): UnsafeRow = {
    UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value)))
  }
}
Example 21
Source File: CartesianProductExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, ExternalAppendOnlyUnsafeRowArray, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.util.CompletionIterator

class UnsafeCartesianRDD(
    left: RDD[UnsafeRow],
    right: RDD[UnsafeRow],
    numFieldsOfRight: Int,
    inMemoryBufferThreshold: Int,
    spillThreshold: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    val rowArray = new ExternalAppendOnlyUnsafeRowArray(inMemoryBufferThreshold, spillThreshold)

    val partition = split.asInstanceOf[CartesianPartition]
    rdd2.iterator(partition.s2, context).foreach(rowArray.add)

    // Create an iterator from rowArray
    def createIter(): Iterator[UnsafeRow] = rowArray.generateIterator()

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, rowArray.clear())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(
      leftResults,
      rightResults,
      right.output.size,
      sqlContext.conf.cartesianProductExecBufferInMemoryThreshold,
      sqlContext.conf.cartesianProductExecBufferSpillThreshold)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 22
Source File: ObjectAggregationMap.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate

import java.{util => ju}

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.internal.config
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate}
import org.apache.spark.sql.execution.UnsafeKVExternalSorter
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

  def dumpToExternalSorter(
      groupingAttributes: Seq[Attribute],
      aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = {
    val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes)
    val sorter = new UnsafeKVExternalSorter(
      StructType.fromAttributes(groupingAttributes),
      StructType.fromAttributes(aggBufferAttributes),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      TaskContext.get().taskMemoryManager().pageSizeBytes,
      SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD),
      null
    )

    val mapIterator = iterator
    val unsafeAggBufferProjection =
      UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray)

    while (mapIterator.hasNext) {
      val entry = mapIterator.next()
      aggregateFunctions.foreach {
        case agg: TypedImperativeAggregate[_] =>
          agg.serializeAggregateBufferInPlace(entry.aggregationBuffer)
        case _ =>
      }

      sorter.insertKV(
        entry.groupingKey,
        unsafeAggBufferProjection(entry.aggregationBuffer)
      )
    }

    hashMap.clear()
    sorter
  }

  def clear(): Unit = {
    hashMap.clear()
  }
}

// Stores the grouping key and aggregation buffer
class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 23
Source File: BufferHolderSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class BufferHolderSuite extends SparkFunSuite {

  test("SPARK-16071 Check the size limit to avoid integer overflow") {
    assert(intercept[UnsupportedOperationException] {
      new BufferHolder(new UnsafeRow(Int.MaxValue / 8))
    }.getMessage.contains("too many fields"))

    val holder = new BufferHolder(new UnsafeRow(1000))
    holder.reset()
    holder.grow(1000)
    assert(intercept[IllegalArgumentException] {
      holder.grow(Integer.MAX_VALUE)
    }.getMessage.contains("exceeds size limitation"))
  }
}
Example 24
Source File: BufferHolderSparkSubmitSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.unsafe.array.ByteArrayMethods
import org.apache.spark.util.ResetSystemProperties

// A test for growing the buffer holder to nearly 2GB. Due to the heap size limitation of the Spark
// unit tests JVM, the actual test code runs as a submit job.
class BufferHolderSparkSubmitSuite
  extends SparkFunSuite
    with Matchers
    with BeforeAndAfterEach
    with ResetSystemProperties {

  test("SPARK-22222: Buffer holder should be able to allocate memory larger than 1GB") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    val argsForSparkSubmit = Seq(
      "--class", BufferHolderSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--name", "SPARK-22222",
      "--master", "local-cluster[1,1,4096]",
      "--driver-memory", "4g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--conf", "spark.driver.extraJavaOptions=-ea",
      unusedJar.toString)
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

object BufferHolderSparkSubmitSuite extends Assertions {
  def main(args: Array[String]): Unit = {
    val ARRAY_MAX = ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH

    val unsafeRow = new UnsafeRow(1000)
    val holder = new BufferHolder(unsafeRow)

    holder.reset()

    assert(intercept[IllegalArgumentException] {
      holder.grow(-1)
    }.getMessage.contains("because the size is negative"))

    // while to reuse a buffer may happen, this test checks whether the buffer can be grown
    holder.grow(ARRAY_MAX / 2)
    assert(unsafeRow.getSizeInBytes % 8 == 0)

    holder.grow(ARRAY_MAX / 2 + 7)
    assert(unsafeRow.getSizeInBytes % 8 == 0)

    holder.grow(Integer.MAX_VALUE / 2)
    assert(unsafeRow.getSizeInBytes % 8 == 0)

    holder.grow(ARRAY_MAX - holder.totalSize())
    assert(unsafeRow.getSizeInBytes % 8 == 0)

    assert(intercept[IllegalArgumentException] {
      holder.grow(ARRAY_MAX + 1 - holder.totalSize())
    }.getMessage.contains("because the size after growing"))
  }
}
Example 25
Source File: ArrowConverters.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.arrow

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels

import org.apache.arrow.memory.BufferAllocator
import org.apache.arrow.vector._
import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel
import org.apache.arrow.vector.ipc.{ ArrowFileReader, ArrowFileWriter }
import org.apache.arrow.vector.ipc.message.ArrowRecordBatch
import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.types._
import com.twosigma.flint.util.Utils

trait ClosableIterator[T] extends Iterator[T] with AutoCloseable

class ConcatClosableIterator[T](iters: Iterator[ClosableIterator[T]])
  extends ClosableIterator[T] {
  var curIter: ClosableIterator[T] = _

  private def advance(): Unit = {
    require(curIter == null || !curIter.hasNext, "Should not advance if curIter is not empty")
    require(iters.hasNext, "Should not advance if iters doesn't have next")
    closeCurrent()
    curIter = iters.next()
  }

  private def closeCurrent(): Unit = if (curIter != null) curIter.close()

  override def close(): Unit = closeCurrent()

  override def hasNext: Boolean = {
    if (curIter == null || !curIter.hasNext) {
      if (iters.hasNext) {
        advance()
        hasNext
      } else {
        false
      }
    } else {
      true
    }
  }

  override def next(): T = curIter.next()
}

  def byteArrayToBatch(
    batchBytes: Array[Byte],
    allocator: BufferAllocator
  ): ArrowRecordBatch = {
    val in = new ByteArrayReadableSeekableByteChannel(batchBytes)
    val reader = new ArrowFileReader(in, allocator)

    // Read a batch from a byte stream, ensure the reader is closed
    Utils.tryWithSafeFinally {
      val root = reader.getVectorSchemaRoot // throws IOException
      val unloader = new VectorUnloader(root)
      reader.loadNextBatch() // throws IOException
      unloader.getRecordBatch
    } {
      reader.close()
    }
  }
}
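The ClosableIterator trait above is just Iterator plus AutoCloseable; here is a small sketch of adapting plain in-memory iterators so they can be chained with ConcatClosableIterator. The no-op close is an assumption for this sketch; real implementations would release Arrow buffers.

def asClosable[T](it: Iterator[T]): ClosableIterator[T] = new ClosableIterator[T] {
  override def hasNext: Boolean = it.hasNext
  override def next(): T = it.next()
  override def close(): Unit = () // nothing to release for an in-memory iterator
}

val all = new ConcatClosableIterator(Iterator(asClosable(Iterator(1, 2)), asClosable(Iterator(3))))
// all.toList == List(1, 2, 3); all.close() closes whichever underlying iterator is current.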
Example 26
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left: RDD[UnsafeRow], right: RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 27
Source File: BufferHolderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class BufferHolderSuite extends SparkFunSuite {

  test("SPARK-16071 Check the size limit to avoid integer overflow") {
    var e = intercept[UnsupportedOperationException] {
      new BufferHolder(new UnsafeRow(Int.MaxValue / 8))
    }
    assert(e.getMessage.contains("too many fields"))

    val holder = new BufferHolder(new UnsafeRow(1000))
    holder.reset()
    holder.grow(1000)
    e = intercept[UnsupportedOperationException] {
      holder.grow(Integer.MAX_VALUE)
    }
    assert(e.getMessage.contains("exceeds size limitation"))
  }
}