org.apache.spark.sql.catalyst.InternalRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.InternalRow.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat} import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} import org.apache.spark.sql.catalyst.util.CompressionCodecs import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.SerializableConfiguration def getCompressionExtension(context: TaskAttemptContext): String = { // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile if (FileOutputFormat.getCompressOutput(context)) { val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec]) ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension } else { "" } } }
Example 2
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 3
Source File: AvroDataToCatalyst.scala From spark-schema-registry with Apache License 2.0 | 6 votes |
package com.hortonworks.spark.registry.avro import java.io.ByteArrayInputStream import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey} import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer import org.apache.avro.Schema import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{BinaryType, DataType} import scala.collection.JavaConverters._ case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int], config: Map[String, Object]) extends UnaryExpression with ExpectsInputTypes { override def inputTypes = Seq(BinaryType) @transient private lazy val srDeser: AvroSnapshotDeserializer = { val obj = new AvroSnapshotDeserializer() obj.init(config.asJava) obj } @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version) @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText) override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType @transient private lazy val avroDeser= new AvroDeserializer(avroSchema, dataType) override def nullable: Boolean = true override def nullSafeEval(input: Any): Any = { val binary = input.asInstanceOf[Array[Byte]] val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion)) val result = row match { case r: InternalRow => r.copy() case _ => row } result } override def simpleString: String = { s"from_sr(${child.sql}, ${dataType.simpleString})" } override def sql: String = { s"from_sr(${child.sql}, ${dataType.catalogString})" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val expr = ctx.addReferenceObj("this", this) defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)") } private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = { val srClient = new SchemaRegistryClient(config.asJava) version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v))) .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName)) } }
Example 4
Source File: KustoCsvSerializationUtils.scala From azure-kusto-spark with Apache License 2.0 | 6 votes |
package com.microsoft.kusto.spark.datasink import java.util.TimeZone import com.microsoft.kusto.spark.utils.DataTypeMapping import org.apache.commons.lang3.time.FastDateFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types.DataTypes._ import org.apache.spark.sql.types.StructType private[kusto] class KustoCsvSerializationUtils (val schema: StructType, timeZone: String){ private[kusto] val dateFormat = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", TimeZone.getTimeZone(timeZone)) private[kusto] def convertRow(row: InternalRow) = { val values = new Array[String](row.numFields) for (i <- 0 until row.numFields if !row.isNullAt(i)) { val dataType = schema.fields(i).dataType values(i) = dataType match { case DateType => DateTimeUtils.toJavaDate(row.getInt(i)).toString case TimestampType => dateFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(i))) case _ => row.get(i, dataType).toString } } values } } private[kusto] object KustoCsvMapper { import org.apache.spark.sql.types.StructType import org.json def createCsvMapping(schema: StructType): String = { val csvMapping = new json.JSONArray() for (i <- 0 until schema.length) { val field = schema.apply(i) val dataType = field.dataType val mapping = new json.JSONObject() mapping.put("Name", field.name) mapping.put("Ordinal", i) mapping.put("DataType", DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(dataType, StringType)) csvMapping.put(mapping) } csvMapping.toString } }
Example 5
Source File: ArrowEvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types.StructType case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { private val batchSize = conf.arrowMaxRecordsPerBatch private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) protected override def evaluate( funcs: Seq[ChainedPythonFunctions], argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) val columnarBatchIter = new ArrowPythonRunner( funcs, PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pythonRunnerConf).compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty } override def hasNext: Boolean = currentIter.hasNext || { if (columnarBatchIter.hasNext) { currentIter = columnarBatchIter.next().rowIterator.asScala hasNext } else { false } } override def next(): InternalRow = currentIter.next() } } }
Example 6
Source File: BatchEvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StructField, StructType} case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { protected override def evaluate( funcs: Seq[ChainedPythonFunctions], argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { EvaluatePython.registerPicklers() // register pickler for Row val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) // enable memo iff we serialize the row with schema (schema and class should be memorized) val pickle = new Pickler(needConversion) // Input iterator to Python: input rows are grouped so we send them in batches to Python. // For each row, add it to the queue. val inputIterator = iter.map { row => if (needConversion) { EvaluatePython.toJava(row, schema) } else { // fast path for these types that does not need conversion in Python val fields = new Array[Any](row.numFields) var i = 0 while (i < row.numFields) { val dt = dataTypes(i) fields(i) = EvaluatePython.toJava(row.get(i, dt), dt) i += 1 } fields } }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. val outputIterator = new PythonUDFRunner(funcs, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) val unpickle = new Unpickler val mutableRow = new GenericInternalRow(1) val resultType = if (udfs.length == 1) { udfs.head.dataType } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } val fromJava = EvaluatePython.makeFromJava(resultType) outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF mutableRow(0) = fromJava(result) mutableRow } else { fromJava(result).asInstanceOf[InternalRow] } } } }
Example 7
Source File: subquery.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 8
Source File: BoundOrdering.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Projection private[window] final case class RangeBoundOrdering( ordering: Ordering[InternalRow], current: Projection, bound: Projection) extends BoundOrdering { override def compare( inputRow: InternalRow, inputIndex: Int, outputRow: InternalRow, outputIndex: Int): Int = ordering.compare(current(inputRow), bound(outputRow)) }
Example 9
Source File: NullableColumnAccessor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position() + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 10
Source File: NullableColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 11
Source File: CompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 12
Source File: CompressibleColumnAccessor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor} import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor { this: NativeColumnAccessor[T] => private var decoder: Decoder[T] = _ abstract override protected def initialize(): Unit = { super.initialize() decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType) } abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext override def extractSingle(row: InternalRow, ordinal: Int): Unit = { decoder.next(row, ordinal) } def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = decoder.decompress(columnVector, capacity) }
Example 13
Source File: GroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
Example 14
Source File: ExistingRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Encoder, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericInternalRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } case class RDDScanExec( output: Seq[Attribute], rdd: RDD[InternalRow], name: String, override val outputPartitioning: Partitioning = UnknownPartitioning(0), override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode { private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("") override val nodeName: String = s"Scan $name$rddName" override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } override def simpleString: String = { s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}" } }
Example 15
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
Example 16
Source File: ContinuousWriteRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.util.Utils class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactory[InternalRow]) extends RDD[Unit](prev) { override val partitioner = prev.partitioner override def getPartitions: Array[Partition] = prev.partitions override def compute(split: Partition, context: TaskContext): Iterator[Unit] = { val epochCoordinator = EpochCoordinatorRef.get( context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) EpochTracker.initializeCurrentEpoch( context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong) while (!context.isInterrupted() && !context.isCompleted()) { var dataWriter: DataWriter[InternalRow] = null // write the data and commit this writer. Utils.tryWithSafeFinallyAndFailureCallbacks(block = { try { val dataIterator = prev.compute(split, context) dataWriter = writeTask.createDataWriter( context.partitionId(), context.taskAttemptId(), EpochTracker.getCurrentEpoch.get) while (dataIterator.hasNext) { dataWriter.write(dataIterator.next()) } logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.") val msg = dataWriter.commit() epochCoordinator.send( CommitPartitionEpoch( context.partitionId(), EpochTracker.getCurrentEpoch.get, msg) ) logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.") EpochTracker.incrementCurrentEpoch() } catch { case _: InterruptedException => // Continuous shutdown always involves an interrupt. Just finish the task. } })(catchBlock = { // If there is an error, abort this writer. We enter this callback in the middle of // rethrowing an exception, so compute() will stop executing at this point. logError(s"Writer for partition ${context.partitionId()} is aborting.") if (dataWriter != null) dataWriter.abort() logError(s"Writer for partition ${context.partitionId()} aborted.") }) } Iterator() } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 17
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.UUID import org.apache.spark.{HashPartitioner, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { override def output: Seq[Attribute] = child.output override def children: Seq[SparkPlan] = child :: Nil override def outputPartitioning: Partitioning = SinglePartition override def doExecute(): RDD[InternalRow] = { assert(numPartitions == 1) new ContinuousCoalesceRDD( sparkContext, numPartitions, conf.continuousStreamingExecutorQueueSize, sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, child.execute()) } }
Example 18
Source File: ContinuousDataSourceRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousInputPartitionReader import org.apache.spark.util.NextIterator class ContinuousDataSourceRDDPartition( val index: Int, val inputPartition: InputPartition[InternalRow]) extends Partition with Serializable { // This is semantically a lazy val - it's initialized once the first time a call to // ContinuousDataSourceRDD.compute() needs to access it, so it can be shared across // all compute() calls for a partition. This ensures that one compute() picks up where the // previous one ended. // We don't make it actually a lazy val because it needs input which isn't available here. // This will only be initialized on the executors. private[continuous] var queueReader: ContinuousQueuedDataReader = _ } override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { // If attempt number isn't 0, this is a task retry, which we don't support. if (context.attemptNumber() != 0) { throw new ContinuousTaskRetryException() } val readerForPartition = { val partition = split.asInstanceOf[ContinuousDataSourceRDDPartition] if (partition.queueReader == null) { partition.queueReader = new ContinuousQueuedDataReader(partition, context, dataQueueSize, epochPollIntervalMs) } partition.queueReader } new NextIterator[InternalRow] { override def getNext(): InternalRow = { readerForPartition.next() match { case null => finished = true null case row => row } } override def close(): Unit = {} } } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[ContinuousDataSourceRDDPartition].inputPartition.preferredLocations() } } object ContinuousDataSourceRDD { private[continuous] def getContinuousReader( reader: InputPartitionReader[InternalRow]): ContinuousInputPartitionReader[_] = { reader match { case r: ContinuousInputPartitionReader[InternalRow] => r case _ => throw new IllegalStateException(s"Unknown continuous reader type ${reader.getClass}") } } }
Example 19
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 20
Source File: ConsoleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get def createWriterFactory(): DataWriterFactory[InternalRow] = PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 // behavior. printRows(messages, schema, s"Batch: $epochId") } def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println println("-------------------------------------------") println(printMessage) println("-------------------------------------------") // scalastyle:off println Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows)) .show(numRowsToShow, isTruncated) } override def toString(): String = { s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } }
Example 21
Source File: ContinuousRecordEndpoint.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.SparkEnv import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset case class ContinuousRecordPartitionOffset(partitionId: Int, offset: Int) extends PartitionOffset case class GetRecord(offset: ContinuousRecordPartitionOffset) override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetRecord(ContinuousRecordPartitionOffset(partitionId, offset)) => lock.synchronized { val bufOffset = offset - startOffsets(partitionId) val buf = buckets(partitionId) val record = if (buf.size <= bufOffset) None else Some(buf(bufOffset)) context.reply(record.map(InternalRow(_))) } } }
Example 22
Source File: StreamingRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source, session: SparkSession): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes)(session) } }
Example 23
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
Example 24
Source File: CoGroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 25
Source File: ReferenceSort.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 26
Source File: ColumnarTestUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 27
Source File: MemorySinkV2Suite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.scalatest.BeforeAndAfter import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.streaming.sources._ import org.apache.spark.sql.streaming.{OutputMode, StreamTest} import org.apache.spark.sql.types.StructType class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { test("data writer") { val partition = 1234 val writer = new MemoryDataWriter( partition, OutputMode.Append(), new StructType().add("i", "int")) writer.write(InternalRow(1)) writer.write(InternalRow(2)) writer.write(InternalRow(44)) val msg = writer.commit() assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44)) assert(msg.partition == partition) // Buffer should be cleared, so repeated commits should give empty. assert(writer.commit().data.isEmpty) } test("streaming writer") { val sink = new MemorySinkV2 val writeSupport = new MemoryStreamWriter( sink, OutputMode.Append(), new StructType().add("i", "int")) writeSupport.commit(0, Array( MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) )) assert(sink.latestBatchId.contains(0)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) writeSupport.commit(19, Array( MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), MemoryWriterCommitMessage(0, Seq(Row(33))) )) assert(sink.latestBatchId.contains(19)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) } }
Example 28
Source File: ExtraStrategiesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
Example 29
Source File: DataSourceTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String private[sql] abstract class DataSourceTest extends QueryTest { protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], enableRegex: Boolean = false) { test(sqlString) { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> enableRegex.toString) { checkAnswer(spark.sql(sqlString), expectedAnswer) } } } } class DDLScanSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { SimpleDDLScan( parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext.sparkSession) } } case class SimpleDDLScan( from: Int, to: Int, table: String)(@transient val sparkSession: SparkSession) extends BaseRelation with TableScan { override def sqlContext: SQLContext = sparkSession.sqlContext override def schema: StructType = StructType(Seq( StructField("intType", IntegerType, nullable = false).withComment(s"test comment $table"), StructField("stringType", StringType, nullable = false), StructField("dateType", DateType, nullable = false), StructField("timestampType", TimestampType, nullable = false), StructField("doubleType", DoubleType, nullable = false), StructField("bigintType", LongType, nullable = false), StructField("tinyintType", ByteType, nullable = false), StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false), StructField("fixedDecimalType", DecimalType(5, 1), nullable = false), StructField("binaryType", BinaryType, nullable = false), StructField("booleanType", BooleanType, nullable = false), StructField("smallIntType", ShortType, nullable = false), StructField("floatType", FloatType, nullable = false), StructField("mapType", MapType(StringType, StringType)), StructField("arrayType", ArrayType(StringType)), StructField("structType", StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil ) ) )) override def needConversion: Boolean = false override def buildScan(): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] sparkSession.sparkContext.parallelize(from to to).map { e => InternalRow(UTF8String.fromString(s"people$e"), e * 2) }.asInstanceOf[RDD[Row]] } }
Example 30
Source File: HierarchyRowFunctions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.types.Node private[hierarchy] case class HierarchyRowFunctions(inputTypes: Seq[DataType]) { private[hierarchy] def rowGet[K](i: Int): Row => K = (row: Row) => row.getAs[K](i) private[hierarchy] def rowInit[K](pk: Row => K, pathDataType: DataType): (Row, Option[Long]) => Row = { (row, myOrdKey) => myOrdKey match { case Some(x) => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType, ordPath = List(x))): _*) case None => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType)): _*) } } private[hierarchy] def rowModifyAndOrder[K](pk: Row => K, pathDataType: DataType): (Row, Row, Option[Long]) => Row = { (left, right, myord) => { val pathComponent: K = pk(right) // TODO(weidner): is myNode a ref/ptr or a copy of node?: val myNode: Node = left.getAs[Node](left.length - 1) val path: Seq[Any] = myNode.path ++ List(pathComponent) var node: Node = null // Node(path, ordPath = myOrdPath) myord match { case Some(ord) => val parentOrdPath = myNode.ordPath match { case x: Seq[Long] => x case _ => List() } node = Node(path, pathDataType, ordPath = parentOrdPath ++ List(ord)) case None => node = Node(path, pathDataType) } Row(right.toSeq :+ node: _*) } } private[hierarchy] def rowModify[K](pk: Row => K, pathDataType: DataType): (Row, Row) => Row = { (left, right) => val pathComponent: K = pk(right) val path: Seq[Any] = left.getAs[Node](left.length - 1).path ++ List(pathComponent) val node: Node = Node(path, pathDataType) Row(right.toSeq :+ node: _*) } private[hierarchy] def rowAppend[K](row: Row, node: Node): Row = { Row(row.toSeq :+ node: _*) } private[hierarchy] def rowStartWhere[K](exp: Expression): Row => Boolean = { row => val numColumns = inputTypes.length val converters = inputTypes.map(CatalystTypeConverters.createToCatalystConverter) val values = Stream.from(0).takeWhile(_ < numColumns).map({ i => converters(i)(row(i)) }) val newRow = InternalRow.fromSeq(values) exp.eval(newRow).asInstanceOf[Boolean] } private[hierarchy] def bindExpression(exp: Expression, attributes: Seq[Attribute]) : Expression = exp.transform { case a: AttributeReference => val index = attributes.indexWhere(_.name == a.name) BoundReference(index, a.dataType, a.nullable) } }
Example 31
Source File: ERPCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.erp import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.currency.CurrencyConversionException import org.apache.spark.sql.currency.erp.ERPConversionLoader.RConversionOptionsCurried import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.util.control.NonFatal case class ERPCurrencyConversionExpression( conversionFunction: RConversionOptionsCurried, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val CLIENT_INDEX = 0 protected val CONVERSION_TYPE_INDEX = 1 protected val AMOUNT_INDEX = 2 protected val FROM_INDEX = 3 protected val TO_INDEX = 4 protected val DATE_INDEX = 5 protected val NUM_ARGS = 6 protected val errorMessage = "Currency conversion library encountered an internal error" override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") // parse arguments val client = Option(inputArguments(CLIENT_INDEX).asInstanceOf[UTF8String]).map(_.toString) val conversionType = Option(inputArguments(CONVERSION_TYPE_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) // perform conversion val conversion = conversionFunction(client, conversionType, sourceCurrency, targetCurrency, date) val resultTry = conversion(amount) // If 'resultTry' holds a 'Failure', we have to propagate it because potential failure // handling already took place. We just wrap it in case it is a cryptic error. resultTry.recover { case NonFatal(err) => throw new CurrencyConversionException(errorMessage, err) }.get.map(Decimal.apply).orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, DecimalType, StringType, StringType, StringType) def inputNames: Seq[String] = Seq("client", "conversion_type", "amount", "source", "target", "date") def getChild(name: String): Option[Expression] = { inputNames.zip(children).find { case (n, _) => name == n }.map(_._2) } }
Example 32
Source File: NodeType.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import java.sql.Date import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.unsafe.types.UTF8String class NodeType extends UserDefinedType[Node] { override val sqlType = StructType(Seq( StructField("path", ArrayType(StringType, containsNull = false), nullable = false), StructField("dataType", StringType, nullable = false), StructField("preRank", IntegerType, nullable = true), StructField("postRank", IntegerType, nullable = true), StructField("isLeaf", BooleanType, nullable = true), StructField("ordPath", ArrayType(LongType, containsNull=false), nullable = true) )) override def serialize(obj: Any): Any = obj match { case node: Node => InternalRow(new GenericArrayData(node.path.map { case null => null case p => UTF8String.fromString(p.toString) }), UTF8String.fromString(node.pathDataTypeJson), node.preRank, node.postRank, node.isLeaf, if (node.ordPath == null){ node.ordPath } else { new GenericArrayData(node.ordPath) }) case _ => throw new UnsupportedOperationException(s"Cannot serialize ${obj.getClass}") } // scalastyle:off cyclomatic.complexity override def deserialize(datum: Any): Node = datum match { case row: InternalRow => { val stringArray = row.getArray(0).toArray[UTF8String](StringType).map { case null => null case somethingElse => somethingElse.toString } val readDataTypeString: String = row.getString(1) val readDataType: DataType = DataType.fromJson(readDataTypeString) val path: Seq[Any] = readDataType match { case StringType => stringArray case LongType => stringArray.map(v => if (v != null) v.toLong else null) case IntegerType => stringArray.map(v => if (v != null) v.toInt else null) case DoubleType => stringArray.map(v => if (v != null) v.toDouble else null) case FloatType => stringArray.map(v => if (v != null) v.toFloat else null) case ByteType => stringArray.map(v => if (v != null) v.toByte else null) case BooleanType => stringArray.map(v => if (v != null) v.toBoolean else null) case TimestampType => stringArray.map(v => if (v != null) v.toLong else null) case dt: DataType => sys.error(s"Type $dt not supported for hierarchy path") } val preRank: Integer = if (row.isNullAt(2)) null else row.getInt(2) val postRank: Integer = if (row.isNullAt(3)) null else row.getInt(3) // scalastyle:off magic.number val isLeaf: java.lang.Boolean = if (row.isNullAt(4)) null else row.getBoolean(4) val ordPath: Seq[Long] = if (row.isNullAt(5)) null else row.getArray(5).toLongArray() // scalastyle:on magic.number Node( path, readDataTypeString, preRank, postRank, isLeaf, ordPath ) } case node: Node => node case _ => throw new UnsupportedOperationException(s"Cannot deserialize ${datum.getClass}") } // scalastyle:on override def userClass: java.lang.Class[Node] = classOf[Node] } case object NodeType extends NodeType
Example 33
Source File: AnnotationFilter.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.{InternalRow, trees} import org.apache.spark.sql.types._ case class AnnotationFilter(child: Expression)( val filters: Set[String] = Set.empty, val exprId: ExprId = NamedExpression.newExprId) extends UnaryExpression with NamedExpression with CodegenFallback { override def name: String = child match { case e:NamedExpression => e.name case _ => throw new UnresolvedException(this, "name of AnnotationFilter with non-named child") } override lazy val resolved = childrenResolved override def toAttribute: Attribute = { if (resolved) { child.transform ({ case a:Alias => a.copy(a.child, a.name)(a.exprId, qualifiers = a.qualifiers, explicitMetadata = Some(MetadataAccessor.filterMetadata(a.metadata, filters))) case a:AttributeReference => a.copy(a.name, a.dataType, a.nullable, metadata = MetadataAccessor.filterMetadata(a.metadata, filters))(a.exprId, a.qualifiers) case p => p }) match { case e: NamedExpression => e.toAttribute case _ => throw new UnresolvedException(this, "toAttribute of AnnotationFilter with " + "no-named child") } } else { UnresolvedAttribute(name) } } override def equals(other: Any): Boolean = other match { case aa: AnnotationFilter => child == aa.child && filters == aa.filters && exprId == aa.exprId case _ => false } // scalastyle:off magic.number override def hashCode:Int = { List[Int](child.hashCode, filters.hashCode, exprId.hashCode) .foldLeft(17)((l, r) => 31 * l + r) } override def metadata: Metadata = { child match { case named: NamedExpression => MetadataAccessor.filterMetadata(named.metadata, filters) case _ => Metadata.empty } } override def qualifiers: Seq[String] = Nil override def eval(input: InternalRow): Any = child.eval(input) override def nullable: Boolean = child.nullable override def dataType: DataType = child.dataType override protected final def otherCopyArgs: Seq[AnyRef] = filters :: exprId :: Nil }
Example 34
Source File: stringExpressions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ case class Replace(se: Expression, fe: Expression, pe: Expression) extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback { override def inputTypes: Seq[AbstractDataType] = Seq.fill(3)(StringType) override def eval(input: InternalRow): Any = { val s = se.eval(input).asInstanceOf[UTF8String] val f = fe.eval(input).asInstanceOf[UTF8String] val p = pe.eval(input).asInstanceOf[UTF8String] (s, f, p) match { case (null, _, _) | (_, null, _) | (null, null, _) => null case (stre, strf, null) => UTF8String.fromString(stre.toString() .replaceAllLiterally(strf.toString(), "")) case (stre, strf, strp) => UTF8String.fromString(stre.toString() .replaceAllLiterally(strf.toString(), strp.toString())) case _ => sys.error(s"Unexpected input") } } override def nullable: Boolean = se.nullable override def dataType: DataType = StringType override def children: Seq[Expression] = se :: fe :: pe :: Nil }
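A minimal sketch of evaluating the Replace expression above directly, assuming Catalyst's Literal helper (string literals evaluate to UTF8String, so the casts inside eval line up):

import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

// replace every occurrence of "na" with "-"
val replaced = Replace(Literal("ananas"), Literal("na"), Literal("-"))
  .eval(null).asInstanceOf[UTF8String]
// replaced.toString == "a--s"

// a null replacement string strips the occurrences instead
val stripped = Replace(Literal("ananas"), Literal("na"), Literal(null, StringType))
  .eval(null).asInstanceOf[UTF8String]
// stripped.toString == "as"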
Example 35
Source File: GlobalSapSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import com.sap.spark.util.TestUtils import com.sap.spark.{GlobalSparkContext, WithSQLContext} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast} import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ import org.scalatest.Suite import scala.io.Source trait GlobalSapSQLContext extends GlobalSparkContext with WithSQLContext { self: Suite => override implicit def sqlContext: SQLContext = GlobalSapSQLContext._sqlc override protected def setUpSQLContext(): Unit = GlobalSapSQLContext.init(sc) override protected def tearDownSQLContext(): Unit = GlobalSapSQLContext.reset() def getDataFrameFromSourceFile(sparkSchema: StructType, path: File): DataFrame = { val conversions = sparkSchema.toSeq.zipWithIndex.map({ case (field, index) => Cast(BoundReference(index, StringType, nullable = true), field.dataType) }) val data = Source.fromFile(path) .getLines() .map({ line => val stringRow = InternalRow.fromSeq(line.split(",", -1).map(UTF8String.fromString)) Row.fromSeq(conversions.map({ c => c.eval(stringRow) })) }) val rdd = sc.parallelize(data.toSeq, numberOfSparkWorkers) sqlContext.createDataFrame(rdd, sparkSchema) } } object GlobalSapSQLContext { private var _sqlc: SQLContext = _ private def init(sc: SparkContext): Unit = if (_sqlc == null) { _sqlc = TestUtils.newSQLContext(sc) } private def reset(): Unit = { if (_sqlc != null) { _sqlc.catalog.unregisterAllTables() } } }
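getDataFrameFromSourceFile relies on a small Catalyst trick: every CSV field is read as a UTF8String, wrapped in an InternalRow, and then Cast(BoundReference(...), targetType) is evaluated per column. A minimal sketch of that per-column step (the two-argument Cast shown here is the classic form; newer Spark versions add an optional time-zone parameter):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val stringRow = InternalRow.fromSeq(Seq("42", "3.14").map(UTF8String.fromString))

val toInt    = Cast(BoundReference(0, StringType, nullable = true), IntegerType)
val toDouble = Cast(BoundReference(1, StringType, nullable = true), DoubleType)

// toInt.eval(stringRow) == 42; toDouble.eval(stringRow) == 3.14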
Example 36
Source File: ExpressionEvalHelper.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions // // Partially backported from Spark 1.5.2. // import org.apache.spark.sql.extension.OptimizerFactoryForTests import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.FunSuite import org.scalatest.prop.GeneratorDrivenPropertyChecks // scalastyle:off case _ => } expression.eval(inputRow) } protected def generateProject( generator: => Projection, expression: Expression): Projection = { try { generator } catch { case e: Throwable => fail( s""" |Code generation of $expression failed: |$e |${e.getStackTraceString} """.stripMargin) } } protected def checkEvaluationWithoutCodegen( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val actual = try evaluate(expression, inputRow) catch { case e: Exception => fail(s"Exception evaluating $expression", e) } if (!checkResult(actual, expected)) { val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect evaluation (codegen off): $expression, " + s"actual: $actual, " + s"expected: $expected$input") } } protected def checkEvaluationWithOptimization( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation) val optimizedPlan = OptimizerFactoryForTests.default().execute(plan) checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow) } }
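The listing above is abridged (the evaluate and checkResult helpers of the original file are not shown), but the entry points it does show are typically driven from a test suite. A hypothetical sketch, assuming the full helper trait is mixed into a FunSuite:

import org.apache.spark.sql.catalyst.expressions.{Add, Literal}

// evaluate the expression interpreted, then again after the optimizer has run
checkEvaluationWithoutCodegen(Add(Literal(1), Literal(2)), expected = 3)
checkEvaluationWithOptimization(Add(Literal(1), Literal(2)), expected = 3)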
Example 37
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.math.BigDecimal import java.sql.Date import java.sql.Timestamp import java.text.DateFormat import java.text.SimpleDateFormat import java.util.Calendar import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.NullWritable import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.mapreduce.RecordWriter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow } import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil import org.zuinnote.hadoop.office.format.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import java.util.Locale import java.text.DecimalFormat import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO import java.text.NumberFormat // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[excel] class ExcelOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext, options: Map[String, String]) extends OutputWriter { def write(row: Row): Unit = { // check useHeader if (useHeader) { val headers = row.schema.fieldNames var i = 0 for (x <- headers) { val headerColumnSCD = new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName) recordWriter.write(NullWritable.get(), headerColumnSCD) i += 1 } currentRowNum += 1 useHeader = false } // for each value in the row if (row.size>0) { var currentColumnNum = 0; val simpleObject = new Array[AnyRef](row.size) for (i <- 0 to row.size - 1) { // for each element of the row val obj = row.get(i) if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length==5)) { val formattedValue = obj.asInstanceOf[Seq[String]](0) val comment = obj.asInstanceOf[Seq[String]](1) val formula = obj.asInstanceOf[Seq[String]](2) val address = obj.asInstanceOf[Seq[String]](3) val sheetName = obj.asInstanceOf[Seq[String]](4) simpleObject(i) = new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName) } else { simpleObject(i)=obj.asInstanceOf[AnyRef] } } // convert row to spreadsheetcellDAO val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum) // write it for (x<- spreadSheetCellDAORow) { recordWriter.write(NullWritable.get(), x) } } currentRowNum += 1 } override def close(): Unit = { recordWriter.close(context) currentRowNum = 0; } }
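The write method above accepts two kinds of cells: a plain value handed to the simple converter, or a Seq of exactly five strings that is turned into a SpreadSheetCellDAO in the order formattedValue, comment, formula, address, sheetName. A hypothetical row mixing both forms (writer stands in for an ExcelOutputWriter instance and is an assumption here):

import org.apache.spark.sql.Row

val row = Row(
  42,                                              // plain cell, handled by the simple converter
  Seq("1000.5", "a comment", "", "B1", "Sheet1"))  // fully specified spreadsheet cell

// writer.write(row)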
Example 38
Source File: ZOrderCurveUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan.BoundingBox import magellan.index.ZOrderCurve import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class ZOrderCurveUDT extends UserDefinedType[ZOrderCurve] { override def sqlType: DataType = StructType( Seq( StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("precision", IntegerType, nullable = false), StructField("bits", LongType, nullable = false) )) override def serialize(obj: ZOrderCurve): Any = { val row = new GenericInternalRow(6) val BoundingBox(xmin, ymin, xmax, ymax) = obj.boundingBox row.setDouble(0, xmin) row.setDouble(1, ymin) row.setDouble(2, xmax) row.setDouble(3, ymax) row.setInt(4, obj.precision) row.setLong(5, obj.bits) row } override def deserialize(datum: Any): ZOrderCurve = { val row = datum.asInstanceOf[InternalRow] val boundingBox = BoundingBox(row.getDouble(0), row.getDouble(1), row.getDouble(2), row.getDouble(3)) new ZOrderCurve(boundingBox, row.getInt(4), row.getLong(5)) } override def userClass: Class[ZOrderCurve] = classOf[ZOrderCurve] }
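A minimal round-trip sketch for the UDT above, assuming ZOrderCurve and BoundingBox can be constructed exactly the way deserialize builds them:

import magellan.BoundingBox
import magellan.index.ZOrderCurve
import org.apache.spark.sql.types.ZOrderCurveUDT

val udt = new ZOrderCurveUDT
val curve = new ZOrderCurve(BoundingBox(-180.0, -90.0, 180.0, 90.0), 5, 21L)

// serialize packs the bounding box, precision and bits into a 6-field row
val restored = udt.deserialize(udt.serialize(curve))
// restored.precision == 5 && restored.bits == 21L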
Example 39
Source File: PointUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PointUDT extends UserDefinedType[Point] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("x", DoubleType, nullable = false), StructField("y", DoubleType, nullable = false) )) override def serialize(point: Point): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, point.getType()) row.setDouble(1, point.getX()) row.setDouble(2, point.getY()) row.setDouble(3, point.getX()) row.setDouble(4, point.getY()) row.setDouble(5, point.getX()) row.setDouble(6, point.getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Point]) override def userClass: Class[Point] = classOf[Point] override def deserialize(datum: Any): Point = { val row = datum.asInstanceOf[InternalRow] require(row.numFields == 7) Point(row.getDouble(5), row.getDouble(6)) } override def pyUDT: String = "magellan.types.PointUDT" def serialize(x: Double, y: Double): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, 1) row.setDouble(1, x) row.setDouble(2, y) row.setDouble(3, x) row.setDouble(4, y) row.setDouble(5, x) row.setDouble(6, y) row } override val geometryType = new Point().getType() }
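The extra serialize(x, y) overload above builds the 7-field row without allocating a Point first, while deserialize only reads fields 5 and 6 back. A minimal round-trip sketch:

import org.apache.spark.sql.types.PointUDT

val udt = new PointUDT
val row = udt.serialize(-122.3, 47.6)   // x, y
val p = udt.deserialize(row)
// p.getX() == -122.3 && p.getY() == 47.6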
Example 40
Source File: LineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class LineUDT extends UserDefinedType[Line] with GeometricUDT { override def sqlType: DataType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("startX", DoubleType, nullable = false), StructField("startY", DoubleType, nullable = false), StructField("endX", DoubleType, nullable = false), StructField("endY", DoubleType, nullable = false) )) override def serialize(line: Line): InternalRow = { val row = new GenericInternalRow(9) row.setInt(0, 2) val BoundingBox(xmin, ymin, xmax, ymax) = line.boundingBox row.setDouble(1, xmin) row.setDouble(2, ymin) row.setDouble(3, xmax) row.setDouble(4, ymax) row.setDouble(5, line.getStart().getX()) row.setDouble(6, line.getStart().getY()) row.setDouble(7, line.getEnd().getX()) row.setDouble(8, line.getEnd().getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Line]) override def userClass: Class[Line] = classOf[Line] override def deserialize(datum: Any): Line = { val row = datum.asInstanceOf[InternalRow] val startX = row.getDouble(5) val startY = row.getDouble(6) val endX = row.getDouble(7) val endY = row.getDouble(8) val line = new Line() val start = Point(startX, startY) val end = Point(endX, endY) line.setStart(start) line.setEnd(end) line } override def pyUDT: String = "magellan.types.LineUDT" override val geometryType = new Line().getType() }
Example 41
Source File: PolygonUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolygonUDT extends UserDefinedType[Polygon] with GeometricUDT { override val sqlType: StructType = StructType(Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polygon: Polygon): InternalRow = { polygon.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Polygon]) override def userClass: Class[Polygon] = classOf[Polygon] override def deserialize(datum: Any): Polygon = { val row = datum.asInstanceOf[InternalRow] val polygon = new Polygon() polygon.init(row) polygon } override val geometryType = new Polygon().getType() }
Example 42
Source File: PolyLineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolyLineUDT extends UserDefinedType[PolyLine] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polyLine: PolyLine): InternalRow = { polyLine.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[PolyLine]) override def userClass: Class[PolyLine] = classOf[PolyLine] override def deserialize(datum: Any): PolyLine = { val row = datum.asInstanceOf[InternalRow] val polyline = new PolyLine() polyline.init(row) polyline } override def pyUDT: String = "magellan.types.PolyLineUDT" override val geometryType = new PolyLine().getType() }
Example 43
Source File: serdes.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import magellan._ import magellan.catalyst.MagellanExpression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.types._ case class MagellanSerializer( override val child: Expression, _dataType: DataType) extends UnaryExpression with MagellanExpression with CodegenFallback with NonSQLExpression { override def nullable: Boolean = false override protected def nullSafeEval(input: Any): Any = { val shape = input.asInstanceOf[Shape] serialize(shape) } override def dataType: DataType = _dataType } case class MagellanDeserializer( override val child: Expression, klass: Class[_ <: Shape]) extends UnaryExpression with MagellanExpression with CodegenFallback with NonSQLExpression { override def nullable: Boolean = false override protected def nullSafeEval(input: Any): Any = { newInstance(input.asInstanceOf[InternalRow]) } override def dataType: DataType = ObjectType(klass) }
Example 44
Source File: MagellanExpression.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ trait MagellanExpression { private val SERIALIZERS = Map( new Point().getType() -> new PointUDT, new Line().getType() -> new LineUDT, new PolyLine().getType() -> new PolyLineUDT, new Polygon().getType() -> new PolygonUDT) def newInstance(row: InternalRow): Shape = { SERIALIZERS.get(row.getInt(0)).fold(NullShape.asInstanceOf[Shape])(_.deserialize(row)) } def serialize(shape: Shape): Any = { SERIALIZERS.get(shape.getType()).get.serialize(shape) } def sqlType(klass: Class[_ <: Shape]): DataType = { SERIALIZERS.get(klass.newInstance().getType()).get } }
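Because the SERIALIZERS map is keyed by the shape's type code (field 0 of the serialized row), newInstance can dispatch to the right UDT without knowing the concrete class up front. A minimal sketch, using a hypothetical object only to get access to the trait's methods:

import magellan.Point
import org.apache.spark.sql.catalyst.InternalRow

object ShapeCodec extends MagellanExpression   // hypothetical name

val serialized = ShapeCodec.serialize(Point(1.0, 2.0)).asInstanceOf[InternalRow]
val restored = ShapeCodec.newInstance(serialized)   // dispatches on serialized.getInt(0)
// restored.isInstanceOf[Point]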
Example 45
Source File: TransformerSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan.TestingUtils._ import magellan.{MockPointExpr, Point, TestSparkContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Transformer} import org.apache.spark.sql.magellan.dsl.expressions._ import org.scalatest.FunSuite class TransformerSuite extends FunSuite with TestSparkContext { test("transform") { val sqlCtx = this.sqlContext val path = this.getClass.getClassLoader.getResource("testpoint/").getPath val df = sqlCtx.read.format("magellan").load(path) import sqlCtx.implicits._ val dbl = (x: Point) => Point(2 * x.getX(), 2 * x.getY()) val point = df.withColumn("transformed", $"point".transform(dbl)) .select($"transformed") .first()(0).asInstanceOf[Point] assert(point.getX() ~== -199.0 absTol 1.0) } test("eval: transform") { val fn = (p: Point) => Point(2 * p.getX(), 2 * p.getY()) val expr = Transformer(MockPointExpr(Point(1.0, 2.0)), fn) val result = expr.eval(null).asInstanceOf[InternalRow] // skip the type assert(result.getDouble(1) === 2.0) assert(result.getDouble(2) === 4.0) } }
Example 46
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import java.util.concurrent.TimeUnit._ import com.intel.sparkColumnarPlugin.vectorized._ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import scala.collection.mutable.ListBuffer import org.apache.arrow.vector.ipc.message.ArrowFieldNode import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.arrow.vector.types.pojo.Schema import org.apache.arrow.gandiva.expression._ import org.apache.arrow.gandiva.evaluator._ import io.netty.buffer.ArrowBuf import com.google.common.collect.Lists; import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} class ColumnarShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends ShuffledHashJoinExec( leftKeys, rightKeys, joinType, buildSide, condition, left, right) { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def supportsColumnar = true //TODO() Disable code generation //override def supportCodegen: Boolean = false override def doExecuteColumnar(): RDD[ColumnarBatch] = { val numOutputRows = longMetric("numOutputRows") val joinTime = longMetric("joinTime") val buildTime = longMetric("buildTime") val resultSchema = this.schema streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => //val hashed = buildHashedRelation(buildIter) //join(streamIter, hashed, numOutputRows) val vjoin = ColumnarShuffledHashJoin.create(leftKeys, rightKeys, resultSchema, joinType, buildSide, condition, left, right, buildTime, joinTime, numOutputRows) val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter) TaskContext.get().addTaskCompletionListener[Unit](_ => { vjoin.close() }) new CloseableColumnBatchIterator(vjoinResult) } } }
Example 47
Source File: ColumnarSortExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized._ import java.util.concurrent.TimeUnit._ import org.apache.spark.{SparkEnv, TaskContext, SparkContext} import org.apache.spark.executor.TaskMetrics import org.apache.spark.sql.execution._ import org.apache.spark.sql.catalyst.expressions.SortOrder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.rdd.RDD import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} class ColumnarSortExec( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends SortExec(sortOrder, global, child, testSpillFrequency) { override def supportsColumnar = true // Disable code generation override def supportCodegen: Boolean = false override lazy val metrics = Map( "totalSortTime" -> SQLMetrics .createTimingMetric(sparkContext, "time in sort + shuffle process"), "sortTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in sort process"), "shuffleTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in shuffle process"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches")) override def doExecuteColumnar(): RDD[ColumnarBatch] = { val elapse = longMetric("totalSortTime") val sortTime = longMetric("sortTime") val shuffleTime = longMetric("shuffleTime") val numOutputRows = longMetric("numOutputRows") val numOutputBatches = longMetric("numOutputBatches") child.executeColumnar().mapPartitions { iter => val hasInput = iter.hasNext val res = if (!hasInput) { Iterator.empty } else { val sorter = ColumnarSorter.create( sortOrder, true, child.output, sortTime, numOutputBatches, numOutputRows, shuffleTime, elapse) TaskContext .get() .addTaskCompletionListener[Unit](_ => { sorter.close() }) new CloseableColumnBatchIterator(sorter.createColumnarIterator(iter)) } res } } }
Example 48
Source File: ColumnarSubquery.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.expression import org.apache.arrow.gandiva.evaluator._ import org.apache.arrow.gandiva.exceptions.GandivaException import org.apache.arrow.gandiva.expression._ import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.BaseSubqueryExec import org.apache.spark.sql.execution.ExecSubqueryExpression import org.apache.spark.sql.execution.ScalarSubquery import org.apache.spark.sql.types._ import scala.collection.mutable.ListBuffer class ColumnarScalarSubquery( query: ScalarSubquery) extends Expression with ColumnarExpression { override def dataType: DataType = query.dataType override def children: Seq[Expression] = Nil override def nullable: Boolean = true override def toString: String = query.toString override def eval(input: InternalRow): Any = query.eval(input) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = query.doGenCode(ctx, ev) override def canEqual(that: Any): Boolean = query.canEqual(that) override def productArity: Int = query.productArity override def productElement(n: Int): Any = query.productElement(n) override def doColumnarCodeGen(args: java.lang.Object): (TreeNode, ArrowType) = { val value = query.eval(null) val resultType = CodeGeneration.getResultType(query.dataType) query.dataType match { case t: StringType => (TreeBuilder.makeStringLiteral(value.toString().asInstanceOf[String]), resultType) case t: IntegerType => (TreeBuilder.makeLiteral(value.asInstanceOf[Integer]), resultType) case t: LongType => (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Long]), resultType) case t: DoubleType => (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Double]), resultType) case d: DecimalType => val v = value.asInstanceOf[Decimal] (TreeBuilder.makeDecimalLiteral(v.toString, v.precision, v.scale), resultType) case d: DateType => throw new UnsupportedOperationException(s"DateType is not supported yet.") } } }
Example 49
Source File: OapOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OutputWriter, WriteResult} import org.apache.spark.sql.execution.datasources.oap.io.OapDataWriter import org.apache.spark.sql.types.StructType private[oap] class OapOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { private var rowCount = 0 private var partitionString: String = "" override def setPartitionString(ps: String): Unit = { partitionString = ps } private val writer: OapDataWriter = { val isCompressed = FileOutputFormat.getCompressOutput(context) val conf = context.getConfiguration val file: Path = new Path(path) val fs = file.getFileSystem(conf) val fileOut = fs.create(file, false) new OapDataWriter(isCompressed, fileOut, dataSchema, conf) } override def write(row: InternalRow): Unit = { rowCount += 1 writer.write(row) } override def close(): Unit = { writer.close() } override def writeStatus(): WriteResult = { OapWriteResult(dataFileName, rowCount, partitionString) } def dataFileName: String = new Path(path).getName }
Example 50
Source File: OapIndexOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter // TODO: parameter name "path" is ambiguous private[index] class OapIndexOutputWriter( path: String, context: TaskAttemptContext ) extends OutputWriter { private val outputFormat = new OapIndexOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val outputPath = FileOutputFormat.getOutputPath(context) val configuration = ContextUtil.getConfiguration(context) IndexUtils.generateTempIndexFilePath( configuration, inputFileName, outputPath, path, extension) } } private var recordWriter: RecordWriter[Void, InternalRow] = _ private var inputFileName: String = _ private var rowCount: Long = 0 override def write(row: InternalRow): Unit = { checkStartOfNewFile() recordWriter.write(null, row) rowCount += 1 } override def close(): Unit = { closeWriter() } private def initWriter(): Unit = { inputFileName = InputFileNameHolderAdapter.getInputFileName().toString recordWriter = outputFormat.getRecordWriter(context) rowCount = 0 } private def closeWriter(): Unit = { if (recordWriter != null) { recordWriter.close(context) recordWriter = null } } private def checkStartOfNewFile(): Unit = { if (inputFileName != InputFileNameHolderAdapter.getInputFileName().toString) { closeWriter() initWriter() } } }
Example 51
Source File: OapIndexOutputFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.format.CompressionCodec import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.types.StructType private[index] class OapIndexOutputFormat extends FileOutputFormat[Void, InternalRow] { private val BTREE_WRITER_VERSION = OapConf.OAP_INDEX_BTREE_WRITER_VERSION.key private def getCodec(taskAttemptContext: TaskAttemptContext): CompressionCodec = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) CompressionCodec.valueOf( configuration.get( OapConf.OAP_INDEX_BTREE_COMPRESSION.key, OapConf.OAP_INDEX_BTREE_COMPRESSION.defaultValueString).toUpperCase) } private def getWriterVersion(taskAttemptContext: TaskAttemptContext) = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) val indexVersion = configuration.get(BTREE_WRITER_VERSION, OapIndexProperties.DEFAULT_WRITER_VERSION.toString) IndexVersion.fromString(indexVersion) } override def getRecordWriter( taskAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) def canBeSkipped(file: Path): Boolean = { val isAppend = configuration.get(OapIndexFileFormat.IS_APPEND).toBoolean if (isAppend) { val target = new Path(FileOutputFormat.getOutputPath(taskAttemptContext), file.getName) target.getFileSystem(configuration).exists(target) } else { false } } val codec = getCodec(taskAttemptContext) val writerVersion = getWriterVersion(taskAttemptContext) val extension = "." + configuration.get(OapIndexFileFormat.INDEX_TIME) + "." + configuration.get(OapIndexFileFormat.INDEX_NAME) + ".index" val file = getDefaultWorkFile(taskAttemptContext, extension) val schema = StructType.fromString(configuration.get(OapIndexFileFormat.ROW_SCHEMA)) val indexType = configuration.get(OapIndexFileFormat.INDEX_TYPE, "") if (canBeSkipped(file)) { new DummyIndexRecordWriter() } else if (indexType == "BTREE") { BTreeIndexRecordWriter(configuration, file, schema, codec, writerVersion) } else if (indexType == "BITMAP") { val writer = file.getFileSystem(configuration).create(file, true) new BitmapIndexRecordWriter(configuration, writer, schema) } else { throw new OapException("Unknown Index Type: " + indexType) } } }
Example 52
Source File: package.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering import org.apache.spark.sql.execution.datasources.oap.io.ColumnStatistics import org.apache.spark.sql.execution.datasources.oap.utils.OapUtils import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StructField, StructType} package object oap { type Key = InternalRow def order(sf: StructField): Ordering[Key] = GenerateOrdering.create(StructType(Array(sf))) // Return if the rowGroup or file can be skipped by min max statistics def isSkippedByStatistics( columnStats: Array[ColumnStatistics], filter: Filter, schema: StructType): Boolean = filter match { case Or(left, right) => isSkippedByStatistics(columnStats, left, schema) && isSkippedByStatistics(columnStats, right, schema) case And(left, right) => isSkippedByStatistics(columnStats, left, schema) || isSkippedByStatistics(columnStats, right, schema) case IsNotNull(attribute) => val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) !stat.hasNonNullValue case EqualTo(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) (OapUtils.keyFromBytes(stat.min, schema(idx).dataType), OapUtils.keyFromBytes( stat.max, schema(idx).dataType)) match { case (Some(v1), Some(v2)) => comp.gt(v1, key) || comp.lt(v2, key) case _ => false } case LessThan(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match { case Some(v) => comp.gteq(v, key) case None => false } case LessThanOrEqual(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match { case Some(v) => comp.gt(v, key) case None => false } case GreaterThan(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match { case Some(v) => comp.lteq(v, key) case None => false } case GreaterThanOrEqual(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match { case Some(v) => comp.lt(v, key) case None => false } case _ => false } } class OapException(message: String, cause: Throwable) extends Exception(message, cause) { def this(message: String) = this(message, null) }
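The pruning logic above compares a filter key against per-column min/max statistics using a generated ordering over a single-field schema. A minimal sketch of the EqualTo branch, assuming integer statistics:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val comp = GenerateOrdering.create(StructType(Array(StructField("a", IntegerType))))

val min = InternalRow(10)
val max = InternalRow(90)
val key = InternalRow(5)

// EqualTo can skip the group when the key falls outside [min, max]
val skipped = comp.gt(min, key) || comp.lt(max, key)   // true here, since 5 < 10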
Example 53
Source File: OapDataReader.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.fs.FSDataInputStream import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OapException, PartitionedFile} import org.apache.spark.sql.execution.datasources.oap.INDEX_STAT._ import org.apache.spark.sql.execution.datasources.oap.OapFileFormat import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion.DataFileVersion import org.apache.spark.unsafe.types.UTF8String abstract class OapDataReader { def read(file: PartitionedFile): Iterator[InternalRow] // The two following fields have to be defined by certain versions of OapDataReader for use in // [[OapMetricsManager]] def rowsReadByIndex: Option[Long] def indexStat: INDEX_STAT } object OapDataReader extends Logging { def readVersion(is: FSDataInputStream, fileLen: Long): DataFileVersion = { val MAGIC_VERSION_LENGTH = 4 val metaEnd = fileLen - 4 // seek to the position of data file meta length is.seek(metaEnd) val metaLength = is.readInt() // read all bytes of data file meta val magicBuffer = new Array[Byte](MAGIC_VERSION_LENGTH) is.readFully(metaEnd - metaLength, magicBuffer) val magic = UTF8String.fromBytes(magicBuffer).toString magic match { case m if ! m.contains("OAP") => throw new OapException("Not a valid Oap Data File") case m if m == "OAP1" => DataFileVersion.OAP_DATAFILE_V1 case _ => throw new OapException("Not a supported Oap Data File version") } } def getDataFileClassFor(dataReaderClassFromDataSourceMeta: String, reader: OapDataReader): String = { dataReaderClassFromDataSourceMeta match { case c if c == OapFileFormat.PARQUET_DATA_FILE_CLASSNAME => c case c if c == OapFileFormat.ORC_DATA_FILE_CLASSNAME => c case c if c == OapFileFormat.OAP_DATA_FILE_CLASSNAME => reader match { case r: OapDataReaderV1 => OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME case _ => throw new OapException(s"Undefined connection for $reader") } case _ => throw new OapException( s"Undefined data reader class name $dataReaderClassFromDataSourceMeta") } } }
Example 54
Source File: OapIndexWriteTaskStatsTracker.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.util.concurrent.atomic.LongAdder import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.index.IndexBuildResult import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter case class IndexWriteTaskStats(writeStatus: Seq[IndexBuildResult]) extends WriteTaskStats class OapIndexWriteTaskStatsTracker extends WriteTaskStatsTracker with Logging { private[this] var curInputFileName: String = _ private[this] var statusMap: Map[String, LongAdder] = Map.empty[String, LongAdder] override def newPartition(partitionValues: InternalRow): Unit = { // currently unhandled } override def newBucket(bucketId: Int): Unit = { // currently unhandled } override def newFile(filePath: String): Unit = { // currently unhandled } override def newRow(row: InternalRow): Unit = { val inputFileName = InputFileNameHolderAdapter.getInputFileName().toString if (curInputFileName != inputFileName) { curInputFileName = inputFileName statusMap = statusMap + (inputFileName -> new LongAdder) } statusMap(curInputFileName).increment() } override def getFinalStats(): WriteTaskStats = { val results = statusMap.map { case (filePath, rowCount) => val path = new Path(filePath) IndexBuildResult(path.getName, rowCount.longValue(), "", path.getParent.toString) } IndexWriteTaskStats(results.toSeq) } } class OapIndexWriteJobStatsTracker extends WriteJobStatsTracker with Logging { private[this] var indexBuildResultSeq: Seq[IndexBuildResult] = _ override def newTaskInstance(): WriteTaskStatsTracker = new OapIndexWriteTaskStatsTracker override def processStats(stats: Seq[WriteTaskStats]): Unit = { indexBuildResultSeq = stats.flatMap(_.asInstanceOf[IndexWriteTaskStats].writeStatus) } def indexBuildResults: Seq[IndexBuildResult] = indexBuildResultSeq }
Example 55
Source File: ParquetReadSupportWrapper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[InternalRow] = { readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext) } } object ParquetReadSupportWrapper { // Proxy ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA value. val SPARK_ROW_REQUESTED_SCHEMA: String = ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA }
Example 56
Source File: NonNullKeySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.ByteBufferOutputStream class NonNullKeySuite extends SparkFunSuite with Logging { private lazy val random = new Random(0) private lazy val values = { val booleans: Seq[Boolean] = Seq(true, false) val bytes: Seq[Byte] = Seq(Byte.MinValue, 0, 10, 30, Byte.MaxValue) val shorts: Seq[Short] = Seq(Short.MinValue, -100, 0, 10, 200, Short.MaxValue) val ints: Seq[Int] = Seq(Int.MinValue, -100, 0, 100, 12346, Int.MaxValue) val longs: Seq[Long] = Seq(Long.MinValue, -10000, 0, 20, Long.MaxValue) val floats: Seq[Float] = Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue) val doubles: Seq[Double] = Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue) val strings: Seq[UTF8String] = Seq("", "test", "b plus tree", "BTreeRecordReaderWriter").map(UTF8String.fromString) val binaries: Seq[Array[Byte]] = (0 until 20 by 5).map{ size => val buf = new Array[Byte](size) random.nextBytes(buf) buf } val values = booleans ++ bytes ++ shorts ++ ints ++ longs ++ floats ++ doubles ++ strings ++ binaries ++ Nil random.shuffle(values) } private def toSparkDataType(any: Any): DataType = { any match { case _: Boolean => BooleanType case _: Short => ShortType case _: Byte => ByteType case _: Int => IntegerType case _: Long => LongType case _: Float => FloatType case _: Double => DoubleType case _: UTF8String => StringType case _: Array[Byte] => BinaryType } } test("Read/Write Based On Schema") { values.grouped(10).foreach { valueSeq => val schema = StructType(valueSeq.zipWithIndex.map { case (v, i) => StructField(s"col$i", toSparkDataType(v)) }) val nnkw = new NonNullKeyWriter(schema) val nnkr = new NonNullKeyReader(schema) val row = InternalRow.fromSeq(valueSeq) val buf = new ByteBufferOutputStream() nnkw.writeKey(buf, row) val answerRow = nnkr.readKey(FiberCache(buf.toByteArray), 0)._1 assert(row.equals(answerRow)) } } }
Example 57
Source File: StatisticsTest.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.statistics import java.io.ByteArrayOutputStream import scala.collection.mutable.ArrayBuffer import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.BaseOrdering import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.execution.datasources.oap.index.RangeInterval import org.apache.spark.sql.execution.datasources.oap.utils.{NonNullKeyReader, NonNullKeyWriter} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.unsafe.types.UTF8String abstract class StatisticsTest extends SparkFunSuite with BeforeAndAfterEach { protected def rowGen(i: Int): InternalRow = InternalRow(i, UTF8String.fromString(s"test#$i")) protected lazy val schema: StructType = StructType(StructField("a", IntegerType) :: StructField("b", StringType) :: Nil) @transient protected lazy val nnkw: NonNullKeyWriter = new NonNullKeyWriter(schema) @transient protected lazy val nnkr: NonNullKeyReader = new NonNullKeyReader(schema) @transient protected lazy val ordering: BaseOrdering = GenerateOrdering.create(schema) @transient protected lazy val partialOrdering: BaseOrdering = GenerateOrdering.create(StructType(schema.dropRight(1))) protected var out: ByteArrayOutputStream = _ protected var intervalArray: ArrayBuffer[RangeInterval] = new ArrayBuffer[RangeInterval]() override def beforeEach(): Unit = { out = new ByteArrayOutputStream(8000) } override def afterEach(): Unit = { out.close() intervalArray.clear() } protected def generateInterval( start: InternalRow, end: InternalRow, startInclude: Boolean, endInclude: Boolean): Unit = { intervalArray.clear() intervalArray.append(new RangeInterval(start, end, startInclude, endInclude)) } protected def checkInternalRow(row1: InternalRow, row2: InternalRow): Unit = { val res = row1 == row2 // it works.. assert(res, s"row1: $row1 does not match $row2") } protected def wrapToFiberCache(out: ByteArrayOutputStream): FiberCache = { val bytes = out.toByteArray FiberCache(bytes) } }
Example 58
Source File: DeltaByteArrayEncoderSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAll import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String class DeltaByteArrayEncoderCheck extends Properties("DeltaByteArrayEncoder") { private val rowCountInEachGroup = Gen.choose(1, 1024) private val rowCountInLastGroup = Gen.choose(1, 1024) private val groupCount = Gen.choose(1, 100) property("Encoding/Decoding String Type") = forAll { (values: Array[String]) => forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) { (rowCount, lastCount, groupCount) => if (values.nonEmpty) { // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding' // Normally, the test case should be: // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class) // Validate if 'values' and 'decoded values' are identical. // But ColumnValues only support read value form DataFile. So, we have to use another way // to validate. val referenceFiberBuilder = StringFiberBuilder(rowCount, 0) val fiberBuilder = DeltaByteArrayFiberBuilder(rowCount, 0, StringType) val fiberParser = DeltaByteArrayDataFiberParser( new OapDataFileMetaV1(rowCountInEachGroup = rowCount), StringType) !(0 until groupCount).exists { group => // If lastCount > rowCount, assume lastCount = rowCount val count = if (group < groupCount - 1) { rowCount } else if (lastCount > rowCount) { rowCount } else { lastCount } (0 until count).foreach { row => fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length)))) referenceFiberBuilder .append(InternalRow(UTF8String.fromString(values(row % values.length)))) } val bytes = fiberBuilder.build().fiberData val parsedBytes = fiberParser.parse(bytes, count) val referenceBytes = referenceFiberBuilder.build().fiberData referenceFiberBuilder.clear() fiberBuilder.clear() assert(parsedBytes.length == referenceBytes.length) parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2) } } else true } } } class DeltaByteArrayEncoderSuite extends SparkFunSuite with Checkers { test("Check Encoding/Decoding") { check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck())) } }
Example 59
Source File: DictionaryBasedEncoderSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.parquet.bytes.BytesInput import org.apache.parquet.column.page.DictionaryPage import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAll import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String class DictionaryBasedEncoderCheck extends Properties("DictionaryBasedEncoder") { private val rowCountInEachGroup = Gen.choose(1, 1024) private val rowCountInLastGroup = Gen.choose(1, 1024) private val groupCount = Gen.choose(1, 100) property("Encoding/Decoding String Type") = forAll { (values: Array[String]) => forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) { (rowCount, lastCount, groupCount) => if (values.nonEmpty) { // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding' // Normally, the test case should be: // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class) // Validate if 'values' and 'decoded values' are identical. // But ColumnValues only support read value form DataFile. So, we have to use another way // to validate. val referenceFiberBuilder = StringFiberBuilder(rowCount, 0) val fiberBuilder = PlainBinaryDictionaryFiberBuilder(rowCount, 0, StringType) !(0 until groupCount).exists { group => // If lastCount > rowCount, assume lastCount = rowCount val count = if (group < groupCount - 1) { rowCount } else if (lastCount > rowCount) { rowCount } else { lastCount } (0 until count).foreach { row => fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length)))) referenceFiberBuilder .append(InternalRow(UTF8String.fromString(values(row % values.length)))) } val bytes = fiberBuilder.build().fiberData val dictionary = new PlainBinaryDictionary( new DictionaryPage( BytesInput.from(fiberBuilder.buildDictionary), fiberBuilder.getDictionarySize, org.apache.parquet.column.Encoding.PLAIN)) val fiberParser = PlainDictionaryFiberParser( new OapDataFileMetaV1(rowCountInEachGroup = rowCount), dictionary, StringType) val parsedBytes = fiberParser.parse(bytes, count) val referenceBytes = referenceFiberBuilder.build().fiberData referenceFiberBuilder.clear() referenceFiberBuilder.resetDictionary() fiberBuilder.clear() fiberBuilder.resetDictionary() assert(parsedBytes.length == referenceBytes.length) parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2) } } else { true } } } } class DictionaryBasedEncoderSuite extends SparkFunSuite with Checkers { test("Check Encoding/Decoding") { check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck())) } }
Example 60
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow import scala.collection.JavaConverters._ import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions} import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ import org.apache.arrow.dataset.scanner.ScanOptions import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap; class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable { val batchSize = 4096 def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = { ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava)) } override def inferSchema( sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { convert(files, options) } override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { throw new UnsupportedOperationException("Write is not supported for Arrow source") } override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true override def buildReaderWithPartitionValues(sparkSession: SparkSession, dataSchema: StructType, partitionSchema: StructType, requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { (file: PartitionedFile) => { val sqlConf = sparkSession.sessionState.conf; val enableFilterPushDown = sqlConf.arrowFilterPushDown val factory = ArrowUtils.makeArrowDiscovery( file.filePath, new ArrowOptions( new CaseInsensitiveStringMap( options.asJava).asScala.toMap)) // todo predicate validation / pushdown val dataset = factory.finish(); val filter = if (enableFilterPushDown) { ArrowFilters.translateFilters(filters) } else { org.apache.arrow.dataset.filter.Filter.EMPTY } val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray, filter, batchSize) val scanner = dataset.newScan(scanOptions) val itrList = scanner .scan() .iterator() .asScala .map(task => task.scan()) .toList val itr = itrList .toIterator .flatMap(itr => itr.asScala) .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema)) new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]] } } override def shortName(): String = "arrow" } object ArrowFileFormat { class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] { override def hasNext: Boolean = delegate.hasNext override def next(): T = delegate.next() } }
Example 61
Source File: MatrixUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are // set as not nullable, except values since in the future, support for binary matrices might // be added for which values are not needed. // the sparse matrix needs colPtrs and rowIndices, which are set as // null, while building the dense matrix. StructType(Seq( StructField("type", ByteType, nullable = false), StructField("numRows", IntegerType, nullable = false), StructField("numCols", IntegerType, nullable = false), StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("isTransposed", BooleanType, nullable = false) )) } override def serialize(obj: Matrix): InternalRow = { val row = new GenericInternalRow(7) obj match { case sm: SparseMatrix => row.setByte(0, 0) row.setInt(1, sm.numRows) row.setInt(2, sm.numCols) row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs)) row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices)) row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values)) row.setBoolean(6, sm.isTransposed) case dm: DenseMatrix => row.setByte(0, 1) row.setInt(1, dm.numRows) row.setInt(2, dm.numCols) row.setNullAt(3) row.setNullAt(4) row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values)) row.setBoolean(6, dm.isTransposed) } row } override def deserialize(datum: Any): Matrix = { datum match { case row: InternalRow => require(row.numFields == 7, s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7") val tpe = row.getByte(0) val numRows = row.getInt(1) val numCols = row.getInt(2) val values = row.getArray(5).toDoubleArray() val isTransposed = row.getBoolean(6) tpe match { case 0 => val colPtrs = row.getArray(3).toIntArray() val rowIndices = row.getArray(4).toIntArray() new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) case 1 => new DenseMatrix(numRows, numCols, values, isTransposed) } } } override def userClass: Class[Matrix] = classOf[Matrix] override def equals(o: Any): Boolean = { o match { case v: MatrixUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode() override def typeName: String = "matrix" override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT" private[spark] override def asNullable: MatrixUDT = this }
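The serialize/deserialize pair above packs a Matrix into a 7-field InternalRow and rebuilds it from one. Below is a minimal round-trip sketch, not part of the original example: it assumes it is compiled inside the org.apache.spark.ml.linalg package (MatrixUDT is private[spark], so the sketch lives in that package, as Spark's own tests do), and the matrix values are arbitrary.

// Hypothetical round-trip check; compiled inside org.apache.spark.ml.linalg
// because MatrixUDT is private[spark]. Values are illustrative only.
package org.apache.spark.ml.linalg

object MatrixUDTRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val udt = new MatrixUDT
    val dense = new DenseMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val row = udt.serialize(dense)      // InternalRow with 7 fields, type byte = 1 for dense
    assert(row.numFields == 7 && row.getByte(0) == 1)
    val back = udt.deserialize(row)     // Matrix with the same shape and values
    assert(back.numRows == 2 && back.numCols == 2)
    assert(back.toArray.sameElements(dense.toArray))
  }
}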
Example 62
Source File: VectorUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse // vectors. The "values" field is nullable because we might want to add binary vectors later, // which uses "size" and "indices", but not "values". StructType(Seq( StructField("type", ByteType, nullable = false), StructField("size", IntegerType, nullable = true), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) } override def serialize(obj: Vector): InternalRow = { obj match { case SparseVector(size, indices, values) => val row = new GenericInternalRow(4) row.setByte(0, 0) row.setInt(1, size) row.update(2, UnsafeArrayData.fromPrimitiveArray(indices)) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row case DenseVector(values) => val row = new GenericInternalRow(4) row.setByte(0, 1) row.setNullAt(1) row.setNullAt(2) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row } } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 4, s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4") val tpe = row.getByte(0) tpe match { case 0 => val size = row.getInt(1) val indices = row.getArray(2).toIntArray() val values = row.getArray(3).toDoubleArray() new SparseVector(size, indices, values) case 1 => val values = row.getArray(3).toDoubleArray() new DenseVector(values) } } } override def pyUDT: String = "pyspark.ml.linalg.VectorUDT" override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case v: VectorUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[VectorUDT].getName.hashCode() override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this }
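A minimal sketch, not part of the original example, of the 4-field row layout described above; it again assumes compilation inside org.apache.spark.ml.linalg since VectorUDT is private[spark], and the vector values are arbitrary.

// Hypothetical layout check; values are illustrative only.
package org.apache.spark.ml.linalg

object VectorUDTLayoutSketch {
  def main(args: Array[String]): Unit = {
    val udt = new VectorUDT
    val sparse = new SparseVector(5, Array(1, 3), Array(10.0, 30.0))
    val row = udt.serialize(sparse)
    assert(row.getByte(0) == 0)                                              // type byte: 0 = sparse
    assert(row.getInt(1) == 5)                                               // size
    assert(row.getArray(2).toIntArray().sameElements(Array(1, 3)))           // indices
    assert(row.getArray(3).toDoubleArray().sameElements(Array(10.0, 30.0)))  // values
    assert(udt.deserialize(row).toArray.sameElements(sparse.toArray))
  }
}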
Example 63
Source File: LocalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes = (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
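fromExternalRows above converts external Rows into catalyst InternalRows before building the relation. A minimal usage sketch, not part of the original example, with illustrative attribute names:

// Hypothetical usage; attribute names and row values are illustrative only.
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val output = Seq(
  AttributeReference("id", IntegerType)(),
  AttributeReference("name", StringType)())
val relation = LocalRelation.fromExternalRows(output, Seq(Row(1, "a"), Row(2, "b")))
// relation.data is a Seq[InternalRow]; strings are stored internally as UTF8String.
assert(relation.data.length == 2)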
Example 64
Source File: MonotonicallyIncreasingID.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}

  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
}
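The generated id combines a partition mask with a per-partition counter: the partition index sits in the upper 31 bits and the record count within the partition in the lower 33 bits. A small arithmetic sketch of that layout (the values are arbitrary):

// Hypothetical values, shown only to illustrate the bit layout above.
val partitionIndex = 5
val recordsSeenSoFar = 7L
val partitionMask = partitionIndex.toLong << 33   // 42949672960L
val id = partitionMask + recordsSeenSoFar         // 42949672967L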
Example 65
Source File: misc.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.types._

@ExpressionDescription(
  usage = "_FUNC_() - Returns the current database.",
  extended = """
    Examples:
      > SELECT _FUNC_();
       default
  """)
case class CurrentDatabase() extends LeafExpression with Unevaluable {
  override def dataType: DataType = StringType
  override def foldable: Boolean = true
  override def nullable: Boolean = false
  override def prettyName: String = "current_database"
}
Example 66
Source File: BoundAttribute.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression { override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) { val oev = ctx.currentVars(ordinal) ev.isNull = oev.isNull ev.value = oev.value val code = oev.code oev.code = "" ev.copy(code = code) } else if (nullable) { ev.copy(code = s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);""") } else { ev.copy(code = s"""$javaType ${ev.value} = $value;""", isNull = "false") } } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: AttributeSeq, allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexOf(a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.attrs.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, input(ordinal).nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
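BoundReference reads one column of an InternalRow by ordinal through the type-specialized getters shown above. A minimal evaluation sketch (the row values are arbitrary):

// Hypothetical values, shown only to illustrate eval on an InternalRow.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val row = InternalRow(42, UTF8String.fromString("spark"))
BoundReference(0, IntegerType, nullable = false).eval(row)   // 42
BoundReference(1, StringType, nullable = true).eval(row)     // UTF8String "spark"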
Example 67
Source File: decimalExpressions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types._

case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression {

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val d = input.asInstanceOf[Decimal].clone()
    if (d.changePrecision(dataType.precision, dataType.scale)) {
      d
    } else {
      null
    }
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    nullSafeCodeGen(ctx, ev, eval => {
      val tmp = ctx.freshName("tmp")
      s"""
         | Decimal $tmp = $eval.clone();
         | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) {
         |   ${ev.value} = $tmp;
         | } else {
         |   ${ev.isNull} = true;
         | }
       """.stripMargin
    })
  }

  override def toString: String = s"CheckOverflow($child, $dataType)"

  override def sql: String = child.sql
}
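CheckOverflow returns the input decimal adjusted to the target precision and scale, or null when it cannot fit. A minimal sketch of that behaviour (the literal value and the target types are arbitrary):

// Hypothetical values, shown only to illustrate the null-on-overflow behaviour.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{CheckOverflow, Literal}
import org.apache.spark.sql.types.{Decimal, DecimalType}

val d = Literal(Decimal("123.456"), DecimalType(6, 3))
CheckOverflow(d, DecimalType(4, 1)).eval(InternalRow.empty)   // Decimal 123.5 -- fits precision 4
CheckOverflow(d, DecimalType(3, 1)).eval(InternalRow.empty)   // null -- 123.5 needs 4 digits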
Example 68
Source File: ReferenceToExpressions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable import org.apache.spark.sql.types.DataType case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) extends Expression { override def nullable: Boolean = result.nullable override def dataType: DataType = result.dataType override def checkInputDataTypes(): TypeCheckResult = { if (result.references.nonEmpty) { return TypeCheckFailure("The result expression cannot reference to any attributes.") } var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + s"there are only ${children.length} inputs.") } TypeCheckSuccess } private lazy val projection = UnsafeProjection.create(children) override def eval(input: InternalRow): Any = { result.eval(projection(input)) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val childrenGen = children.map(_.genCode(ctx)) val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map { case (childGen, child) => // SPARK-18125: The children vars are local variables. If the result expression uses // splitExpression, those variables cannot be accessed so compilation fails. // To fix it, we use class variables to hold those local variables. val classChildVarName = ctx.freshName("classChildVar") val classChildVarIsNull = ctx.freshName("classChildVarIsNull") ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "") ctx.addMutableState("boolean", classChildVarIsNull, "") val classChildVar = LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType) val initCode = s"${classChildVar.value} = ${childGen.value};\n" + s"${classChildVar.isNull} = ${childGen.isNull};" (classChildVar, initCode) }.unzip val resultGen = result.transform { case b: BoundReference => classChildrenVars(b.ordinal) }.genCode(ctx) ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\n") + resultGen.code, isNull = resultGen.isNull, value = resultGen.value) } }
Example 69
Source File: GeneratePredicate.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ object GeneratePredicate extends CodeGenerator[Expression, Predicate] { protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in) protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression = BindReferences.bindReference(in, inputSchema) protected def create(predicate: Expression): Predicate = { val ctx = newCodeGenContext() val eval = predicate.genCode(ctx) val codeBody = s""" public SpecificPredicate generate(Object[] references) { return new SpecificPredicate(references); } class SpecificPredicate extends ${classOf[Predicate].getName} { private final Object[] references; ${ctx.declareMutableStates()} public SpecificPredicate(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public void initialize(int partitionIndex) { ${ctx.initPartition()} } ${ctx.declareAddedFunctions()} public boolean eval(InternalRow ${ctx.INPUT_ROW}) { ${eval.code} return !${eval.isNull} && ${eval.value}; } }""" val code = CodeFormatter.stripOverlappingComments( new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] } }
Example 70
Source File: ResolveInlineTables.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}

  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val inputTypes = column.map(_.dataType)
      val tpe = TypeCoercion.findWiderTypeWithoutStringPromotion(inputTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, tpe, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      InternalRow.fromSeq(row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          if (e.dataType.sameType(targetType)) {
            e.eval()
          } else {
            Cast(e, targetType).eval()
          }
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}")
        }
      })
    }

    LocalRelation(attributes, newRows)
  }
}
Example 71
Source File: DeclarativeAggregateEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
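A minimal sketch of driving the evaluator above with a declarative aggregate such as Sum (the attribute name is illustrative, and the helper class above is assumed to be on the classpath):

// Hypothetical usage of the test helper defined above.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.{DeclarativeAggregateEvaluator, Sum}
import org.apache.spark.sql.types.LongType

val input = AttributeReference("value", LongType)()
val evaluator = DeclarativeAggregateEvaluator(Sum(input), Seq(input))
val buffer = evaluator.update(InternalRow(1L), InternalRow(2L), InternalRow(3L))
evaluator.eval(buffer)   // InternalRow containing 6L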
Example 72
Source File: GeneratorExpressionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ class GeneratorExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { private def checkTuple(actual: Expression, expected: Seq[InternalRow]): Unit = { assert(actual.eval(null).asInstanceOf[TraversableOnce[InternalRow]].toSeq === expected) } private final val empty_array = CreateArray(Seq.empty) private final val int_array = CreateArray(Seq(1, 2, 3).map(Literal(_))) private final val str_array = CreateArray(Seq("a", "b", "c").map(Literal(_))) test("explode") { val int_correct_answer = Seq(create_row(1), create_row(2), create_row(3)) val str_correct_answer = Seq(create_row("a"), create_row("b"), create_row("c")) checkTuple(Explode(empty_array), Seq.empty) checkTuple(Explode(int_array), int_correct_answer) checkTuple(Explode(str_array), str_correct_answer) } test("posexplode") { val int_correct_answer = Seq(create_row(0, 1), create_row(1, 2), create_row(2, 3)) val str_correct_answer = Seq(create_row(0, "a"), create_row(1, "b"), create_row(2, "c")) checkTuple(PosExplode(CreateArray(Seq.empty)), Seq.empty) checkTuple(PosExplode(int_array), int_correct_answer) checkTuple(PosExplode(str_array), str_correct_answer) } test("inline") { val correct_answer = Seq(create_row(0, "a"), create_row(1, "b"), create_row(2, "c")) checkTuple( Inline(Literal.create(Array(), ArrayType(new StructType().add("id", LongType)))), Seq.empty) checkTuple( Inline(CreateArray(Seq( CreateStruct(Seq(Literal(0), Literal("a"))), CreateStruct(Seq(Literal(1), Literal("b"))), CreateStruct(Seq(Literal(2), Literal("c"))) ))), correct_answer) } test("stack") { checkTuple(Stack(Seq(1, 1).map(Literal(_))), Seq(create_row(1))) checkTuple(Stack(Seq(1, 1, 2).map(Literal(_))), Seq(create_row(1, 2))) checkTuple(Stack(Seq(2, 1, 2).map(Literal(_))), Seq(create_row(1), create_row(2))) checkTuple(Stack(Seq(2, 1, 2, 3).map(Literal(_))), Seq(create_row(1, 2), create_row(3, null))) checkTuple(Stack(Seq(3, 1, 2, 3).map(Literal(_))), Seq(1, 2, 3).map(create_row(_))) checkTuple(Stack(Seq(4, 1, 2, 3).map(Literal(_))), Seq(1, 2, 3, null).map(create_row(_))) checkTuple( Stack(Seq(3, 1, 1.0, "a", 2, 2.0, "b", 3, 3.0, "c").map(Literal(_))), Seq(create_row(1, 1.0, "a"), create_row(2, 2.0, "b"), create_row(3, 3.0, "c"))) assert(Stack(Seq(Literal(1))).checkInputDataTypes().isFailure) assert(Stack(Seq(Literal(1.0))).checkInputDataTypes().isFailure) assert(Stack(Seq(Literal(1), Literal(1), Literal(1.0))).checkInputDataTypes().isSuccess) assert(Stack(Seq(Literal(2), Literal(1), Literal(1.0))).checkInputDataTypes().isFailure) } }
Example 73
Source File: ExpressionEvalHelperSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, IntegerType}

case class BadCodegenExpression() extends LeafExpression {
  override def nullable: Boolean = false
  override def eval(input: InternalRow): Any = 10
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code =
      s"""
        |int some_variable = 11;
        |int ${ev.value} = 10;
      """.stripMargin)
  }
  override def dataType: DataType = IntegerType
}
Example 74
Source File: ObjectExpressionsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{IntegerType, ObjectType} class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-16622: The returned value of the called method in Invoke can be null") { val inputRow = InternalRow.fromSeq(Seq((false, null))) val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of unsafe-backed data") { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) val arrayExpected = new GenericArrayData( Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) // test UnsafeMap-backed data val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( new GenericArrayData(Array(3, 4)), new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } }
Example 75
Source File: GenerateUnsafeRowJoinerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val random = new Random() val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 76
Source File: ConvertToLocalRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor

class ConvertToLocalRelationSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("LocalRelation", FixedPoint(100),
        ConvertToLocalRelation) :: Nil
  }

  test("Project on LocalRelation should be turned into a single LocalRelation") {
    val testRelation = LocalRelation(
      LocalRelation('a.int, 'b.int).output,
      InternalRow(1, 2) :: InternalRow(4, 5) :: Nil)

    val correctAnswer = LocalRelation(
      LocalRelation('a1.int, 'b1.int).output,
      InternalRow(1, 3) :: InternalRow(4, 6) :: Nil)

    val projectOnLocal = testRelation.select(
      UnresolvedAttribute("a").as("a1"),
      (UnresolvedAttribute("b") + 1).as("b1"))

    val optimized = Optimize.execute(projectOnLocal.analyze)

    comparePlans(optimized, correctAnswer)
  }
}
Example 77
Source File: LocalTableScanExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 78
Source File: RowIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import java.util.NoSuchElementException

import org.apache.spark.sql.catalyst.InternalRow

  def toScala: Iterator[InternalRow] = new RowIteratorToScala(this)
}

object RowIterator {
  def fromScala(scalaIter: Iterator[InternalRow]): RowIterator = {
    scalaIter match {
      case wrappedRowIter: RowIteratorToScala => wrappedRowIter.rowIter
      case _ => new RowIteratorFromScala(scalaIter)
    }
  }
}

private final class RowIteratorToScala(val rowIter: RowIterator) extends Iterator[InternalRow] {
  private [this] var hasNextWasCalled: Boolean = false
  private [this] var _hasNext: Boolean = false

  override def hasNext: Boolean = {
    // Idempotency:
    if (!hasNextWasCalled) {
      _hasNext = rowIter.advanceNext()
      hasNextWasCalled = true
    }
    _hasNext
  }

  override def next(): InternalRow = {
    if (!hasNext) throw new NoSuchElementException
    hasNextWasCalled = false
    rowIter.getRow
  }
}

private final class RowIteratorFromScala(scalaIter: Iterator[InternalRow]) extends RowIterator {
  private[this] var _next: InternalRow = null

  override def advanceNext(): Boolean = {
    if (scalaIter.hasNext) {
      _next = scalaIter.next()
      true
    } else {
      _next = null
      false
    }
  }

  override def getRow: InternalRow = _next

  override def toScala: Iterator[InternalRow] = scalaIter
}
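A minimal sketch of the push-style RowIterator API and the Scala bridge shown above (the rows are arbitrary):

// Hypothetical rows, shown only to illustrate advanceNext/getRow and fromScala.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RowIterator

val rowIter = RowIterator.fromScala(Iterator(InternalRow(1), InternalRow(2)))
while (rowIter.advanceNext()) {
  println(rowIter.getRow)
}
// Converting back and forth never stacks wrappers: fromScala recognises an
// iterator produced by toScala and unwraps it instead of wrapping again.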
Example 79
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 80
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 81
Source File: RecordReaderIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import org.apache.hadoop.mapreduce.RecordReader import org.apache.spark.sql.catalyst.InternalRow class RecordReaderIterator[T]( private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable { private[this] var havePair = false private[this] var finished = false override def hasNext: Boolean = { if (!finished && !havePair) { finished = !rowReader.nextKeyValue if (finished) { // Close and release the reader here; close() will also be called when the task // completes, but for tasks that read from many files, it helps to release the // resources early. close() } havePair = !finished } !finished } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } havePair = false rowReader.getCurrentValue } override def close(): Unit = { if (rowReader != null) { try { // Close the reader and release it. Note: it's very important that we don't close the // reader more than once, since that exposes us to MAPREDUCE-5918 when running against // older Hadoop 2.x releases. That bug can lead to non-deterministic corruption issues // when reading compressed input. rowReader.close() } finally { rowReader = null } } } }
Example 82
Source File: ParquetOutputWriter.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce._
import org.apache.parquet.hadoop.ParquetOutputFormat

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter

// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
private[parquet] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
  extends OutputWriter {

  private val recordWriter: RecordWriter[Void, InternalRow] = {
    new ParquetOutputFormat[InternalRow]() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        new Path(path)
      }
    }.getRecordWriter(context)
  }

  override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal")

  override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row)

  override def close(): Unit = recordWriter.close(context)
}
Example 83
Source File: Exchange.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
Example 84
Source File: BoundOrdering.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Projection

private[window] final case class RangeBoundOrdering(
    ordering: Ordering[InternalRow],
    current: Projection,
    bound: Projection)
  extends BoundOrdering {

  override def compare(
      inputRow: InternalRow,
      inputIndex: Int,
      outputRow: InternalRow,
      outputIndex: Int): Int =
    ordering.compare(current(inputRow), bound(outputRow))
}
Example 85
Source File: NullableColumnAccessor.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 86
Source File: NullableColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 87
Source File: CompressibleColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 88
Source File: CompressibleColumnAccessor.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 89
Source File: GenerateExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.metric.SQLMetrics case class GenerateExec( generator: Generator, join: Boolean, outer: Boolean, generatorOutput: Seq[Attribute], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { if (join) { child.output ++ generatorOutput } else { generatorOutput } } override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) override def producedAttributes: AttributeSet = AttributeSet(output) override def outputPartitioning: Partitioning = child.outputPartitioning val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[InternalRow] = { // boundGenerator.terminate() should be triggered after all of the rows in the partition val rows = if (join) { child.execute().mapPartitionsInternal { iter => val generatorNullRow = new GenericInternalRow(generator.elementSchema.length) val joinedRow = new JoinedRow iter.flatMap { row => // we should always set the left (child output) joinedRow.withLeft(row) val outputRows = boundGenerator.eval(row) if (outer && outputRows.isEmpty) { joinedRow.withRight(generatorNullRow) :: Nil } else { outputRows.map(joinedRow.withRight) } } ++ LazyIterator(boundGenerator.terminate).map { row => // we leave the left side as the last element of its child output // keep it the same as Hive does joinedRow.withRight(row) } } } else { child.execute().mapPartitionsInternal { iter => iter.flatMap(boundGenerator.eval) ++ LazyIterator(boundGenerator.terminate) } } val numOutputRows = longMetric("numOutputRows") rows.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(output, output) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } }
Example 90
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 91
Source File: StreamingRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes) } }
Example 92
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
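A short sketch of the event-time extraction above: an UnsafeProjection pulls the timestamp column out of each row, and the stored microsecond value is divided by 1000 to get milliseconds. The column names and the literal timestamp are made up for illustration.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection}
import org.apache.spark.sql.types.{StringType, TimestampType}
import org.apache.spark.unsafe.types.UTF8String
object EventTimeSketch {
  def main(args: Array[String]): Unit = {
    // Child output: (word: String, ts: Timestamp). TimestampType is stored as a Long in microseconds.
    val word = AttributeReference("word", StringType)()
    val ts = AttributeReference("ts", TimestampType)()
    val getEventTime = UnsafeProjection.create(ts :: Nil, Seq(word, ts))
    val row = InternalRow(UTF8String.fromString("a"), 1500000000000000L)
    // Same arithmetic as the operator above: microseconds / 1000 = milliseconds.
    println(getEventTime(row).getLong(0) / 1000) // 1500000000000
  }
}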
Example 93
Source File: CoGroupedIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
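A minimal sketch of the key comparison this iterator relies on: GenerateOrdering compiles a Seq[SortOrder] over the grouping attributes into an Ordering[InternalRow]. The single integer key and the object name are illustrative assumptions.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.types.IntegerType
object KeyOrderingSketch {
  def main(args: Array[String]): Unit = {
    // One-column grouping key, ordered ascending, as in the iterator above.
    val key = AttributeReference("key", IntegerType)()
    val ordering = GenerateOrdering.generate(SortOrder(key, Ascending) :: Nil, key :: Nil)
    val smaller = InternalRow(1)
    val bigger = InternalRow(2)
    println(ordering.compare(smaller, bigger)) // negative: the left grouping key sorts first
    println(ordering.compare(bigger, bigger))  // 0: same grouping key, so both sides are consumed
  }
}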
Example 94
Source File: ReferenceSort.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 95
Source File: ColumnarTestUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
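The utilities above build rows with GenericInternalRow; a small self-contained sketch of that mutation and read-back API (values and names chosen arbitrarily):
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String
object GenericRowSketch {
  def main(args: Array[String]): Unit = {
    val row = new GenericInternalRow(3)
    row.update(0, 7)                            // same as row(0) = 7 in the utility above
    row.update(1, UTF8String.fromString("abc")) // strings live as UTF8String inside InternalRow
    row.setNullAt(2)
    println(row.getInt(0))          // 7
    println(row.getUTF8String(1))   // abc
    println(row.isNullAt(2))        // true
    println(row.get(1, StringType)) // the generic accessor needs the DataType
  }
}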
Example 96
Source File: ExtraStrategiesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
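The copy() after the UnsafeProjection call above matters because the projection reuses a single output buffer. A hedged sketch (made-up schema and strings) showing why the copy is needed when the result outlives the next call:
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String
object UnsafeCopySketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(StructField("a", StringType) :: Nil)
    val proj = UnsafeProjection.create(schema)
    val first = proj(new GenericInternalRow(Array[Any](UTF8String.fromString("so fast"))))
    val saved = first.copy() // detach from the projection's reused buffer
    proj(new GenericInternalRow(Array[Any](UTF8String.fromString("overwritten"))))
    println(saved.getUTF8String(0)) // still "so fast", because copy() materialised the row
  }
}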
Example 97
Source File: CarbonLoadParams.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management import java.text.SimpleDateFormat import java.util import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.command.UpdateTableModel import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.statusmanager.SegmentStatus import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.events.OperationContext import org.apache.carbondata.processing.loading.model.CarbonLoadModel case class CarbonLoadParams( sparkSession: SparkSession, tableName: String, sizeInBytes: Long, isOverwriteTable: Boolean, carbonLoadModel: CarbonLoadModel, hadoopConf: Configuration, logicalPartitionRelation: LogicalRelation, dateFormat : SimpleDateFormat, timeStampFormat : SimpleDateFormat, optionsOriginal: Map[String, String], finalPartition : Map[String, Option[String]], currPartitions: util.List[PartitionSpec], partitionStatus : SegmentStatus, var dataFrame: Option[DataFrame], scanResultRDD : Option[RDD[InternalRow]], updateModel: Option[UpdateTableModel], operationContext: OperationContext) { }
Example 98
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
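A brief sketch of the per-value conversions this projection applies before wrapping values in a GenericInternalRow; the sample literals are invented, but the converter calls mirror the match above:
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.Decimal
import org.apache.spark.unsafe.types.UTF8String
object ExternalToCatalystSketch {
  def main(args: Array[String]): Unit = {
    val values: Array[Any] = Array(
      UTF8String.fromString("name"),                          // String -> UTF8String
      Decimal(new java.math.BigDecimal("12.50")),             // java BigDecimal -> Decimal
      DateTimeUtils.fromJavaDate(Date.valueOf("2020-01-01")), // Date -> days since epoch (Int)
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2020-01-01 00:00:00"))) // -> microseconds (Long)
    val internal = new GenericInternalRow(values)
    println(internal.getInt(2))  // days since 1970-01-01
    println(internal.getLong(3)) // microseconds since the epoch
  }
}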
Example 99
Source File: SparkUnsafeRowReadSuport.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.readsupport import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport class SparkUnsafeRowReadSuport(requiredSchema: StructType) extends CarbonReadSupport[InternalRow] { private val unsafeProjection = UnsafeProjection.create(requiredSchema) override def initialize(carbonColumns: Array[CarbonColumn], carbonTable: CarbonTable): Unit = { } override def readRow(data: Array[AnyRef]): InternalRow = { unsafeProjection(new GenericInternalRow(data.asInstanceOf[Array[Any]])) } override def close(): Unit = { // Nothing to close } }
Example 100
Source File: CarbonTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.tasklisteners import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.RecordReaderIterator import org.apache.spark.util.TaskCompletionListener import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.memory.UnsafeMemoryManager import org.apache.carbondata.core.util.{DataTypeUtil, ThreadLocalTaskInfo} import org.apache.carbondata.hadoop.internal.ObjectArrayWritable trait CarbonCompactionTaskCompletionListener extends TaskCompletionListener case class CarbonQueryTaskCompletionListenerImpl(iter: RecordReaderIterator[InternalRow], freeMemory: Boolean = false) extends CarbonQueryTaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = { if (iter != null) { try { iter.close() } catch { case e: Exception => LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e) } } if (freeMemory) { UnsafeMemoryManager.INSTANCE .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId) ThreadLocalTaskInfo.clearCarbonTaskInfo() } DataTypeUtil.clearFormatter() } } case class CarbonLoadTaskCompletionListenerImpl(recordWriter: RecordWriter[NullWritable, ObjectArrayWritable], taskAttemptContext: TaskAttemptContext) extends CarbonLoadTaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = { try { recordWriter.close(taskAttemptContext) } finally { UnsafeMemoryManager.INSTANCE .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId) ThreadLocalTaskInfo.clearCarbonTaskInfo() DataTypeUtil.clearFormatter() } } }
Example 101
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 102
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, None, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 103
Source File: RowStreamParserImp.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.streaming.parser import java.text.SimpleDateFormat import java.util import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.processing.loading.ComplexDelimitersEnum import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants class RowStreamParserImp extends CarbonStreamParser { var configuration: Configuration = null var isVarcharTypeMapping: Array[Boolean] = null var structType: StructType = null var encoder: ExpressionEncoder[Row] = null var timeStampFormat: SimpleDateFormat = null var dateFormat: SimpleDateFormat = null var complexDelimiters: util.ArrayList[String] = new util.ArrayList[String]() var serializationNullFormat: String = null override def initialize(configuration: Configuration, structType: StructType, isVarcharTypeMapping: Array[Boolean]): Unit = { this.configuration = configuration this.structType = structType this.encoder = RowEncoder.apply(this.structType).resolveAndBind() this.isVarcharTypeMapping = isVarcharTypeMapping this.timeStampFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT)) this.dateFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_DATE_FORMAT)) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_1")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_2")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_3")) this.complexDelimiters.add(ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value()) this.serializationNullFormat = this.configuration.get(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT) } override def parserRow(value: InternalRow): Array[Object] = { this.encoder.fromRow(value).toSeq.zipWithIndex.map { case (x, i) => FieldConverter.objectToString( x, serializationNullFormat, complexDelimiters, timeStampFormat, dateFormat, isVarcharType = i < this.isVarcharTypeMapping.length && this.isVarcharTypeMapping(i), binaryCodec = null) } }.toArray override def close(): Unit = { } }
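A small sketch of the RowEncoder round trip the parser relies on, using the Spark 2.x encoder API (toRow/fromRow) that appears above; the schema and values are illustrative assumptions:
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object RowEncoderSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
    val encoder = RowEncoder(schema).resolveAndBind()
    // External Row -> InternalRow and back, as parserRow does for each incoming InternalRow.
    val internal = encoder.toRow(Row(1, "abc"))
    val external = encoder.fromRow(internal)
    println(external.getInt(0) + " " + external.getString(1)) // 1 abc
  }
}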
Example 104
Source File: VectorUDT.scala From ann4s with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.nn import ann4s.{Vector0, Vector16, Vector8, Vector32, Vector64, Vector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ class VectorUDT extends UserDefinedType[Vector] { override def sqlType: DataType = _sqlType override def serialize(obj: Vector): InternalRow = { val row = new GenericInternalRow(5) row.setNullAt(1) row.setNullAt(2) row.setNullAt(3) row.setNullAt(4) obj match { case Vector0 => row.setByte(0, 0) case Vector8(values, w, b) => row.setByte(0, 1) row.update(1, UnsafeArrayData.fromPrimitiveArray(values)) row.update(3, UnsafeArrayData.fromPrimitiveArray(Array(w, b))) case Vector16(values) => row.setByte(0, 2) row.update(2, UnsafeArrayData.fromPrimitiveArray(values)) case Vector32(values) => row.setByte(0, 3) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) case Vector64(values) => row.setByte(0, 4) row.update(4, UnsafeArrayData.fromPrimitiveArray(values)) } row } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 5, s"nn.VectorUDT.deserialize given row with length ${row.numFields} but requires length == 5") val tpe = row.getByte(0) tpe match { case 0 => Vector0 case 1 => val wb = row.getArray(3).toFloatArray() Vector8(row.getArray(1).toByteArray(), wb(0), wb(1)) case 2 => Vector16(row.getArray(2).toShortArray()) case 3 => Vector32(row.getArray(3).toFloatArray()) case 4 => Vector64(row.getArray(4).toDoubleArray()) } } } override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case _: VectorUDT => true case _ => false } } override def hashCode(): Int = classOf[VectorUDT].getName.hashCode override def typeName: String = "nn.vector" private[spark] override def asNullable: VectorUDT = this private[this] val _sqlType = { StructType(Seq( StructField("type", ByteType, nullable = false), StructField("fixed8", ArrayType(ByteType, containsNull = false), nullable = true), StructField("fixed16", ArrayType(ShortType, containsNull = false), nullable = true), StructField("float32", ArrayType(FloatType, containsNull = false), nullable = true), StructField("float64", ArrayType(DoubleType, containsNull = false), nullable = true))) } } object VectorUDT { def register(): Unit = { UDTRegistration.register("ann4s.Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.EmptyVector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Fixed8Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Fixed16Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Float32Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Float64Vector", "org.apache.spark.ml.nn.VectorUDT") } }
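A minimal sketch of the UnsafeArrayData packing used by this UDT's serialize/deserialize pair, mirroring the Vector32 branch with an invented float array:
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
object UnsafeArraySketch {
  def main(args: Array[String]): Unit = {
    // Primitive arrays are stored in rows as UnsafeArrayData, as the UDT above does.
    val arr = UnsafeArrayData.fromPrimitiveArray(Array(1.0f, 2.0f, 3.0f))
    val row = new GenericInternalRow(2)
    row.setByte(0, 3) // type tag, mirroring the Vector32 branch
    row.update(1, arr)
    println(row.getByte(0))                                // 3
    println(row.getArray(1).toFloatArray().mkString(","))  // 1.0,2.0,3.0
  }
}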
Example 105
Source File: ScanPartition.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.Item import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter import com.audienceproject.spark.dynamodb.connector.DynamoConnector import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} import org.apache.spark.sql.types.{StructField, StructType} import scala.collection.JavaConverters._ class ScanPartition(schema: StructType, partitionIndex: Int, connector: DynamoConnector, filters: Array[Filter]) extends InputPartition[InternalRow] { private val requiredColumns = schema.map(_.name) @transient private lazy val typeConversions = schema.collect({ case StructField(name, dataType, _, _) => name -> TypeConversion(name, dataType) }).toMap override def createPartitionReader(): InputPartitionReader[InternalRow] = { if (connector.isEmpty) new EmptyReader else new PartitionReader } private class EmptyReader extends InputPartitionReader[InternalRow] { override def next(): Boolean = false override def get(): InternalRow = throw new IllegalStateException("Unable to call get() on empty iterator") override def close(): Unit = {} } private class PartitionReader extends InputPartitionReader[InternalRow] { private val pageIterator = connector.scan(partitionIndex, requiredColumns, filters).pages().iterator().asScala private val rateLimiter = RateLimiter.create(connector.readLimit) private var innerIterator: Iterator[InternalRow] = Iterator.empty private var currentRow: InternalRow = _ private var proceed = false override def next(): Boolean = { proceed = true innerIterator.hasNext || { if (pageIterator.hasNext) { nextPage() next() } else false } } override def get(): InternalRow = { if (proceed) { currentRow = innerIterator.next() proceed = false } currentRow } override def close(): Unit = {} private def nextPage(): Unit = { val page = pageIterator.next() val result = page.getLowLevelResult Option(result.getScanResult.getConsumedCapacity).foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1)) innerIterator = result.getItems.iterator().asScala.map(itemToRow(requiredColumns)) } } private def itemToRow(requiredColumns: Seq[String])(item: Item): InternalRow = if (requiredColumns.nonEmpty) InternalRow.fromSeq(requiredColumns.map(columnName => typeConversions(columnName)(item))) else InternalRow.fromSeq(item.asMap().asScala.values.toSeq.map(_.toString)) }
Example 106
Source File: DynamoWriterFactory.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.audienceproject.spark.dynamodb.connector.{ColumnSchema, TableConnector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.sql.types.StructType class DynamoWriterFactory(connector: TableConnector, parameters: Map[String, String], schema: StructType) extends DataWriterFactory[InternalRow] { private val batchSize = parameters.getOrElse("writebatchsize", "25").toInt private val update = parameters.getOrElse("update", "false").toBoolean private val delete = parameters.getOrElse("delete", "false").toBoolean private val region = parameters.get("region") private val roleArn = parameters.get("rolearn") override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = { val columnSchema = new ColumnSchema(connector.keySchema, schema) val client = connector.getDynamoDB(region, roleArn) if (update) { assert(!delete, "Please provide exactly one of 'update' or 'delete' options.") new DynamoUpdateWriter(columnSchema, connector, client) } else if (delete) { new DynamoBatchDeleteWriter(batchSize, columnSchema, connector, client) } else { new DynamoBatchWriter(batchSize, columnSchema, connector, client) } } }
Example 107
Source File: TypeConversion.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.{IncompatibleTypeException, Item} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ private[dynamodb] object TypeConversion { def apply(attrName: String, sparkType: DataType): Item => Any = sparkType match { case BooleanType => nullableGet(_.getBOOL)(attrName) case StringType => nullableGet(item => attrName => UTF8String.fromString(item.getString(attrName)))(attrName) case IntegerType => nullableGet(_.getInt)(attrName) case LongType => nullableGet(_.getLong)(attrName) case DoubleType => nullableGet(_.getDouble)(attrName) case FloatType => nullableGet(_.getFloat)(attrName) case BinaryType => nullableGet(_.getBinary)(attrName) case DecimalType() => nullableGet(_.getNumber)(attrName) case ArrayType(innerType, _) => nullableGet(_.getList)(attrName).andThen(extractArray(convertValue(innerType))) case MapType(keyType, valueType, _) => if (keyType != StringType) throw new IllegalArgumentException(s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") nullableGet(_.getRawMap)(attrName).andThen(extractMap(convertValue(valueType))) case StructType(fields) => val nestedConversions = fields.collect({ case StructField(name, dataType, _, _) => name -> convertValue(dataType) }) nullableGet(_.getRawMap)(attrName).andThen(extractStruct(nestedConversions)) case _ => throw new IllegalArgumentException(s"Spark DataType '${sparkType.typeName}' could not be mapped to a corresponding DynamoDB data type.") } private val stringConverter = (value: Any) => UTF8String.fromString(value.asInstanceOf[String]) private def convertValue(sparkType: DataType): Any => Any = sparkType match { case IntegerType => nullableConvert(_.intValue()) case LongType => nullableConvert(_.longValue()) case DoubleType => nullableConvert(_.doubleValue()) case FloatType => nullableConvert(_.floatValue()) case DecimalType() => nullableConvert(identity) case ArrayType(innerType, _) => extractArray(convertValue(innerType)) case MapType(keyType, valueType, _) => if (keyType != StringType) throw new IllegalArgumentException(s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") extractMap(convertValue(valueType)) case StructType(fields) => val nestedConversions = fields.collect({ case StructField(name, dataType, _, _) => name -> convertValue(dataType) }) extractStruct(nestedConversions) case BooleanType => { case boolean: Boolean => boolean case _ => null } case StringType => { case string: String => UTF8String.fromString(string) case _ => null } case BinaryType => { case byteArray: Array[Byte] => byteArray case _ => null } case _ => throw new IllegalArgumentException(s"Spark DataType '${sparkType.typeName}' could not be mapped to a corresponding DynamoDB data type.") } private def nullableGet(getter: Item => String => Any)(attrName: String): Item => Any = { case item if item.hasAttribute(attrName) => try getter(item)(attrName) catch { case _: NumberFormatException => null case _: IncompatibleTypeException => null } case _ => null } private def nullableConvert(converter: java.math.BigDecimal => Any): Any => Any = { case item: java.math.BigDecimal => converter(item) case _ => null } private def extractArray(converter: Any => Any): Any => Any = { case list: java.util.List[_] => new GenericArrayData(list.asScala.map(converter)) case set: java.util.Set[_] => new GenericArrayData(set.asScala.map(converter).toSeq) case _ => null } private def extractMap(converter: Any => Any): Any => Any = { case map: java.util.Map[_, _] => ArrayBasedMapData(map, stringConverter, converter) case _ => null } private def extractStruct(conversions: Seq[(String, Any => Any)]): Any => Any = { case map: java.util.Map[_, _] => InternalRow.fromSeq(conversions.map({ case (name, conv) => conv(map.get(name)) })) case _ => null } }
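A short sketch of the catalyst collection values these converters produce: GenericArrayData for extractArray and InternalRow.fromSeq for extractStruct. The literal values are made up for illustration.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.unsafe.types.UTF8String
object CatalystValueSketch {
  def main(args: Array[String]): Unit = {
    // Array value, as extractArray above builds from a java.util.List.
    val arr = new GenericArrayData(Array[Any](1, 2, 3))
    println(arr.getInt(2)) // 3
    // Struct value, as extractStruct above builds: one converted value per field, in field order.
    val struct = InternalRow.fromSeq(Seq(UTF8String.fromString("key-1"), 42L, null))
    println(struct.getUTF8String(0)) // key-1
    println(struct.isNullAt(2))      // true
  }
}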
Example 108
Source File: DynamoBatchWriter.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.DynamoDB import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter import com.audienceproject.spark.dynamodb.connector.{ColumnSchema, TableConnector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} import scala.collection.mutable.ArrayBuffer class DynamoBatchWriter(batchSize: Int, columnSchema: ColumnSchema, connector: TableConnector, client: DynamoDB) extends DataWriter[InternalRow] { protected val buffer: ArrayBuffer[InternalRow] = new ArrayBuffer[InternalRow](batchSize) protected val rateLimiter: RateLimiter = RateLimiter.create(connector.writeLimit) override def write(record: InternalRow): Unit = { buffer += record.copy() if (buffer.size == batchSize) { flush() } } override def commit(): WriterCommitMessage = { flush() new WriterCommitMessage {} } override def abort(): Unit = {} protected def flush(): Unit = { if (buffer.nonEmpty) { connector.putItems(columnSchema, buffer)(client, rateLimiter) buffer.clear() } } }
Example 109
Source File: JavaConverter.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.catalyst import java.util import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ object JavaConverter { def convertRowValue(row: InternalRow, index: Int, elementType: DataType): Any = { elementType match { case ArrayType(innerType, _) => convertArray(row.getArray(index), innerType) case MapType(keyType, valueType, _) => convertMap(row.getMap(index), keyType, valueType) case StructType(fields) => convertStruct(row.getStruct(index, fields.length), fields) case StringType => row.getString(index) case _ => row.get(index, elementType) } } def convertArray(array: ArrayData, elementType: DataType): Any = { elementType match { case ArrayType(innerType, _) => array.toSeq[ArrayData](elementType).map(convertArray(_, innerType)).asJava case MapType(keyType, valueType, _) => array.toSeq[MapData](elementType).map(convertMap(_, keyType, valueType)).asJava case structType: StructType => array.toSeq[InternalRow](structType).map(convertStruct(_, structType.fields)).asJava case StringType => convertStringArray(array).asJava case _ => array.toSeq[Any](elementType).asJava } } def convertMap(map: MapData, keyType: DataType, valueType: DataType): util.Map[String, Any] = { if (keyType != StringType) throw new IllegalArgumentException( s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") val keys = convertStringArray(map.keyArray()) val values = valueType match { case ArrayType(innerType, _) => map.valueArray().toSeq[ArrayData](valueType).map(convertArray(_, innerType)) case MapType(innerKeyType, innerValueType, _) => map.valueArray().toSeq[MapData](valueType).map(convertMap(_, innerKeyType, innerValueType)) case structType: StructType => map.valueArray().toSeq[InternalRow](structType).map(convertStruct(_, structType.fields)) case StringType => convertStringArray(map.valueArray()) case _ => map.valueArray().toSeq[Any](valueType) } val kvPairs = for (i <- 0 until map.numElements()) yield keys(i) -> values(i) Map(kvPairs: _*).asJava } def convertStruct(row: InternalRow, fields: Seq[StructField]): util.Map[String, Any] = { val kvPairs = for (i <- 0 until row.numFields) yield if (row.isNullAt(i)) fields(i).name -> null else fields(i).name -> convertRowValue(row, i, fields(i).dataType) Map(kvPairs: _*).asJava } def convertStringArray(array: ArrayData): Seq[String] = array.toSeq[UTF8String](StringType).map(_.toString) }
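The converters above read nested values back out of an InternalRow with the typed accessors; a small sketch (invented row shape) of those accessors:
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.unsafe.types.UTF8String
object NestedReadSketch {
  def main(args: Array[String]): Unit = {
    // A row of shape (name: String, scores: Array[Int], address: Struct(city: String)).
    val row = InternalRow(
      UTF8String.fromString("bob"),
      new GenericArrayData(Array[Any](1, 2)),
      InternalRow(UTF8String.fromString("berlin")))
    println(row.getString(0))                            // bob (getString delegates to getUTF8String)
    println(row.getArray(1).toIntArray().mkString(","))  // 1,2
    println(row.getStruct(2, 1).getString(0))            // berlin; the nested struct has 1 field
  }
}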
Example 110
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }
Example 111
Source File: SocketTextSource.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider} import org.apache.spark.sql.streaming.StreamPlan import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.dstream.DStream class SocketTextSource extends SchemaRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { require(parameters.contains("host") && parameters.contains("port") && parameters.contains("messageToRow")) val messageToRow = { try { val clz = Class.forName(parameters("messageToRow")) clz.newInstance().asInstanceOf[MessageToRowConverter] } catch { case e: Exception => sys.error(s"Failed to load class : ${e.toString}") } } new SocketTextRelation( parameters("host"), parameters("port").toInt, messageToRow, schema, sqlContext) } } case class SocketTextRelation( host: String, port: Int, messageToRowConverter: MessageToRowConverter, val schema: StructType, @transient val sqlContext: SQLContext) extends StreamBaseRelation with StreamPlan { // Currently only support Kafka with String messages @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream( host, port) @transient val stream: DStream[InternalRow] = socketStream.map(messageToRowConverter.toRow(_, schema)) }
Example 112
Source File: MessageDelimiter.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.Logging import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal} import org.apache.spark.sql.types.StructType class MessageDelimiter extends MessageToRowConverter with Logging { val delimiter = " " def toRow(msg: String, schema: StructType): InternalRow = { val splitted = msg.split(delimiter).map(Literal(_)) val casted = splitted.indices.map(i => Cast(splitted(i), schema(i).dataType).eval(EmptyRow)) InternalRow.fromSeq(casted) } def toMessage(row: Row): String = row.mkString(delimiter) } trait MessageToRowConverter extends Serializable { def toRow(message: String, schema: StructType): InternalRow def toMessage(row: Row): String }
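A compact sketch of the Literal-plus-Cast parsing idiom used in toRow above, with an invented three-field schema and message:
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}
object DelimiterParseSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("word", StringType),
      StructField("count", IntegerType),
      StructField("score", DoubleType)))
    // Same idea as toRow above: wrap each token in a Literal and Cast it to the field's type.
    val tokens = "spark 3 0.75".split(" ").map(Literal(_))
    val casted = tokens.indices.map(i => Cast(tokens(i), schema(i).dataType).eval(EmptyRow))
    val row = InternalRow.fromSeq(casted)
    println(row.getInt(1) + row.getDouble(2)) // 3.75
  }
}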
Example 113
Source File: StreamPlan.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] object StreamPlan { val currentContext = new ThreadLocal[StreamSQLContext]() } trait StreamPlan { protected var validTime: Time = null def streamSqlContext = StreamPlan.currentContext.get() def stream: DStream[InternalRow] def setValidTime(time: Time): Unit = { validTime = time } }
Example 114
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 115
Source File: JCUDACodegenIterator.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package com.ibm.gpuenabler import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Encoder abstract class JCUDACodegenIterator extends Iterator[InternalRow] { def hasNext() : Boolean def next() : InternalRow def init[T](itr : java.util.Iterator[InternalRow], args: Array[Any], size : Int, cached: Int, gpuPtrs: java.util.List[java.util.Map[String, CachedGPUMeta]], blockID: Int, userGridSizes: Array[Array[Int]], userBlockSizes: Array[Array[Int]], stages: Int, smSize: Int, inpEnc: Encoder[T]) }
Example 116
Source File: ArrowEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.StructType case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { private val batchSize = conf.arrowMaxRecordsPerBatch private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pandasRespectSessionTimeZone = conf.pandasRespectSessionTimeZone protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) val columnarBatchIter = new ArrowPythonRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty } override def hasNext: Boolean = currentIter.hasNext || { if (columnarBatchIter.hasNext) { currentIter = columnarBatchIter.next().rowIterator.asScala hasNext } else { false } } override def next(): InternalRow = currentIter.next() } } }
Example 117
Source File: BatchEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StructField, StructType} case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { EvaluatePython.registerPicklers() // register pickler for Row val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) // enable memo iff we serialize the row with schema (schema and class should be memorized) val pickle = new Pickler(needConversion) // Input iterator to Python: input rows are grouped so we send them in batches to Python. // For each row, add it to the queue. val inputIterator = iter.map { row => if (needConversion) { EvaluatePython.toJava(row, schema) } else { // fast path for these types that does not need conversion in Python val fields = new Array[Any](row.numFields) var i = 0 while (i < row.numFields) { val dt = dataTypes(i) fields(i) = EvaluatePython.toJava(row.get(i, dt), dt) i += 1 } fields } }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. val outputIterator = new PythonUDFRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) val unpickle = new Unpickler val mutableRow = new GenericInternalRow(1) val resultType = if (udfs.length == 1) { udfs.head.dataType } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } val fromJava = EvaluatePython.makeFromJava(resultType) outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF mutableRow(0) = fromJava(result) mutableRow } else { fromJava(result).asInstanceOf[InternalRow] } } } }
Example 118
Source File: BoundOrdering.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Projection private[window] final case class RangeBoundOrdering( ordering: Ordering[InternalRow], current: Projection, bound: Projection) extends BoundOrdering { override def compare( inputRow: InternalRow, inputIndex: Int, outputRow: InternalRow, outputIndex: Int): Int = ordering.compare(current(inputRow), bound(outputRow)) }
Example 119
Source File: NullableColumnAccessor.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position() + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 120
Source File: NullableColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 121
Source File: CompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 122
Source File: CompressibleColumnAccessor.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor} import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor { this: NativeColumnAccessor[T] => private var decoder: Decoder[T] = _ abstract override protected def initialize(): Unit = { super.initialize() decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType) } abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext override def extractSingle(row: InternalRow, ordinal: Int): Unit = { decoder.next(row, ordinal) } def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = decoder.decompress(columnVector, capacity) }
Example 123
Source File: MicroBatchWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWriter { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) override def createWriterFactory(): DataWriterFactory[Row] = writer.createWriterFactory() } class InternalRowMicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWriter with SupportsWriteInternalRow { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) override def createInternalRowWriterFactory(): DataWriterFactory[InternalRow] = writer match { case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory() case _ => throw new IllegalStateException( "InternalRowMicroBatchWriter should only be created with base writer support") } }
Example 124
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
Example 125
Source File: CoGroupedIterator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
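A minimal usage sketch for CoGroupedIterator, assuming it runs inside a Spark application (the class and its Catalyst dependencies live in Spark-internal packages). Both inputs must already be grouped and sorted by the grouping key, here a single int column:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.execution.CoGroupedIterator
import org.apache.spark.sql.types.IntegerType

val key = AttributeReference("key", IntegerType, nullable = false)()

val left = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 10), InternalRow(1, 11))),
  (InternalRow(3), Iterator(InternalRow(3, 30))))
val right = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 100))),
  (InternalRow(2), Iterator(InternalRow(2, 200))))

// Emits one entry per key: 1 (both sides), 2 (right only), 3 (left only).
new CoGroupedIterator(left, right, Seq(key)).foreach {
  case (groupKey, leftRows, rightRows) =>
    println(s"key=${groupKey.getInt(0)} left=${leftRows.size} right=${rightRows.size}")
}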
Example 126
Source File: ReferenceSort.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 127
Source File: ColumnarTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
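A short usage sketch for the helpers above. It has to sit in the same package because INT, STRING and the other column types are private[columnar], and it assumes the Spark SQL test sources are on the classpath:

package org.apache.spark.sql.execution.columnar

object ColumnarTestUtilsDemo {
  def main(args: Array[String]): Unit = {
    // One random row with an int and a string column, plus an all-null row.
    val randomRow = ColumnarTestUtils.makeRandomRow(INT, STRING)
    val nullRow   = ColumnarTestUtils.makeNullRow(2)

    println(randomRow.getInt(0))         // random Int
    println(randomRow.getUTF8String(1))  // random UTF8String
    println(nullRow.isNullAt(0))         // true
  }
}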
Example 128
Source File: ExtraStrategiesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
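Besides assigning spark.experimental.extraStrategies at runtime, as the test does, a strategy can also be injected when the session is built. A hedged sketch using SparkSessionExtensions (available since Spark 2.2), assuming TestStrategy from the example above is in scope:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("extra-strategy-demo")
  // Register the planner strategy once, instead of mutating experimental.extraStrategies.
  .withExtensions(_.injectPlannerStrategy(_ => TestStrategy))
  .getOrCreate()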
Example 129
Source File: DataSourceTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String private[sql] abstract class DataSourceTest extends QueryTest { protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], enableRegex: Boolean = false) { test(sqlString) { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> enableRegex.toString) { checkAnswer(spark.sql(sqlString), expectedAnswer) } } } } class DDLScanSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { SimpleDDLScan( parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext.sparkSession) } } case class SimpleDDLScan( from: Int, to: Int, table: String)(@transient val sparkSession: SparkSession) extends BaseRelation with TableScan { override def sqlContext: SQLContext = sparkSession.sqlContext override def schema: StructType = StructType(Seq( StructField("intType", IntegerType, nullable = false).withComment(s"test comment $table"), StructField("stringType", StringType, nullable = false), StructField("dateType", DateType, nullable = false), StructField("timestampType", TimestampType, nullable = false), StructField("doubleType", DoubleType, nullable = false), StructField("bigintType", LongType, nullable = false), StructField("tinyintType", ByteType, nullable = false), StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false), StructField("fixedDecimalType", DecimalType(5, 1), nullable = false), StructField("binaryType", BinaryType, nullable = false), StructField("booleanType", BooleanType, nullable = false), StructField("smallIntType", ShortType, nullable = false), StructField("floatType", FloatType, nullable = false), StructField("mapType", MapType(StringType, StringType)), StructField("arrayType", ArrayType(StringType)), StructField("structType", StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil ) ) )) override def needConversion: Boolean = false override def buildScan(): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] sparkSession.sparkContext.parallelize(from to to).map { e => InternalRow(UTF8String.fromString(s"people$e"), e * 2) }.asInstanceOf[RDD[Row]] } }
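SimpleDDLScan above sets needConversion = false and smuggles InternalRows through an RDD[Row] via type erasure. A sketch of the conventional alternative, returning external Rows and letting Spark do the conversion; SimpleExternalRowScan is a hypothetical name, not part of the suite:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// needConversion stays at its default (true): return ordinary Rows and
// Spark converts them to InternalRows behind the scenes.
case class SimpleExternalRowScan(from: Int, to: Int)(@transient val spark: SparkSession)
  extends BaseRelation with TableScan {

  override def sqlContext: SQLContext = spark.sqlContext

  override def schema: StructType = StructType(Seq(
    StructField("name", StringType, nullable = false),
    StructField("doubled", IntegerType, nullable = false)))

  override def buildScan(): RDD[Row] =
    spark.sparkContext.parallelize(from to to).map(e => Row(s"people$e", e * 2))
}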
Example 130
Source File: UnivocityGenerator.scala From mimir with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.ubodin.csv import java.io.Writer import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ private[csv] class UnivocityGenerator( schema: StructType, writer: Writer, options: CSVOptions) { private val writerSettings = options.asWriterSettings writerSettings.setHeaders(schema.fieldNames: _*) private val gen = new CsvWriter(writer, writerSettings) private var printHeader = options.headerFlag // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`. // When the value is null, this converter should not be called. private type ValueConverter = (InternalRow, Int) => String // `ValueConverter`s for all values in the fields of the schema private val valueConverters: Array[ValueConverter] = schema.map(_.dataType).map(makeConverter).toArray private def makeConverter(dataType: DataType): ValueConverter = dataType match { case DateType => (row: InternalRow, ordinal: Int) => options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal))) case TimestampType => (row: InternalRow, ordinal: Int) => options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal))) case udt: UserDefinedType[_] => makeConverter(udt.sqlType) case dt: DataType => (row: InternalRow, ordinal: Int) => row.get(ordinal, dt).toString } private def convertRow(row: InternalRow): Seq[String] = { var i = 0 val values = new Array[String](row.numFields) while (i < row.numFields) { if (!row.isNullAt(i)) { values(i) = valueConverters(i).apply(row, i) } else { values(i) = options.nullValue } i += 1 } values } def write(row: InternalRow): Unit = { if (printHeader) { gen.writeHeaders() } gen.writeRow(convertRow(row): _*) printHeader = false } def close(): Unit = gen.close() def flush(): Unit = gen.flush() }
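The Date and Timestamp converters above depend on Catalyst's internal encodings: DateType is stored as days since 1970-01-01 (an Int) and TimestampType as microseconds since the epoch (a Long). A small sketch of those conversions; the literal values are arbitrary:

import org.apache.spark.sql.catalyst.util.DateTimeUtils

val days   = 18000                    // an internal DateType value (days since epoch)
val micros = 1555200000L * 1000000L   // an internal TimestampType value (microseconds)

println(DateTimeUtils.toJavaDate(days))        // java.sql.Date
println(DateTimeUtils.toJavaTimestamp(micros)) // java.sql.Timestamp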
Example 131
Source File: PDFDataSource.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.datasource.pdf import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.types._ import org.apache.spark.sql.sources.v2.reader._ import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow import java.util.{Collections, List => JList, Optional} import org.apache.spark.unsafe.types.UTF8String import mimir.exec.spark.datasource.csv.CSVDataSourceReader class DefaultSource extends DataSourceV2 with ReadSupport { def createReader(options: DataSourceOptions) = { val path = options.get("path").get val pages = options.get("pages").orElse("all") val area = Option(options.get("area").orElse(null)) val hasGrid = options.get("gridLines").orElse("false").toBoolean val pdfExtractor = new PDFTableExtractor() val outPath = s"${path}.csv" pdfExtractor.defaultExtract(path, pages, area, Some(outPath), hasGrid) //println(s"------PDFDataSource----$path -> $outPath") //println({scala.io.Source.fromFile(outPath).mkString}) new CSVDataSourceReader(outPath, options.asMap().asScala.toMap) } }
Example 132
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types.StructType def write(row: Row): Unit = { val labelColumnName = options.getOrElse("labelColumnName", "label") val featuresColumnName = options.getOrElse("featuresColumnName", "features") val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName)) record.writeTo(byteArrayOutputStream) recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray)) byteArrayOutputStream.reset() } override def close(): Unit = { recordWriter.close(context) } }
Example 133
Source File: StarryLocalTableScanExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class StarryLocalTableScanExec( tableName: String, output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val rdd = new StarryRDD(sparkContext, tableName, unsafeRows) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.length) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.length) taken } }
Example 134
Source File: StarryHashJoinExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} case class StarryHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") val rows = SparkPlanExecutor.doExec(buildPlan) val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null) streamedPlan.execute().mapPartitions { streamedIter => join(streamedIter, hashed, numOutputRows, avgHashProbe) } } }
Example 135
Source File: StarryTakeOrderedAndProjectExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.util.Utils case class StarryTakeOrderedAndProjectExec( limit: Int, sortOrder: Seq[SortOrder], projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { projectList.map(_.toAttribute) } override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) data.map(r => proj(r).copy()) } else { data } } protected override def doExecute(): RDD[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val localTopK: RDD[InternalRow] = { child.execute().map(_.copy()).mapPartitions { iter => org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord) } } localTopK.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) topK.map(r => proj(r)) } else { topK } } } override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = SinglePartition override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" } }
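The operator above builds its per-partition top-K with LazilyGeneratedOrdering. A standalone sketch of that ordering on plain InternalRows; it substitutes Scala's sorted for the private[spark] takeOrdered utility used in the operator:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, BoundReference, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering
import org.apache.spark.sql.types.IntegerType

// Order rows by their first (int) column, ascending.
val ord = new LazilyGeneratedOrdering(
  Seq(SortOrder(BoundReference(0, IntegerType, nullable = false), Ascending)))

val rows = Seq(InternalRow(3), InternalRow(1), InternalRow(2))
val top2 = rows.sorted(ord).take(2)

println(top2.map(_.getInt(0))) // List(1, 2)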
Example 136
Source File: StarryUnionExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class StarryUnionExec(children: Seq[SparkPlan]) extends SparkPlan { override def output: Seq[Attribute] = children.map(_.output).transpose.map(attrs => attrs.head.withNullability(attrs.exists(_.nullable))) protected override def doExecute(): RDD[InternalRow] = { val b = children.flatMap(child => { SparkPlanExecutor.doExec(child) }) new StarryRDD(sparkContext, b) } }
Example 137
Source File: StarryLocalRelation.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.logical import org.apache.spark.sql.catalyst.{InternalRow, analysis} import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics} override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
Example 138
Source File: SparkPlanExecutor.scala From starry with Apache License 2.0 | 5 votes |
package com.github.passionke.starry import org.apache.spark.{Partition, StarryTaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.{ReuseSubquery, SparkPlan} object SparkPlanExecutor { def exec(plan: SparkPlan, sparkSession: SparkSession) = { val newPlan = Seq( ReuseSubquery(sparkSession.sessionState.conf)) .foldLeft(plan) { case (sp, rule) => rule.apply(sp) } doExec(newPlan) } def firstPartition(rdd: RDD[InternalRow]): Partition = { rdd.partitions.head } def doExec(sparkPlan: SparkPlan): List[InternalRow] = { val rdd = sparkPlan.execute().map(ite => ite.copy()) val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } def rddCompute(rdd: RDD[InternalRow]): List[InternalRow] = { val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } }
Example 139
Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import java.nio.ByteBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} import org.apache.spark.sql.types.{BinaryType, StringType} private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], inputSchema: Seq[Attribute]) extends Logging { private var producer: KinesisProducer = _ private val projection = createProjection private val streamName = producerConfiguration.getOrElse( KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") def execute(iterator: Iterator[InternalRow]): Unit = { producer = CachedKinesisProducer.getOrCreate(producerConfiguration) while (iterator.hasNext) { val currentRow = iterator.next() val projectedRow = projection(currentRow) val partitionKey = projectedRow.getString(0) val data = projectedRow.getBinary(1) sendData(partitionKey, data) } } def sendData(partitionKey: String, data: Array[Byte]): String = { var sentSeqNumbers = new String val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = { logError(s"Writing to $streamName failed due to ${t.getCause}") } override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId sentSeqNumbers = result.getSequenceNumber } } Futures.addCallback(future, kinesisCallBack) producer.flushSync() sentSeqNumbers } def close(): Unit = { if (producer != null) { producer.flush() producer = null } } private def createProjection: UnsafeProjection = { val partitionKeyExpression = inputSchema .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) partitionKeyExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") ) dataExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } UnsafeProjection.create( Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) } }
Example 140
Source File: relations.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.LeafNode import org.apache.spark.sql.catalyst.InternalRow abstract class RecursiveRelation(name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { @transient final val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] override def simpleString = s"$nodeName " + output.mkString("[", ",", "]") + "(" + name + ")" override def outputPartitioning: Partitioning = { if (partitioning == null || partitioning.isEmpty) UnknownPartitioning(0) else new HashPartitioning(partitioning.zip(output).filter(_._1 == 1).map(_._2), bigDatalogContext.conf.numShufflePartitions) } override def outputsUnsafeRows: Boolean = true override def doExecute(): RDD[InternalRow] = { bigDatalogContext.getRDD(name) } } case class LinearRecursiveRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation(name, output, partitioning) case class NonLinearRecursiveRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation("all_" + name, output, partitioning) case class AggregateRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation(name, output, partitioning)
Example 141
Source File: AggregateSetRDDMinMaxPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd

import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo
import edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates.{AggregateSetRDDPartition, KeyValueToInternalRowIterator, MonotonicAggregate}
import edu.ucla.cs.wis.bigdatalog.spark.storage.map.UnsafeFixedWidthMonotonicAggregationMap
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class AggregateSetRDDMinMaxPartition(aggregateStore: UnsafeFixedWidthMonotonicAggregationMap,
                                     schemaInfo: SchemaInfo,
                                     monotonicAggregate: MonotonicAggregate)
  extends AggregateSetRDDPartition(aggregateStore, schemaInfo, monotonicAggregate) with Serializable {

  def size: Int = aggregateStore.numElements()

  def iterator: Iterator[InternalRow] = {
    KeyValueToInternalRowIterator(aggregateStore.iterator(), monotonicAggregate.generateResultProjection())
  }

  // update() merges the results produced during the iteration into this partition.
  // During update():
  //  - the underlying aggregateSetRDDPartition storage is updated.
  //  - a 2nd aggregateSetRDDPartition is produced, indicating the rows that changed during the merge.
  // This is similar to regular aggregation, except that we re-use the same hashmap each iteration.
  def update(iter: Iterator[InternalRow],
             monotonicAggregate: MonotonicAggregate): (AggregateSetRDDPartition, SetRDDPartition[InternalRow]) = {
    val start = System.currentTimeMillis()
    val before = aggregateStore.numElements()

    // this is going to perform the aggregation and return an iterator over the output
    val maIter = monotonicAggregate.getAggregationIterator(iter, aggregateStore)

    logInfo("Update deltaSPrime set size before %s after %s, delta set size %s took %s ms"
      .format(before, aggregateStore.numElements(), maIter.deltaSet.size, System.currentTimeMillis() - start))

    val hashMapIter = new JavaHashMapIterator(maIter.deltaSet, monotonicAggregate.generateResultProjection())

    (new AggregateSetRDDMinMaxPartition(aggregateStore, schemaInfo, monotonicAggregate),
      SetRDDHashSetPartition(hashMapIter, schemaInfo))
  }
}

class JavaHashMapIterator(hashMap: java.util.HashMap[UnsafeRow, UnsafeRow],
                          resultProjection: (UnsafeRow, UnsafeRow) => UnsafeRow) extends Iterator[InternalRow] {
  val iterator = hashMap.entrySet().iterator()

  override def hasNext: Boolean = iterator.hasNext

  override def next: InternalRow = {
    val entry = iterator.next()
    resultProjection(entry.getKey, entry.getValue)
  }
}
Example 142
Source File: AggregateSetRDDPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo import edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd.SetRDDPartition import edu.ucla.cs.wis.bigdatalog.spark.storage.map.UnsafeFixedWidthMonotonicAggregationMap import org.apache.spark.sql.catalyst.InternalRow abstract class AggregateSetRDDPartition(val aggregateStore: UnsafeFixedWidthMonotonicAggregationMap, val schemaInfo: SchemaInfo, val monotonicAggregate: MonotonicAggregate) extends Serializable with org.apache.spark.Logging { def this() = this(null, null, null) def size: Int def iterator: Iterator[InternalRow] def update(iter: Iterator[InternalRow], monotonicAggregate: MonotonicAggregate): (AggregateSetRDDPartition, SetRDDPartition[InternalRow]) }
Example 143
Source File: SetRDDHashSetPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo import edu.ucla.cs.wis.bigdatalog.spark.storage.HashSetManager import edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset.{HashSet, HashSetRowIterator} import org.apache.spark.sql.catalyst.InternalRow import scala.reflect.ClassTag class SetRDDHashSetPartition(val set: HashSet, schemaInfo: SchemaInfo, numFactsGenerated: Long = 0, numFactsDerived: Long = 0) (implicit val cTag: ClassTag[InternalRow]) extends SetRDDPartition[InternalRow](numFactsGenerated, numFactsDerived) with Serializable { def this(set: HashSet, schemaInfo: SchemaInfo) = this(set, schemaInfo, 0, 0) override def size: Long = set.size override def iterator: Iterator[InternalRow] = HashSetRowIterator.create(set) override def union(otherPart: SetRDDPartition[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() val newPartition = otherPart match { case otherPart: SetRDDHashSetPartition => { val set : HashSet = this.set.union(otherPart.set) new SetRDDHashSetPartition(set, schemaInfo) } case other => union(otherPart.iterator, rddId) } logInfo("Union set size %s for rdd %s took %s ms".format(this.set.size, rddId, System.currentTimeMillis() - start)) newPartition } override def union(iter: Iterator[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() // add items to the existing set val newSet = this.set while (iter.hasNext) newSet.insert(iter.next()) logInfo("Union set size %s for rdd %s took %s ms".format(this.set.size, rddId, System.currentTimeMillis() - start)) new SetRDDHashSetPartition(newSet, schemaInfo) } override def diff(iter: Iterator[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() val diffSet = HashSetManager.create(schemaInfo) //var row: InternalRow = null var numFactsGenerated: Long = 0 while (iter.hasNext) { //row = iter.next() numFactsGenerated += 1 this.set.ifNotExistsInsert(iter.next(), diffSet) //if (!this.set.exists(row)) // diffSet.insert(row) } logInfo("Diff set size %s for rdd %s took %s ms".format(diffSet.size, rddId, System.currentTimeMillis() - start)) new SetRDDHashSetPartition(diffSet, schemaInfo, numFactsGenerated, diffSet.size) } } object SetRDDHashSetPartition { def apply(iter: Iterator[InternalRow], schemaInfo: SchemaInfo): SetRDDHashSetPartition = { val set = HashSetManager.create(schemaInfo) while (iter.hasNext) set.insert(iter.next()) new SetRDDHashSetPartition(set, schemaInfo) } }
Example 144
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 145
Source File: RelationCatalog.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import scala.collection.mutable.HashMap class RelationCatalog extends Serializable { val directory = HashMap.empty[String, RelationInfo] def addRelation(name : String, schema : StructType) : Unit = { val relationInfo = new RelationInfo().setSchema(schema) directory.get(name) match { case Some(oldRelationInfo) => // update rdd if already present. Schema should not change oldRelationInfo.setRDD(relationInfo.getRDD()) case None => directory.put(name, relationInfo) } } def setRDD(name : String, rdd : RDD[InternalRow]) : Unit = { directory.get(name) match { case Some(oldRelationInfo) => oldRelationInfo.setRDD(rdd) case None => directory.put(name, new RelationInfo().setRDD(rdd)) } } def getRelationInfo(name : String) : RelationInfo = { if (directory.contains(name)) directory(name) else null } def removeRDD(name : String) : Unit = { directory.remove(name) } def clear() : Unit = { directory.clear() } override def toString(): String = { val output = new StringBuilder() directory.iterator.foreach(f => output.append(f.toString())) output.toString() } } class RelationInfo() extends Serializable { private var schema : StructType = _ private var rdd : RDD[InternalRow] = _ def getSchema() : StructType = schema def setSchema(schema : StructType) : RelationInfo = { this.schema = schema this } def getRDD() : RDD[InternalRow] = rdd def setRDD(rdd : RDD[InternalRow]) : RelationInfo = { this.rdd = rdd this } override def toString() : String = { "schema: " + this.schema + (if (rdd != null) " RDD") } }
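A minimal usage sketch for RelationCatalog: register a schema, attach an RDD of InternalRows, then look the relation up. It assumes an existing SparkContext named sc:

import edu.ucla.cs.wis.bigdatalog.spark.RelationCatalog
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val catalog = new RelationCatalog()
val schema = StructType(StructField("arc", IntegerType, nullable = false) :: Nil)

catalog.addRelation("edges", schema)
catalog.setRDD("edges", sc.parallelize(Seq(InternalRow(1), InternalRow(2)))) // sc: SparkContext

val info = catalog.getRelationInfo("edges")
println(info.getSchema())       // the schema registered above
println(info.getRDD().count())  // 2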
Example 146
Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { rawIter.next() } } class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() val uRow = new UnsafeRow() val bufferHolder = new BufferHolder() val rowWriter = new UnsafeRowWriter() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { bufferHolder.reset() rowWriter.initialize(bufferHolder, 1) rowWriter.write(0, rawIter.next()) uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize()) uRow } } class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() val numFields = set.schemaInfo.arity val uRow = new UnsafeRow() val bufferHolder = new BufferHolder() val rowWriter = new UnsafeRowWriter() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { bufferHolder.reset() rowWriter.initialize(bufferHolder, numFields) val value = rawIter.nextLong() if (numFields == 2) { rowWriter.write(0, (value >> 32).toInt) rowWriter.write(1, value.toInt) } else { rowWriter.write(0, value) } uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize()) uRow } } object HashSetRowIterator { def create(set: HashSet): Iterator[InternalRow] = { set match { //case set: UnsafeFixedWidthSet => set.iterator().asScala case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set) case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set) case set: ObjectHashSet => new ObjectHashSetRowIterator(set) } } }
Example 147
Source File: LocalRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs = Iterator(output) override def sameResult(plan: LogicalPlan): Boolean = plan match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } override lazy val statistics = Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length) }
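A sketch of building this (BigDatalog / Spark 1.6 style) LocalRelation from external Rows with the fromExternalRows helper above; the converter turns each Row into an InternalRow, so strings end up stored as UTF8String:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val output = Seq(
  AttributeReference("name", StringType, nullable = false)(),
  AttributeReference("age", IntegerType, nullable = false)())

val relation = LocalRelation.fromExternalRows(output, Seq(Row("ann", 30), Row("bob", 25)))

println(relation.data.head) // an InternalRow holding [ann,30]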
Example 148
Source File: MonotonicallyIncreasingID.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
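The generated id packs the partition id into the bits above position 33 and a per-partition record counter into the low 33 bits, so ids are unique across partitions and increasing within one. A small worked sketch of that packing (plain arithmetic, no Spark needed):

// partitionMask = partitionId.toLong << 33; the counter occupies the low 33 bits.
def monotonicId(partitionId: Int, recordIndex: Long): Long =
  (partitionId.toLong << 33) + recordIndex

println(monotonicId(0, 0)) // 0
println(monotonicId(0, 2)) // 2
println(monotonicId(1, 0)) // 8589934592 (= 1L << 33)
println(monotonicId(3, 5)) // 25769803781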
Example 149
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian(); """ } }
Example 150
Source File: BoundAttribute.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression with NamedExpression { override def toString: String = s"input[$ordinal, $dataType]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def name: String = s"i[$ordinal]" override def toAttribute: Attribute = throw new UnsupportedOperationException override def qualifiers: Seq[String] = throw new UnsupportedOperationException override def exprId: ExprId = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value); """ } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: Seq[Attribute], allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexWhere(_.exprId == a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, a.nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
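A short sketch of how BindReferences and BoundReference fit together: bind an AttributeReference against an input schema, then evaluate the resulting bound expression against an InternalRow (strings are stored internally as UTF8String):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BindReferences, BoundReference, Expression}
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val a = AttributeReference("a", IntegerType, nullable = false)()
val b = AttributeReference("b", StringType, nullable = false)()

// Bind `b` against the input schema [a, b]: it resolves to ordinal 1.
val bound: Expression = BindReferences.bindReference[Expression](b, Seq(a, b))
println(bound) // input[1, StringType]

val row = InternalRow(42, UTF8String.fromString("hello"))
println(BoundReference(1, StringType, nullable = false).eval(row)) // hello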
Example 151
Source File: decimalExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression { override def nullable: Boolean = true override def nullSafeEval(input: Any): Any = { val d = input.asInstanceOf[Decimal].clone() if (d.changePrecision(dataType.precision, dataType.scale)) { d } else { null } } override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { nullSafeCodeGen(ctx, ev, eval => { val tmp = ctx.freshName("tmp") s""" | Decimal $tmp = $eval.clone(); | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) { | ${ev.value} = $tmp; | } else { | ${ev.isNull} = true; | } """.stripMargin }) } override def toString: String = s"CheckOverflow($child, $dataType)" }
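CheckOverflow delegates to Decimal.changePrecision, which returns false when the value cannot be represented with the target precision and scale; that false is what becomes a null result above. A small sketch of the underlying calls; the literal values are arbitrary:

import org.apache.spark.sql.types.Decimal

val fits = Decimal("123.456")
println(fits.changePrecision(6, 3)) // true: 123.456 fits DecimalType(6, 3)
println(fits)                       // 123.456

val overflows = Decimal("123.456")
println(overflows.changePrecision(4, 2)) // false: rounding to 123.46 needs precision 5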
Example 152
Source File: GeneratePredicate.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Boolean] { protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in) protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression = BindReferences.bindReference(in, inputSchema) protected def create(predicate: Expression): ((InternalRow) => Boolean) = { val ctx = newCodeGenContext() val eval = predicate.gen(ctx) val code = s""" public SpecificPredicate generate($exprType[] expr) { return new SpecificPredicate(expr); } class SpecificPredicate extends ${classOf[Predicate].getName} { private final $exprType[] expressions; ${declareMutableStates(ctx)} ${declareAddedFunctions(ctx)} public SpecificPredicate($exprType[] expr) { expressions = expr; ${initMutableStates(ctx)} } public boolean eval(InternalRow ${ctx.INPUT_ROW}) { ${eval.code} return !${eval.isNull} && ${eval.value}; } }""" logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") val p = compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] (r: InternalRow) => p.eval(r) } }
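A usage sketch for the generator above: compile the predicate input[0] > 10 into an (InternalRow) => Boolean and apply it to a couple of rows. It assumes a Spark runtime where Catalyst code generation (Janino) is available:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, GreaterThan, Literal}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
import org.apache.spark.sql.types.IntegerType

val predicate = GeneratePredicate.generate(
  GreaterThan(BoundReference(0, IntegerType, nullable = false), Literal(10)))

println(predicate(InternalRow(42))) // true
println(predicate(InternalRow(7)))  // false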
Example 153
Source File: SortOrder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types._ import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.BinaryPrefixComparator import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.DoublePrefixComparator abstract sealed class SortDirection case object Ascending extends SortDirection case object Descending extends SortDirection case class SortPrefix(child: SortOrder) extends UnaryExpression { override def eval(input: InternalRow): Any = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val childCode = child.child.gen(ctx) val input = childCode.value val BinaryPrefixCmp = classOf[BinaryPrefixComparator].getName val DoublePrefixCmp = classOf[DoublePrefixComparator].getName val (nullValue: Long, prefixCode: String) = child.child.dataType match { case BooleanType => (Long.MinValue, s"$input ? 1L : 0L") case _: IntegralType => (Long.MinValue, s"(long) $input") case DateType | TimestampType => (Long.MinValue, s"(long) $input") case FloatType | DoubleType => (DoublePrefixComparator.computePrefix(Double.NegativeInfinity), s"$DoublePrefixCmp.computePrefix((double)$input)") case StringType => (0L, s"$input.getPrefix()") case BinaryType => (0L, s"$BinaryPrefixCmp.computePrefix($input)") case dt: DecimalType if dt.precision - dt.scale <= Decimal.MAX_LONG_DIGITS => val prefix = if (dt.precision <= Decimal.MAX_LONG_DIGITS) { s"$input.toUnscaledLong()" } else { // reduce the scale to fit in a long val p = Decimal.MAX_LONG_DIGITS val s = p - (dt.precision - dt.scale) s"$input.changePrecision($p, $s) ? $input.toUnscaledLong() : ${Long.MinValue}L" } (Long.MinValue, prefix) case dt: DecimalType => (DoublePrefixComparator.computePrefix(Double.NegativeInfinity), s"$DoublePrefixCmp.computePrefix($input.toDouble())") case _ => (0L, "0L") } childCode.code + s""" |long ${ev.value} = ${nullValue}L; |boolean ${ev.isNull} = false; |if (!${childCode.isNull}) { | ${ev.value} = $prefixCode; |} """.stripMargin } override def dataType: DataType = LongType }
Example 154
Source File: RowTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema} import org.apache.spark.sql.types._ import org.scalatest.{Matchers, FunSpec} class RowTest extends FunSpec with Matchers { val schema = StructType( StructField("col1", StringType) :: StructField("col2", StringType) :: StructField("col3", IntegerType) :: Nil) val values = Array("value1", "value2", 1) val valuesWithoutCol3 = Array[Any](null, "value2", null) val sampleRow: Row = new GenericRowWithSchema(values, schema) val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema) val noSchemaRow: Row = new GenericRow(values) describe("Row (without schema)") { it("throws an exception when accessing by fieldName") { intercept[UnsupportedOperationException] { noSchemaRow.fieldIndex("col1") } intercept[UnsupportedOperationException] { noSchemaRow.getAs("col1") } } } describe("Row (with schema)") { it("fieldIndex(name) returns field index") { sampleRow.fieldIndex("col1") shouldBe 0 sampleRow.fieldIndex("col3") shouldBe 2 } it("getAs[T] retrieves a value by fieldname") { sampleRow.getAs[String]("col1") shouldBe "value1" sampleRow.getAs[Int]("col3") shouldBe 1 } it("Accessing non existent field throws an exception") { intercept[IllegalArgumentException] { sampleRow.getAs[String]("non_existent") } } it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") { val expected = Map( "col1" -> "value1", "col2" -> "value2" ) sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected } it("getValuesMap() retrieves null value on non AnyVal Type") { val expected = Map( "col1" -> null, "col2" -> "value2" ) sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected } it("getAs() on type extending AnyVal throws an exception when accessing field that is null") { intercept[NullPointerException] { sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3")) } } it("getAs() on type extending AnyVal does not throw exception when value is null"){ sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null } } describe("row equals") { val externalRow = Row(1, 2) val externalRow2 = Row(1, 2) val internalRow = InternalRow(1, 2) val internalRow2 = InternalRow(1, 2) it("equality check for external rows") { externalRow shouldEqual externalRow2 } it("equality check for internal rows") { internalRow shouldEqual internalRow2 } } }
Example 155
Source File: GenerateUnsafeRowJoinerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
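Usage note (a minimal sketch over two tiny fixed schemas, assuming these Catalyst helpers are visible from the calling package): the external-Row-to-UnsafeRow conversion and the generated joiner can be exercised like this.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.types._

val schema1 = StructType(Seq(StructField("a", IntegerType)))
val schema2 = StructType(Seq(StructField("b", StringType)))

// External Row -> InternalRow -> UnsafeRow, the same two-step conversion the test uses.
val row1 = UnsafeProjection.create(schema1)(
  CatalystTypeConverters.createToCatalystConverter(schema1)(Row(1)).asInstanceOf[InternalRow])
val row2 = UnsafeProjection.create(schema2)(
  CatalystTypeConverters.createToCatalystConverter(schema2)(Row("x")).asInstanceOf[InternalRow])

// Concatenate the two UnsafeRows into one row laid out as schema1 ++ schema2.
val joiner = GenerateUnsafeRowJoiner.create(schema1, schema2)
val joined = joiner.join(row1, row2)
// joined.getInt(0) == 1 && joined.getUTF8String(1).toString == "x"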
Example 156
Source File: ConvertToLocalRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor class ConvertToLocalRelationSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("LocalRelation", FixedPoint(100), ConvertToLocalRelation) :: Nil } test("Project on LocalRelation should be turned into a single LocalRelation") { val testRelation = LocalRelation( LocalRelation('a.int, 'b.int).output, InternalRow(1, 2) :: InternalRow(4, 5) :: Nil) val correctAnswer = LocalRelation( LocalRelation('a1.int, 'b1.int).output, InternalRow(1, 3) :: InternalRow(4, 6) :: Nil) val projectOnLocal = testRelation.select( UnresolvedAttribute("a").as("a1"), (UnresolvedAttribute("b") + 1).as("b1")) val optimized = Optimize.execute(projectOnLocal.analyze) comparePlans(optimized, correctAnswer) } }
Example 157
Source File: RowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import java.util.NoSuchElementException

import org.apache.spark.sql.catalyst.InternalRow

abstract class RowIterator {
  // Advance to the next row; returns false once the iterator is exhausted.
  def advanceNext(): Boolean

  // The current row; only valid after advanceNext() has returned true.
  def getRow: InternalRow

  def toScala: Iterator[InternalRow] = new RowIteratorToScala(this)
}

object RowIterator {
  def fromScala(scalaIter: Iterator[InternalRow]): RowIterator = {
    scalaIter match {
      case wrappedRowIter: RowIteratorToScala => wrappedRowIter.rowIter
      case _ => new RowIteratorFromScala(scalaIter)
    }
  }
}

private final class RowIteratorToScala(val rowIter: RowIterator) extends Iterator[InternalRow] {
  private[this] var hasNextWasCalled: Boolean = false
  private[this] var _hasNext: Boolean = false
  override def hasNext: Boolean = {
    // Idempotency:
    if (!hasNextWasCalled) {
      _hasNext = rowIter.advanceNext()
      hasNextWasCalled = true
    }
    _hasNext
  }
  override def next(): InternalRow = {
    if (!hasNext) throw new NoSuchElementException
    hasNextWasCalled = false
    rowIter.getRow
  }
}

private final class RowIteratorFromScala(scalaIter: Iterator[InternalRow]) extends RowIterator {
  private[this] var _next: InternalRow = null
  override def advanceNext(): Boolean = {
    if (scalaIter.hasNext) {
      _next = scalaIter.next()
      true
    } else {
      _next = null
      false
    }
  }
  override def getRow: InternalRow = _next
  override def toScala: Iterator[InternalRow] = scalaIter
}
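Usage note (a minimal sketch): the advanceNext()/getRow protocol replaces the usual hasNext/next pair of calls per row; fromScala and toScala convert between the two styles.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RowIterator

def firstRows(iter: Iterator[InternalRow], n: Int): Seq[InternalRow] = {
  val rowIter = RowIterator.fromScala(iter)
  val out = scala.collection.mutable.ArrayBuffer.empty[InternalRow]
  while (out.size < n && rowIter.advanceNext()) {
    out += rowIter.getRow.copy()   // the underlying row may be reused, so copy before buffering
  }
  out
}
// RowIterator.fromScala(rowIter.toScala) unwraps rather than double-wrapping.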
Example 158
Source File: SeqScanNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class SeqScanNode(conf: SQLConf, output: Seq[Attribute], data: Seq[InternalRow]) extends LeafLocalNode(conf) { private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { iterator = data.iterator } override def next(): Boolean = { if (iterator.hasNext) { currentRow = iterator.next() true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = { // Do nothing } }
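Usage note (a minimal sketch; LocalNode and its subclasses are internal classes, so this assumes they are visible from the calling package): SeqScanNode and the other local nodes in the following examples all share the same open/next/fetch/close protocol.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.local.LocalNode

def collectRows(node: LocalNode): Seq[InternalRow] = {
  node.open()
  val out = scala.collection.mutable.ArrayBuffer.empty[InternalRow]
  while (node.next()) {
    out += node.fetch().copy()   // fetch() may return a reused mutable row, so copy
  }
  node.close()
  out
}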
Example 159
Source File: FilterNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate case class FilterNode(conf: SQLConf, condition: Expression, child: LocalNode) extends UnaryLocalNode(conf) { private[this] var predicate: (InternalRow) => Boolean = _ override def output: Seq[Attribute] = child.output override def open(): Unit = { child.open() predicate = GeneratePredicate.generate(condition, child.output) } override def next(): Boolean = { var found = false while (!found && child.next()) { found = predicate.apply(child.fetch()) } found } override def fetch(): InternalRow = child.fetch() override def close(): Unit = child.close() }
Example 160
Source File: ExpandNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection} case class ExpandNode( conf: SQLConf, projections: Seq[Seq[Expression]], output: Seq[Attribute], child: LocalNode) extends UnaryLocalNode(conf) { assert(projections.size > 0) private[this] var result: InternalRow = _ private[this] var idx: Int = _ private[this] var input: InternalRow = _ private[this] var groups: Array[Projection] = _ override def open(): Unit = { child.open() groups = projections.map(ee => newProjection(ee, child.output)).toArray idx = groups.length } override def next(): Boolean = { if (idx >= groups.length) { if (child.next()) { input = child.fetch() idx = 0 } else { return false } } result = groups(idx)(input) idx += 1 true } override def fetch(): InternalRow = result override def close(): Unit = child.close() }
Example 161
Source File: IntersectNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import scala.collection.mutable import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class IntersectNode(conf: SQLConf, left: LocalNode, right: LocalNode) extends BinaryLocalNode(conf) { override def output: Seq[Attribute] = left.output private[this] var leftRows: mutable.HashSet[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { left.open() leftRows = mutable.HashSet[InternalRow]() while (left.next()) { leftRows += left.fetch().copy() } left.close() right.open() } override def next(): Boolean = { currentRow = null while (currentRow == null && right.next()) { currentRow = right.fetch() if (!leftRows.contains(currentRow)) { currentRow = null } } currentRow != null } override def fetch(): InternalRow = currentRow override def close(): Unit = { left.close() right.close() } }
Example 162
Source File: BinaryHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{HashedRelation, BuildLeft, BuildRight, BuildSide} case class BinaryHashJoinNode( conf: SQLConf, leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: LocalNode, right: LocalNode) extends BinaryLocalNode(conf) with HashJoinNode { protected override val (streamedNode, streamedKeys) = buildSide match { case BuildLeft => (right, rightKeys) case BuildRight => (left, leftKeys) } private val (buildNode, buildKeys) = buildSide match { case BuildLeft => (left, leftKeys) case BuildRight => (right, rightKeys) } override def output: Seq[Attribute] = left.output ++ right.output private def buildSideKeyGenerator: Projection = { // We are expecting the data types of buildKeys and streamedKeys are the same. assert(buildKeys.map(_.dataType) == streamedKeys.map(_.dataType)) UnsafeProjection.create(buildKeys, buildNode.output) } protected override def doOpen(): Unit = { buildNode.open() val hashedRelation = HashedRelation(buildNode, buildSideKeyGenerator) // We have built the HashedRelation. So, close buildNode. buildNode.close() streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation) } override def close(): Unit = { // Please note that we do not need to call the close method of our buildNode because // it has been called in this.open. streamedNode.close() } }
Example 163
Source File: HashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins._ private def fetchNextMatch(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamedNode.next()) { currentStreamedRow = streamedNode.fetch() val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashed.get(key) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } override def fetch(): InternalRow = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } resultProjection(ret) } }
Example 164
Source File: SampleNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} case class SampleNode( conf: SQLConf, lowerBound: Double, upperBound: Double, withReplacement: Boolean, seed: Long, child: LocalNode) extends UnaryLocalNode(conf) { override def output: Seq[Attribute] = child.output private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { child.open() val sampler = if (withReplacement) { // Disable gap sampling since the gap sampling method buffers two rows internally, // requiring us to copy the row, which is more expensive than the random number generator. new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false) } else { new BernoulliCellSampler[InternalRow](lowerBound, upperBound) } sampler.setSeed(seed) iterator = sampler.sample(child.asIterator) } override def next(): Boolean = { if (iterator.hasNext) { currentRow = iterator.next() true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = child.close() }
Example 165
Source File: UnionNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class UnionNode(conf: SQLConf, children: Seq[LocalNode]) extends LocalNode(conf) { override def output: Seq[Attribute] = children.head.output private[this] var currentChild: LocalNode = _ private[this] var nextChildIndex: Int = _ override def open(): Unit = { currentChild = children.head currentChild.open() nextChildIndex = 1 } private def advanceToNextChild(): Boolean = { var found = false var exit = false while (!exit && !found) { if (currentChild != null) { currentChild.close() } if (nextChildIndex >= children.size) { found = false exit = true } else { currentChild = children(nextChildIndex) nextChildIndex += 1 currentChild.open() found = currentChild.next() } } found } override def close(): Unit = { if (currentChild != null) { currentChild.close() } } override def fetch(): InternalRow = currentChild.fetch() override def next(): Boolean = { if (currentChild.next()) { true } else { advanceToNextChild() } } }
Example 166
Source File: BroadcastHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation} case class BroadcastHashJoinNode( conf: SQLConf, streamedKeys: Seq[Expression], streamedNode: LocalNode, buildSide: BuildSide, buildOutput: Seq[Attribute], hashedRelation: Broadcast[HashedRelation]) extends UnaryLocalNode(conf) with HashJoinNode { override val child = streamedNode // Because we do not pass in the buildNode, we take the output of buildNode to // create the inputSet properly. override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput) override def output: Seq[Attribute] = buildSide match { case BuildRight => streamedNode.output ++ buildOutput case BuildLeft => buildOutput ++ streamedNode.output } protected override def doOpen(): Unit = { streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation.value) } override def close(): Unit = { streamedNode.close() } }
Example 167
Source File: TakeOrderedAndProjectNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.util.BoundedPriorityQueue case class TakeOrderedAndProjectNode( conf: SQLConf, limit: Int, sortOrder: Seq[SortOrder], projectList: Option[Seq[NamedExpression]], child: LocalNode) extends UnaryLocalNode(conf) { private[this] var projection: Option[Projection] = _ private[this] var ord: InterpretedOrdering = _ private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def output: Seq[Attribute] = { val projectOutput = projectList.map(_.map(_.toAttribute)) projectOutput.getOrElse(child.output) } override def open(): Unit = { child.open() projection = projectList.map(new InterpretedProjection(_, child.output)) ord = new InterpretedOrdering(sortOrder, child.output) // Priority keeps the largest elements, so let's reverse the ordering. val queue = new BoundedPriorityQueue[InternalRow](limit)(ord.reverse) while (child.next()) { queue += child.fetch() } // Close it eagerly since we don't need it. child.close() iterator = queue.toArray.sorted(ord).iterator } override def next(): Boolean = { if (iterator.hasNext) { val _currentRow = iterator.next() currentRow = projection match { case Some(p) => p(_currentRow) case None => _currentRow } true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = child.close() }
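Semantics note (a minimal sketch on plain Ints; BoundedPriorityQueue is package-private, so this assumes it is used from an org.apache.spark package): the top-N trick above keeps the queue's largest elements under a reversed ordering, then sorts the survivors back into the requested order.

import org.apache.spark.util.BoundedPriorityQueue

val ord = Ordering.Int
val queue = new BoundedPriorityQueue[Int](3)(ord.reverse)
Seq(9, 1, 7, 3, 5).foreach(queue += _)
val smallestThree = queue.toArray.sorted(ord)   // Array(1, 3, 5)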
Example 168
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 169
Source File: SortPrefixUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator} object SortPrefixUtils { def createPrefixGenerator(schema: StructType): UnsafeExternalRowSorter.PrefixComputer = { if (schema.nonEmpty) { val boundReference = BoundReference(0, schema.head.dataType, nullable = true) val prefixProjection = UnsafeProjection.create( SortPrefix(SortOrder(boundReference, Ascending))) new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = { prefixProjection.apply(row).getLong(0) } } } else { new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = 0 } } } }
Example 170
Source File: LeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, ClusteredDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) => if (condition.isEmpty) { val hashSet = buildKeyHashSet(buildIter, numRightRows) hashSemiJoin(streamIter, numLeftRows, hashSet, numOutputRows) } else { val hashRelation = HashedRelation(buildIter, numRightRows, rightKeyGenerator) hashSemiJoin(streamIter, numLeftRows, hashRelation, numOutputRows) } } } }
Example 171
Source File: HashSemiJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashSemiJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val left: SparkPlan val right: SparkPlan val condition: Option[Expression] override def output: Seq[Attribute] = left.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def leftKeyGenerator: Projection = UnsafeProjection.create(leftKeys, left.output) protected def rightKeyGenerator: Projection = UnsafeProjection.create(rightKeys, right.output) @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected def buildKeyHashSet( buildIter: Iterator[InternalRow], numBuildRows: LongSQLMetric): java.util.Set[InternalRow] = { val hashSet = new java.util.HashSet[InternalRow]() // Create a Hash set of buildKeys val rightKey = rightKeyGenerator while (buildIter.hasNext) { val currentRow = buildIter.next() numBuildRows += 1 val rowKey = rightKey(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey.copy()) } } } hashSet } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashSet: java.util.Set[InternalRow], numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator streamIter.filter(current => { numStreamRows += 1 val key = joinKeys(current) val r = !key.anyNull && hashSet.contains(key) if (r) numOutputRows += 1 r }) } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator val joinedRow = new JoinedRow streamIter.filter { current => numStreamRows += 1 val key = joinKeys(current) lazy val rowBuffer = hashedRelation.get(key) val r = !key.anyNull && rowBuffer != null && rowBuffer.exists { (row: InternalRow) => boundCondition(joinedRow(current, row)) } if (r) numOutputRows += 1 r } } }
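Semantics note (a minimal sketch over plain Scala collections, not InternalRows): both hashSemiJoin variants implement left-semi semantics, i.e. each streamed (left) row is emitted at most once, and no columns from the build (right) side appear in the output.

val left = Seq(("a", 1), ("b", 2), ("c", 3))
val rightKeys = Set("a", "c")   // stands in for the build-side key set / hashed relation
val semiJoined = left.filter { case (k, _) => rightKeys.contains(k) }
// Seq(("a", 1), ("c", 3))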
Example 172
Source File: BroadcastLeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val input = right.execute().map { row => numRightRows += 1 row.copy() }.collect() if (condition.isEmpty) { val hashSet = buildKeyHashSet(input.toIterator, SQLMetrics.nullLongMetric) val broadcastedRelation = sparkContext.broadcast(hashSet) left.execute().mapPartitionsInternal { streamIter => hashSemiJoin(streamIter, numLeftRows, broadcastedRelation.value, numOutputRows) } } else { val hashRelation = HashedRelation(input.toIterator, SQLMetrics.nullLongMetric, rightKeyGenerator, input.size) val broadcastedRelation = sparkContext.broadcast(hashRelation) left.execute().mapPartitionsInternal { streamIter => val hashedRelation = broadcastedRelation.value hashedRelation match { case unsafe: UnsafeHashedRelation => TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) case _ => } hashSemiJoin(streamIter, numLeftRows, hashedRelation, numOutputRows) } } } }
Example 173
Source File: LeftSemiJoinBNL.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map { row => numRightRows += 1 row.copy() }.collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { numLeftRows += 1 var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } if (matched) { numOutputRows += 1 } matched }) } } }
Example 174
Source File: HashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val buildSide: BuildSide val left: SparkPlan val right: SparkPlan protected lazy val (buildPlan, streamedPlan) = buildSide match { case BuildLeft => (left, right) case BuildRight => (right, left) } protected lazy val (buildKeys, streamedKeys) = buildSide match { case BuildLeft => (leftKeys, rightKeys) case BuildRight => (rightKeys, leftKeys) } override def output: Seq[Attribute] = left.output ++ right.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def buildSideKeyGenerator: Projection = UnsafeProjection.create(buildKeys, buildPlan.output) protected def streamSideKeyGenerator: Projection = UnsafeProjection.create(streamedKeys, streamedPlan.output) protected def hashJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { new Iterator[InternalRow] { private[this] var currentStreamedRow: InternalRow = _ private[this] var currentHashMatches: Seq[InternalRow] = _ private[this] var currentMatchPosition: Int = -1 // Mutable per row objects. private[this] val joinRow = new JoinedRow private[this] val resultProjection: (InternalRow) => InternalRow = UnsafeProjection.create(self.schema) private[this] val joinKeys = streamSideKeyGenerator override final def hasNext: Boolean = (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) || (streamIter.hasNext && fetchNext()) override final def next(): InternalRow = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } currentMatchPosition += 1 numOutputRows += 1 resultProjection(ret) } private final def fetchNext(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamIter.hasNext) { currentStreamedRow = streamIter.next() numStreamRows += 1 val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashedRelation.get(key) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } } } }
Example 175
Source File: CartesianProduct.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitionsInternal { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
Example 176
Source File: JacksonGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils} import scala.collection.Map import com.fasterxml.jackson.core._ import org.apache.spark.sql.Row import org.apache.spark.sql.types._ private[sql] object JacksonGenerator { def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = { def valWriter: (DataType, Any) => Unit = { case (_, null) | (NullType, _) => gen.writeNull() case (StringType, v) => gen.writeString(v.toString) case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString) case (IntegerType, v: Int) => gen.writeNumber(v) case (ShortType, v: Short) => gen.writeNumber(v) case (FloatType, v: Float) => gen.writeNumber(v) case (DoubleType, v: Double) => gen.writeNumber(v) case (LongType, v: Long) => gen.writeNumber(v) case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) case (ByteType, v: Byte) => gen.writeNumber(v.toInt) case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) case (BooleanType, v: Boolean) => gen.writeBoolean(v) case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString) // For UDT values, they should be in the SQL type's corresponding value type. // We should not see values in the user-defined class at here. // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is // an ArrayData at here, instead of a Vector. case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v) case (ArrayType(ty, _), v: ArrayData) => gen.writeStartArray() v.foreach(ty, (_, value) => valWriter(ty, value)) gen.writeEndArray() case (MapType(kt, vt, _), v: MapData) => gen.writeStartObject() v.foreach(kt, vt, { (k, v) => gen.writeFieldName(k.toString) valWriter(vt, v) }) gen.writeEndObject() case (StructType(ty), v: InternalRow) => gen.writeStartObject() var i = 0 while (i < ty.length) { val field = ty(i) val value = v.get(i, field.dataType) if (value != null) { gen.writeFieldName(field.name) valWriter(field.dataType, value) } i += 1 } gen.writeEndObject() case (dt, v) => sys.error( s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") } valWriter(rowSchema, row) } }
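Usage note (a minimal sketch with a made-up schema and values; the object is private[sql], so this assumes it is called from within the org.apache.spark.sql package): one InternalRow is rendered to a JSON object field by field.

import java.io.StringWriter
import com.fasterxml.jackson.core.JsonFactory
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
val writer = new StringWriter()
val gen = new JsonFactory().createGenerator(writer)
val row = InternalRow(UTF8String.fromString("alice"), 30)

JacksonGenerator(schema, gen)(row)
gen.flush()
writer.toString   // {"name":"alice","age":30}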
Example 177
Source File: QueryExecution.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan lazy val toRdd: RDD[InternalRow] = executedPlan.execute() protected def stringOrError[A](f: => A): String = try f.toString catch { case e: Throwable => e.toString } def simpleString: String = { s"""== Physical Plan == |${stringOrError(executedPlan)} """.stripMargin.trim } override def toString: String = { def output = analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") s"""== Parsed Logical Plan == |${stringOrError(logical)} |== Analyzed Logical Plan == |${stringOrError(output)} |${stringOrError(analyzed)} |== Optimized Logical Plan == |${stringOrError(optimizedPlan)} |== Physical Plan == |${stringOrError(executedPlan)} """.stripMargin.trim } }
Example 178
Source File: rowFormatConverters.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule case class ConvertToSafe(child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputsUnsafeRows: Boolean = false override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val convertToSafe = FromUnsafeProjection(child.output.map(_.dataType)) iter.map(convertToSafe) } } } //private[sql] object EnsureRowFormats extends Rule[SparkPlan] { private def onlyHandlesSafeRows(operator: SparkPlan): Boolean = operator.canProcessSafeRows && !operator.canProcessUnsafeRows private def onlyHandlesUnsafeRows(operator: SparkPlan): Boolean = operator.canProcessUnsafeRows && !operator.canProcessSafeRows private def handlesBothSafeAndUnsafeRows(operator: SparkPlan): Boolean = operator.canProcessSafeRows && operator.canProcessUnsafeRows override def apply(operator: SparkPlan): SparkPlan = operator.transformUp { case operator: SparkPlan if onlyHandlesSafeRows(operator) => if (operator.children.exists(_.outputsUnsafeRows)) { operator.withNewChildren { operator.children.map { c => if (c.outputsUnsafeRows) ConvertToSafe(c) else c } } } else { operator } case operator: SparkPlan if onlyHandlesUnsafeRows(operator) => if (operator.children.exists(!_.outputsUnsafeRows)) { operator.withNewChildren { operator.children.map { c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c } } } else { operator } case operator: SparkPlan if handlesBothSafeAndUnsafeRows(operator) => if (operator.children.map(_.outputsUnsafeRows).toSet.size != 1) { // If this operator's children produce both unsafe and safe rows, // convert everything unsafe rows. operator.withNewChildren { operator.children.map { c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c } } } else { operator } } }
Example 179
Source File: Sort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} import org.apache.spark.sql.execution.metric.SQLMetrics case class Sort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends UnaryNode { override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil override private[sql] lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) protected override def doExecute(): RDD[InternalRow] = { val schema = child.schema val childOutput = child.output val dataSize = longMetric("dataSize") val spillSize = longMetric("spillSize") child.execute().mapPartitionsInternal { iter => val ordering = newOrdering(sortOrder, childOutput) // The comparator for comparing prefix val boundSortExpression = BindReferences.bindReference(sortOrder.head, childOutput) val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression) // The generator for prefix val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression))) val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = { prefixProjection.apply(row).getLong(0) } } val pageSize = SparkEnv.get.memoryManager.pageSizeBytes val sorter = new UnsafeExternalRowSorter( schema, ordering, prefixComparator, prefixComputer, pageSize) if (testSpillFrequency > 0) { sorter.setTestSpillFrequency(testSpillFrequency) } // Remember spill data size of this task before execute this operator so that we can // figure out how many bytes we spilled for this operator. val spillSizeBefore = TaskContext.get().taskMetrics().memoryBytesSpilled val sortedIterator = sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]]) dataSize += sorter.getPeakMemoryUsage spillSize += TaskContext.get().taskMetrics().memoryBytesSpilled - spillSizeBefore TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.getPeakMemoryUsage) sortedIterator } } }
Example 180
Source File: Expand.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} case class Expand( projections: Seq[Seq[Expression]], output: Seq[Attribute], child: SparkPlan) extends UnaryNode { // The GroupExpressions can output data with arbitrary partitioning, so set it // as UNKNOWN partitioning override def outputPartitioning: Partitioning = UnknownPartitioning(0) override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = true override def references: AttributeSet = AttributeSet(projections.flatten.flatMap(_.references)) private[this] val projection = { if (outputsUnsafeRows) { (exprs: Seq[Expression]) => UnsafeProjection.create(exprs, child.output) } else { (exprs: Seq[Expression]) => newMutableProjection(exprs, child.output)() } } protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { child.execute().mapPartitions { iter => val groups = projections.map(projection).toArray new Iterator[InternalRow] { private[this] var result: InternalRow = _ private[this] var idx = -1 // -1 means the initial state private[this] var input: InternalRow = _ override final def hasNext: Boolean = (-1 < idx && idx < groups.length) || iter.hasNext override final def next(): InternalRow = { if (idx <= 0) { // in the initial (-1) or beginning(0) of a new input row, fetch the next input tuple input = iter.next() idx = 0 } result = groups(idx)(input) idx += 1 if (idx == groups.length && iter.hasNext) { idx = 0 } result } } } } }
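Semantics note (a minimal sketch over plain collections): Expand emits every input row once per projection, which is how GROUPING SETS / ROLLUP style plans materialize their extra grouping combinations.

val input = Seq((1, "x"), (2, "y"))
val projections = Seq[((Int, String)) => Any](
  { case (a, b) => (a, b) },      // the original grouping
  { case (a, _) => (a, null) }    // the same row with the second column nulled out
)
val expanded = input.flatMap(row => projections.map(p => p(row)))
// Seq((1,"x"), (1,null), (2,"y"), (2,null))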
Example 181
Source File: NullableColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 182
Source File: CompressibleColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } protected def isWorthCompressing(encoder: Encoder[T]) = { encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { var i = 0 while (i < compressionEncoders.length) { compressionEncoders(i).gatherCompressibilityStats(row, ordinal) i += 1 } } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } }
Example 183
Source File: CompressionScheme.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.MutableRow import org.apache.spark.sql.execution.columnar.{ColumnType, NativeColumnType} import org.apache.spark.sql.types.AtomicType private[columnar] trait Encoder[T <: AtomicType] { def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {} def compressedSize: Int def uncompressedSize: Int def compressionRatio: Double = { if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0 } def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer } private[columnar] trait Decoder[T <: AtomicType] { def next(row: MutableRow, ordinal: Int): Unit def hasNext: Boolean } private[columnar] trait CompressionScheme { def typeId: Int def supports(columnType: ColumnType[_]): Boolean def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T] def decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T] } private[columnar] trait WithCompressionSchemes { def schemes: Seq[CompressionScheme] } private[columnar] trait AllCompressionSchemes extends WithCompressionSchemes { override val schemes: Seq[CompressionScheme] = CompressionScheme.all } private[columnar] object CompressionScheme { val all: Seq[CompressionScheme] = Seq(PassThrough, RunLengthEncoding, DictionaryEncoding, BooleanBitSet, IntDelta, LongDelta) private val typeIdToScheme = all.map(scheme => scheme.typeId -> scheme).toMap def apply(typeId: Int): CompressionScheme = { typeIdToScheme.getOrElse(typeId, throw new UnsupportedOperationException( s"Unrecognized compression scheme type ID: $typeId")) } def columnHeaderSize(columnBuffer: ByteBuffer): Int = { val header = columnBuffer.duplicate().order(ByteOrder.nativeOrder) val nullCount = header.getInt() // null count + null positions 4 + 4 * nullCount } }
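Layout note (a minimal sketch of the buffer layout columnHeaderSize assumes): a 4-byte null count, followed by one 4-byte null position per null, followed by the (possibly compressed) column data.

import java.nio.{ByteBuffer, ByteOrder}

val buf = ByteBuffer.allocate(64).order(ByteOrder.nativeOrder)
buf.putInt(2)   // null count
buf.putInt(0)   // position of the first null
buf.putInt(3)   // position of the second null
// ... column data follows ...
buf.rewind()
// CompressionScheme.columnHeaderSize(buf) would report 4 + 4 * 2 = 12 bytes here.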
Example 184
Source File: Generate.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ case class Generate( generator: Generator, join: Boolean, outer: Boolean, output: Seq[Attribute], child: SparkPlan) extends UnaryNode { val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[InternalRow] = { // boundGenerator.terminate() should be triggered after all of the rows in the partition if (join) { child.execute().mapPartitionsInternal { iter => val generatorNullRow = InternalRow.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null)) val joinedRow = new JoinedRow iter.flatMap { row => // we should always set the left (child output) joinedRow.withLeft(row) val outputRows = boundGenerator.eval(row) if (outer && outputRows.isEmpty) { joinedRow.withRight(generatorNullRow) :: Nil } else { outputRows.map(or => joinedRow.withRight(or)) } } ++ LazyIterator(() => boundGenerator.terminate()).map { row => // we leave the left side as the last element of its child output // keep it the same as Hive does joinedRow.withRight(row) } } } else { child.execute().mapPartitionsInternal { iter => iter.flatMap(row => boundGenerator.eval(row)) ++ LazyIterator(() => boundGenerator.terminate()) } } } }
Example 185
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericMutableRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } //private[sql] case class PhysicalRDD( output: Seq[Attribute], rdd: RDD[InternalRow], override val nodeName: String, override val metadata: Map[String, String] = Map.empty, override val outputsUnsafeRows: Boolean = false) extends LeafNode { protected override def doExecute(): RDD[InternalRow] = rdd override def simpleString: String = { val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value" s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}" } } private[sql] object PhysicalRDD { // Metadata keys val INPUT_PATHS = "InputPaths" val PUSHED_FILTERS = "PushedFilters" def createFromDataSource( output: Seq[Attribute], rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty): PhysicalRDD = { // All HadoopFsRelations output UnsafeRows val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation] PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows) } }
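Usage note (a minimal sketch; the case class and SparkContext are illustrative assumptions, and RDDConversions is package-private, so this assumes visibility from within org.apache.spark.sql): productToRowRdd takes one Catalyst DataType per Product element, in positional order.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RDDConversions
import org.apache.spark.sql.types.{IntegerType, StringType}

case class Person(name: String, age: Int)

def personsToInternalRows(sc: SparkContext): RDD[InternalRow] = {
  val people = sc.parallelize(Seq(Person("alice", 30), Person("bob", 25)))
  RDDConversions.productToRowRdd(people, Seq(StringType, IntegerType))
}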
Example 186
Source File: CoGroupedIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder, Attribute} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 187
Source File: DummyNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation private[local] case class DummyNode( output: Seq[Attribute], relation: LocalRelation, conf: SQLConf) extends LocalNode(conf) { import DummyNode._ private var index: Int = CLOSED private val input: Seq[InternalRow] = relation.data def this(output: Seq[Attribute], data: Seq[Product], conf: SQLConf = new SQLConf) { this(output, LocalRelation.fromProduct(output, data), conf) } def isOpen: Boolean = index != CLOSED override def children: Seq[LocalNode] = Seq.empty override def open(): Unit = { index = -1 } override def next(): Boolean = { index += 1 index < input.size } override def fetch(): InternalRow = { assert(index >= 0 && index < input.size) input(index) } override def close(): Unit = { index = CLOSED } } private object DummyNode { val CLOSED: Int = Int.MinValue }
Example 188
Source File: ReferenceSort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder }
Example 189
Source File: ColumnarTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericMutableRow} import org.apache.spark.sql.catalyst.util.{GenericArrayData, ArrayBasedMapData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
Example 190
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  override def children: Seq[SparkPlan] = Nil
}

object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    try {
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 191
Source File: DDLTestSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

class DDLScanSource extends RelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    SimpleDDLScan(parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext)
  }
}

case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(Seq(
      StructField("intType", IntegerType, nullable = false,
        new MetadataBuilder().putString("comment", s"test comment $table").build()),
      StructField("stringType", StringType, nullable = false),
      StructField("dateType", DateType, nullable = false),
      StructField("timestampType", TimestampType, nullable = false),
      StructField("doubleType", DoubleType, nullable = false),
      StructField("bigintType", LongType, nullable = false),
      StructField("tinyintType", ByteType, nullable = false),
      StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false),
      StructField("fixedDecimalType", DecimalType(5, 1), nullable = false),
      StructField("binaryType", BinaryType, nullable = false),
      StructField("booleanType", BooleanType, nullable = false),
      StructField("smallIntType", ShortType, nullable = false),
      StructField("floatType", FloatType, nullable = false),
      StructField("mapType", MapType(StringType, StringType)),
      StructField("arrayType", ArrayType(StringType)),
      StructField("structType",
        StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil)
      )
    ))

  override def needConversion: Boolean = false

  override def buildScan(): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    sqlContext.sparkContext.parallelize(from to to).map { e =>
      InternalRow(UTF8String.fromString(s"people$e"), e * 2)
    }.asInstanceOf[RDD[Row]]
  }
}

class DDLTestSuite extends DataSourceTest with SharedSQLContext {
  protected override lazy val sql = caseInsensitiveContext.sql _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql(
      """
        |CREATE TEMPORARY TABLE ddlPeople
        |USING org.apache.spark.sql.sources.DDLScanSource
        |OPTIONS (
        |  From '1',
        |  To '10',
        |  Table 'test1'
        |)
      """.stripMargin)
  }

  sqlTest(
    "describe ddlPeople",
    Seq(
      Row("intType", "int", "test comment test1"),
      Row("stringType", "string", ""),
      Row("dateType", "date", ""),
      Row("timestampType", "timestamp", ""),
      Row("doubleType", "double", ""),
      Row("bigintType", "bigint", ""),
      Row("tinyintType", "tinyint", ""),
      Row("decimalType", "decimal(10,0)", ""),
      Row("fixedDecimalType", "decimal(5,1)", ""),
      Row("binaryType", "binary", ""),
      Row("booleanType", "boolean", ""),
      Row("smallIntType", "smallint", ""),
      Row("floatType", "float", ""),
      Row("mapType", "map<string,string>", ""),
      Row("arrayType", "array<string>", ""),
      Row("structType", "struct<f1:string,f2:int>", "")
    ))

  test("SPARK-7686 DescribeCommand should have correct physical plan output attributes") {
    val attributes = sql("describe ddlPeople")
      .queryExecution.executedPlan.output
    assert(attributes.map(_.name) === Seq("col_name", "data_type", "comment"))
    assert(attributes.map(_.dataType).toSet === Set(StringType))
  }
}
Example 192
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy}
import org.apache.spark.unsafe.types.UTF8String

case class GIntervalRow(contigName: String, start: Int, end: Int)

class GenomicIntervalStrategy(spark: SparkSession) extends Strategy with Serializable {

  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case GenomicInterval(contigName, start, end, output) =>
      GenomicIntervalPlan(plan, spark, GIntervalRow(contigName, start, end), output) :: Nil
    case _ => Nil
  }
}

case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession,
                               interval: GIntervalRow, output: Seq[Attribute])
  extends SparkPlan with Serializable {

  def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = {
    import spark.implicits._

    lazy val genomicInterval = spark.createDataset(Seq(interval))
    genomicInterval
      .rdd
      .map(r => {
        val proj = UnsafeProjection.create(schema)
        proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName), r.start, r.end)))
      })
  }

  def children: Seq[SparkPlan] = Nil
}
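Like any extra Strategy, this rule only participates in planning once it is registered with the session's experimental methods (SeQuiLa normally does this during its own session setup). A minimal manual-registration sketch, assuming a local SparkSession:

// Sketch only: register the strategy so GenomicInterval logical nodes are planned by it.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("giSketch").getOrCreate()
spark.experimental.extraStrategies =
  new GenomicIntervalStrategy(spark) +: spark.experimental.extraStrategies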
Example 193
Source File: Pileup.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import htsjdk.samtools.SAMRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.{InternalParams, TableFuncs}
import org.seqdoop.hadoop_bam.CRAMBDGInputFormat
import org.slf4j.LoggerFactory

import scala.reflect.ClassTag

class Pileup[T <: BDGAlignInputFormat](spark: SparkSession)(implicit c: ClassTag[T])
  extends BDGAlignFileReaderWriter[T] {

  val logger = LoggerFactory.getLogger(this.getClass.getCanonicalName)

  def handlePileup(tableName: String, sampleId: String, refPath: String,
                   output: Seq[Attribute]): RDD[InternalRow] = {
    logger.info("Calculating pileup on table: {}", tableName)

    lazy val allAlignments = readTableFile(name = tableName, sampleId)

    if (logger.isDebugEnabled()) logger.debug("Processing {} reads in total", allAlignments.count())

    val alignments = filterAlignments(allAlignments)

    PileupMethods.calculatePileup(alignments, spark, refPath)
  }

  private def filterAlignments(alignments: RDD[SAMRecord]): RDD[SAMRecord] = {
    // any other filtering conditions should go here
    val filterFlag = spark.conf.get(InternalParams.filterReadsByFlag, "1796").toInt
    val cleaned = alignments.filter(read => read.getContig != null && (read.getFlags & filterFlag) == 0)
    if (logger.isDebugEnabled()) logger.debug("Processing {} cleaned reads in total", cleaned.count())
    cleaned
  }

  private def readTableFile(name: String, sampleId: String): RDD[SAMRecord] = {
    val metadata = TableFuncs.getTableMetadata(spark, name)
    val path = metadata.location.toString

    val samplePathTemplate = (
      path
        .split('/')
        .dropRight(1) ++ Array(s"$sampleId*.{{fileExtension}}"))
      .mkString("/")

    metadata.provider match {
      case Some(f) =>
        if (f == InputDataType.BAMInputDataType)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "bam"), refPath = None)
        else if (f == InputDataType.CRAMInputDataType) {
          val refPath = spark.sqlContext
            .sparkContext
            .hadoopConfiguration
            .get(CRAMBDGInputFormat.REFERENCE_SOURCE_PATH_PROPERTY)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "cram"), Some(refPath))
        }
        else throw new Exception("Only BAM and CRAM file formats are supported in bdg_coverage.")
      case None =>
        throw new Exception("Wrong file extension - only BAM and CRAM file formats are supported in bdg_coverage.")
    }
  }
}
Example 194
Source File: PileupStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{PileupTemplate, SparkSession, Strategy}
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.TableFuncs
import org.seqdoop.hadoop_bam.{BAMBDGInputFormat, CRAMBDGInputFormat}

import scala.reflect.ClassTag

class PileupStrategy(spark: SparkSession) extends Strategy with Serializable {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
    plan match {
      case PileupTemplate(tableName, sampleId, refPath, output) =>
        val inputFormat = TableFuncs.getTableMetadata(spark, tableName).provider
        inputFormat match {
          case Some(f) =>
            if (f == InputDataType.BAMInputDataType)
              PileupPlan[BAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else if (f == InputDataType.CRAMInputDataType)
              PileupPlan[CRAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else Nil
          case None =>
            throw new RuntimeException("Only BAM and CRAM file formats are supported in pileup function.")
        }
      case _ => Nil
    }
  }
}

case class PileupPlan[T <: BDGAlignInputFormat](plan: LogicalPlan,
                                                spark: SparkSession,
                                                tableName: String,
                                                sampleId: String,
                                                refPath: String,
                                                output: Seq[Attribute])(implicit c: ClassTag[T])
  extends SparkPlan with Serializable with BDGAlignFileReaderWriter[T] {

  override def children: Seq[SparkPlan] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    new Pileup(spark).handlePileup(tableName, sampleId, refPath, output)
  }
}
Example 195
Source File: PileupMethods.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import htsjdk.samtools.SAMRecord
import org.apache.spark.rdd.MetricsContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.biodatageeks.sequila.pileup.model.{Reference, _}
import org.biodatageeks.sequila.pileup.timers.PileupTimers._
import org.biodatageeks.sequila.utils.InternalParams
import org.slf4j.{Logger, LoggerFactory}

import AggregateRDDOperations.implicits._
import AlignmentsRDDOperations.implicits._

// The enclosing object declaration is truncated in this excerpt; it is restored here
// based on the file name and the PileupMethods.calculatePileup call in PileupPlan above.
object PileupMethods {

  def calculatePileup(alignments: RDD[SAMRecord], spark: SparkSession, refPath: String): RDD[InternalRow] = {
    Reference.init(refPath)

    val enableInstrumentation = spark
      .sqlContext
      .getConf(InternalParams.EnableInstrumentation).toBoolean
    val alignmentsInstr = if (enableInstrumentation) alignments.instrument() else alignments

    val aggregates = ContigAggrTimer.time {
      alignmentsInstr.assembleContigAggregates()
        .persist(StorageLevel.MEMORY_AND_DISK) // FIXME: Add automatic unpersist
    }
    val accumulator = AccumulatorTimer.time { aggregates.accumulateTails(spark) }

    val broadcast = BroadcastTimer.time {
      spark.sparkContext.broadcast(accumulator.value().prepareOverlaps())
    }
    val adjustedEvents = AdjustedEventsTimer.time { aggregates.adjustWithOverlaps(broadcast) }
    val pileup = EventsToPileupTimer.time { adjustedEvents.toPileup }
    pileup
  }
}
Example 196
Source File: NCListsJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{SparkPlan, _}

@DeveloperApi
case class NCListsJoin(left: SparkPlan,
                       right: SparkPlan,
                       condition: Seq[Expression],
                       context: SparkSession) extends BinaryExecNode {

  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)),
    List(condition(2), condition(3)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v2 = right.execute()

    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy())
    })
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (l: InternalRow, r: InternalRow) =>
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] // resultProj(joinedRow(l, r)) joiner.joiner
      }
    } else {
      val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (r: InternalRow, l: InternalRow) =>
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] // resultProj(joinedRow(l, r)) joiner.joiner
      }
    }
  }
}
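Both branches stitch the matching row pairs together with GenerateUnsafeRowJoiner, which code-generates a joiner for a fixed pair of schemas. A tiny standalone sketch of what such a joiner does; the schemas and rows below are made up for illustration:

// Sketch only: concatenate two UnsafeRows with a generated joiner.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.types._

val leftSchema  = StructType(Seq(StructField("start", IntegerType), StructField("end", IntegerType)))
val rightSchema = StructType(Seq(StructField("score", IntegerType)))

val leftRow  = UnsafeProjection.create(leftSchema).apply(InternalRow(100, 200))
val rightRow = UnsafeProjection.create(rightSchema).apply(InternalRow(42))

val joiner = GenerateUnsafeRowJoiner.create(leftSchema, rightSchema)
val joined = joiner.join(leftRow, rightRow)   // a 3-field UnsafeRow: start, end, score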
Example 197
Source File: NCListsJoinImpl.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.rdd.MetricsContext._
import org.biodatageeks.sequila.rangejoins.common.performance.timers.NCListTimer._

object NCListsJoinImpl extends Serializable {

  // The overlapJoin signature below is inferred from the call site in NCListsJoin (Example 196);
  // localIntervals and indexedRdd1 are built from rdd1 in a part of the method that is
  // truncated in this excerpt.
  def overlapJoin(
      sc: SparkContext,
      rdd1: RDD[(Interval[Int], InternalRow)],
      rdd2: RDD[(Interval[Int], InternalRow)]): RDD[(InternalRow, Iterable[InternalRow])] = {

    val nclist = NCListBuild.time { sc.broadcast(new NCListTree[Int](localIntervals)) }

    val kvrdd2: RDD[(Int, Iterable[InternalRow])] = rdd2
      .instrument()
      // join entry with the intervals returned from the interval tree
      .map(x => (NCListLookup.time { nclist.value.getAllOverlappings(x._1) }, x._2))
      .filter(x => x._1 != Nil) // filter out entries that do not join anywhere
      .flatMap(t => t._1.map(s => (s._2, t._2))) // create pairs of (index1, rdd2Elem)
      .groupByKey

    indexedRdd1 // this is RDD[(Int, (Interval[Int], Row))]
      .map(x => (x._1.toInt, x._2._2)) // convert it to (Int, Row)
      .join(kvrdd2) // join produces RDD[(Int, (Row, Iterable[Row]))]
      .map(_._2) // end up with RDD[(Row, Iterable[Row])]
  }
}
Example 198
Source File: NCListsJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.NCList

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{SparkPlan, _}
import org.biodatageeks.sequila.rangejoins.NCList.{Interval, NCListsJoinImpl}

@DeveloperApi
case class NCListsJoinChromosome(left: SparkPlan,
                                 right: SparkPlan,
                                 condition: Seq[Expression],
                                 context: SparkSession) extends BinaryExecNode {

  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1), condition(4)),
    List(condition(2), condition(3), condition(5)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v2 = right.execute()

    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      ((v1Key.getString(2), new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy())
    })
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      ((v2Key.getString(2), new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
      v3.mapPartitions(p => {
        // create the generated joiner once per partition and reuse it for every row pair
        val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
        p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow]))
      })
    } else {
      val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
      v3.mapPartitions(p => {
        val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema)
        p.map(r => joiner.join(r._2.asInstanceOf[UnsafeRow], r._1.asInstanceOf[UnsafeRow]))
      })
    }
  }
}
Example 199
Source File: NCListsJoinChromosomeImpl.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.NCList

import org.apache.spark.SparkContext
import org.apache.spark.rdd.MetricsContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.biodatageeks.sequila.rangejoins.NCList.{Interval, NCListTree}
import org.biodatageeks.sequila.rangejoins.common.performance.timers.IntervalTreeTimer.IntervalTreeHTSLookup
import org.biodatageeks.sequila.rangejoins.common.performance.timers.NCListTimer._
import org.biodatageeks.sequila.rangejoins.methods.IntervalTree.IntervalTreeJoinOptimChromosomeImpl.calcOverlap

import scala.collection.immutable.Stream.Empty

object NCListsJoinChromosomeImpl extends Serializable {

  // The overlapJoin signature below is inferred from the call site in NCListsJoinChromosome
  // (Example 198); localIntervals and indexedRdd1 are built from rdd1 in a part of the method
  // that is truncated in this excerpt.
  def overlapJoin(
      sc: SparkContext,
      rdd1: RDD[((String, Interval[Int]), InternalRow)],
      rdd2: RDD[((String, Interval[Int]), InternalRow)]): RDD[(InternalRow, InternalRow)] = {

    val nclist = NCListBuild.time { sc.broadcast(new NCListTreeChromosome[Long](localIntervals)) }

    val joinedRDD = rdd2
      .instrument()
      .mapPartitions(p => {
        p.map(r => {
          IntervalTreeHTSLookup.time {
            val ncl = nclist.value.getAllOverlappings(r._1)
            if (ncl != Nil) {
              ncl
                .map(k => (k, r._2))
                .toIterator
            } else Iterator.empty
          }
        })
      })
      .flatMap(r => r)

    indexedRdd1
      .map(r => (r._1, r._2._2))
      .join(joinedRDD.map(r => (r._1._2, r._2)))
      .map(r => r._2)
  }
}
Example 200
Source File: IntervalTreeHTSChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.IntervalTree

import org.apache.spark.sql.catalyst.InternalRow
import org.biodatageeks.sequila.rangejoins.IntervalTree.{Interval, IntervalWithRow}

class IntervalTreeHTSChromosome[T](allRegions: Array[(String, Interval[Int], T)]) extends Serializable {

  val intervalTreeHashMap: Map[String, IntervalTreeHTS[T]] = {
    allRegions
      .groupBy(_._1)
      .map(x => {
        val it = new IntervalTreeHTS[T]()
        x._2.foreach { y =>
          it.put(y._2.start, y._2.end, y._3)
        }
        (x._1, it)
      })
  }

  def getIntervalTreeByChromosome(chr: String): Option[IntervalTreeHTS[T]] = intervalTreeHashMap.get(chr)
}
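A minimal usage sketch for the per-chromosome lookup above. The region payloads and interval bounds are made up for illustration, and the query on the returned tree is left as a comment because IntervalTreeHTS's own overlap API is not shown in this excerpt:

// Sketch only: build one interval tree per chromosome and fetch the tree for "chr1".
import org.biodatageeks.sequila.rangejoins.IntervalTree.Interval

val regions: Array[(String, Interval[Int], String)] = Array(
  ("chr1", new Interval[Int](100, 200), "geneA"),
  ("chr1", new Interval[Int](150, 300), "geneB"),
  ("chr2", new Interval[Int](50, 80), "geneC"))

val byChromosome = new IntervalTreeHTSChromosome[String](regions)

byChromosome.getIntervalTreeByChromosome("chr1") match {
  case Some(tree) => // query `tree` using IntervalTreeHTS's overlap API
  case None       => // no regions registered for this chromosome
}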