org.apache.spark.sql.catalyst.InternalRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.InternalRow.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, Path} import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.io.compress.GzipCodec import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat} import org.apache.hadoop.util.ReflectionUtils import org.apache.spark.TaskContext import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} import org.apache.spark.sql.catalyst.util.CompressionCodecs import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.util.SerializableConfiguration def getCompressionExtension(context: TaskAttemptContext): String = { // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile if (FileOutputFormat.getCompressOutput(context)) { val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec]) ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension } else { "" } } }
Example 2
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 3
Source File: AvroDataToCatalyst.scala From spark-schema-registry with Apache License 2.0 | 6 votes |
package com.hortonworks.spark.registry.avro import java.io.ByteArrayInputStream import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey} import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer import org.apache.avro.Schema import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{BinaryType, DataType} import scala.collection.JavaConverters._ case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int], config: Map[String, Object]) extends UnaryExpression with ExpectsInputTypes { override def inputTypes = Seq(BinaryType) @transient private lazy val srDeser: AvroSnapshotDeserializer = { val obj = new AvroSnapshotDeserializer() obj.init(config.asJava) obj } @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version) @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText) override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType @transient private lazy val avroDeser= new AvroDeserializer(avroSchema, dataType) override def nullable: Boolean = true override def nullSafeEval(input: Any): Any = { val binary = input.asInstanceOf[Array[Byte]] val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion)) val result = row match { case r: InternalRow => r.copy() case _ => row } result } override def simpleString: String = { s"from_sr(${child.sql}, ${dataType.simpleString})" } override def sql: String = { s"from_sr(${child.sql}, ${dataType.catalogString})" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val expr = ctx.addReferenceObj("this", this) defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)") } private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = { val srClient = new SchemaRegistryClient(config.asJava) version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v))) .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName)) } }
Example 4
Source File: KustoCsvSerializationUtils.scala From azure-kusto-spark with Apache License 2.0 | 6 votes |
package com.microsoft.kusto.spark.datasink import java.util.TimeZone import com.microsoft.kusto.spark.utils.DataTypeMapping import org.apache.commons.lang3.time.FastDateFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types.DataTypes._ import org.apache.spark.sql.types.StructType private[kusto] class KustoCsvSerializationUtils (val schema: StructType, timeZone: String){ private[kusto] val dateFormat = FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", TimeZone.getTimeZone(timeZone)) private[kusto] def convertRow(row: InternalRow) = { val values = new Array[String](row.numFields) for (i <- 0 until row.numFields if !row.isNullAt(i)) { val dataType = schema.fields(i).dataType values(i) = dataType match { case DateType => DateTimeUtils.toJavaDate(row.getInt(i)).toString case TimestampType => dateFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(i))) case _ => row.get(i, dataType).toString } } values } } private[kusto] object KustoCsvMapper { import org.apache.spark.sql.types.StructType import org.json def createCsvMapping(schema: StructType): String = { val csvMapping = new json.JSONArray() for (i <- 0 until schema.length) { val field = schema.apply(i) val dataType = field.dataType val mapping = new json.JSONObject() mapping.put("Name", field.name) mapping.put("Ordinal", i) mapping.put("DataType", DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(dataType, StringType)) csvMapping.put(mapping) } csvMapping.toString } }
Example 5
Source File: ArrowEvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.arrow.ArrowUtils import org.apache.spark.sql.types.StructType case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { private val batchSize = conf.arrowMaxRecordsPerBatch private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pythonRunnerConf = ArrowUtils.getPythonRunnerConfMap(conf) protected override def evaluate( funcs: Seq[ChainedPythonFunctions], argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) val columnarBatchIter = new ArrowPythonRunner( funcs, PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pythonRunnerConf).compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty } override def hasNext: Boolean = currentIter.hasNext || { if (columnarBatchIter.hasNext) { currentIter = columnarBatchIter.next().rowIterator.asScala hasNext } else { false } } override def next(): InternalRow = currentIter.next() } } }
Example 6
Source File: BatchEvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StructField, StructType} case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { protected override def evaluate( funcs: Seq[ChainedPythonFunctions], argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { EvaluatePython.registerPicklers() // register pickler for Row val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) // enable memo iff we serialize the row with schema (schema and class should be memorized) val pickle = new Pickler(needConversion) // Input iterator to Python: input rows are grouped so we send them in batches to Python. // For each row, add it to the queue. val inputIterator = iter.map { row => if (needConversion) { EvaluatePython.toJava(row, schema) } else { // fast path for these types that does not need conversion in Python val fields = new Array[Any](row.numFields) var i = 0 while (i < row.numFields) { val dt = dataTypes(i) fields(i) = EvaluatePython.toJava(row.get(i, dt), dt) i += 1 } fields } }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. val outputIterator = new PythonUDFRunner(funcs, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) val unpickle = new Unpickler val mutableRow = new GenericInternalRow(1) val resultType = if (udfs.length == 1) { udfs.head.dataType } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } val fromJava = EvaluatePython.makeFromJava(resultType) outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF mutableRow(0) = fromJava(result) mutableRow } else { fromJava(result).asInstanceOf[InternalRow] } } } }
Example 7
Source File: subquery.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{BooleanType, DataType, StructType} case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] { def apply(plan: SparkPlan): SparkPlan = { if (!conf.exchangeReuseEnabled) { return plan } // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls. val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]() plan transformAllExpressions { case sub: ExecSubqueryExpression => val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]()) val sameResult = sameSchema.find(_.sameResult(sub.plan)) if (sameResult.isDefined) { sub.withNewPlan(sameResult.get) } else { sameSchema += sub.plan sub } } } }
Example 8
Source File: BoundOrdering.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Projection private[window] final case class RangeBoundOrdering( ordering: Ordering[InternalRow], current: Projection, bound: Projection) extends BoundOrdering { override def compare( inputRow: InternalRow, inputIndex: Int, outputRow: InternalRow, outputIndex: Int): Int = ordering.compare(current(inputRow), bound(outputRow)) }
Example 9
Source File: NullableColumnAccessor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position() + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 10
Source File: NullableColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 11
Source File: CompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 12
Source File: CompressibleColumnAccessor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor} import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor { this: NativeColumnAccessor[T] => private var decoder: Decoder[T] = _ abstract override protected def initialize(): Unit = { super.initialize() decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType) } abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext override def extractSingle(row: InternalRow, ordinal: Int): Unit = { decoder.next(row, ordinal) } def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = decoder.decompress(columnVector, capacity) }
Example 13
Source File: GroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, Expression, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.{GenerateOrdering, GenerateUnsafeProjection} object GroupedIterator { def apply( input: Iterator[InternalRow], keyExpressions: Seq[Expression], inputSchema: Seq[Attribute]): Iterator[(InternalRow, Iterator[InternalRow])] = { if (input.hasNext) { new GroupedIterator(input.buffered, keyExpressions, inputSchema) } else { Iterator.empty } } } def hasNext: Boolean = currentIterator != null || fetchNextGroupIterator def next(): (InternalRow, Iterator[InternalRow]) = { assert(hasNext) // Ensure we have fetched the next iterator. val ret = (keyProjection(currentGroup), currentIterator) currentIterator = null ret } private def fetchNextGroupIterator(): Boolean = { assert(currentIterator == null) if (currentRow == null && input.hasNext) { currentRow = input.next() } if (currentRow == null) { // These is no data left, return false. false } else { // Skip to next group. // currentRow may be overwritten by `hasNext`, so we should compare them first. while (keyOrdering.compare(currentGroup, currentRow) == 0 && input.hasNext) { currentRow = input.next() } if (keyOrdering.compare(currentGroup, currentRow) == 0) { // We are in the last group, there is no more groups, return false. false } else { // Now the `currentRow` is the first row of next group. currentGroup = currentRow.copy() currentIterator = createGroupValuesIterator() true } } } private def createGroupValuesIterator(): Iterator[InternalRow] = { new Iterator[InternalRow] { def hasNext: Boolean = currentRow != null || fetchNextRowInGroup() def next(): InternalRow = { assert(hasNext) val res = currentRow currentRow = null res } private def fetchNextRowInGroup(): Boolean = { assert(currentRow == null) if (input.hasNext) { // The inner iterator should NOT consume the input into next group, here we use `head` to // peek the next input, to see if we should continue to process it. if (keyOrdering.compare(currentGroup, input.head) == 0) { // Next input is in the current group. Continue the inner iterator. currentRow = input.next() true } else { // Next input is not in the right group. End this inner iterator. false } } else { // There is no more data, return false. false } } } } }
Example 14
Source File: ExistingRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Encoder, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericInternalRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } case class RDDScanExec( output: Seq[Attribute], rdd: RDD[InternalRow], name: String, override val outputPartitioning: Partitioning = UnknownPartitioning(0), override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode { private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("") override val nodeName: String = s"Scan $name$rddName" override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } override def simpleString: String = { s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}" } }
Example 15
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
Example 16
Source File: ContinuousWriteRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark.{Partition, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.util.Utils class ContinuousWriteRDD(var prev: RDD[InternalRow], writeTask: DataWriterFactory[InternalRow]) extends RDD[Unit](prev) { override val partitioner = prev.partitioner override def getPartitions: Array[Partition] = prev.partitions override def compute(split: Partition, context: TaskContext): Iterator[Unit] = { val epochCoordinator = EpochCoordinatorRef.get( context.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), SparkEnv.get) EpochTracker.initializeCurrentEpoch( context.getLocalProperty(ContinuousExecution.START_EPOCH_KEY).toLong) while (!context.isInterrupted() && !context.isCompleted()) { var dataWriter: DataWriter[InternalRow] = null // write the data and commit this writer. Utils.tryWithSafeFinallyAndFailureCallbacks(block = { try { val dataIterator = prev.compute(split, context) dataWriter = writeTask.createDataWriter( context.partitionId(), context.taskAttemptId(), EpochTracker.getCurrentEpoch.get) while (dataIterator.hasNext) { dataWriter.write(dataIterator.next()) } logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} is committing.") val msg = dataWriter.commit() epochCoordinator.send( CommitPartitionEpoch( context.partitionId(), EpochTracker.getCurrentEpoch.get, msg) ) logInfo(s"Writer for partition ${context.partitionId()} " + s"in epoch ${EpochTracker.getCurrentEpoch.get} committed.") EpochTracker.incrementCurrentEpoch() } catch { case _: InterruptedException => // Continuous shutdown always involves an interrupt. Just finish the task. } })(catchBlock = { // If there is an error, abort this writer. We enter this callback in the middle of // rethrowing an exception, so compute() will stop executing at this point. logError(s"Writer for partition ${context.partitionId()} is aborting.") if (dataWriter != null) dataWriter.abort() logError(s"Writer for partition ${context.partitionId()} aborted.") }) } Iterator() } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 17
Source File: ContinuousCoalesceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import java.util.UUID import org.apache.spark.{HashPartitioner, SparkEnv} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeRow} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.continuous.shuffle.{ContinuousShuffleReadPartition, ContinuousShuffleReadRDD} case class ContinuousCoalesceExec(numPartitions: Int, child: SparkPlan) extends SparkPlan { override def output: Seq[Attribute] = child.output override def children: Seq[SparkPlan] = child :: Nil override def outputPartitioning: Partitioning = SinglePartition override def doExecute(): RDD[InternalRow] = { assert(numPartitions == 1) new ContinuousCoalesceRDD( sparkContext, numPartitions, conf.continuousStreamingExecutorQueueSize, sparkContext.getLocalProperty(ContinuousExecution.EPOCH_INTERVAL_KEY).toLong, child.execute()) } }
Example 18
Source File: ContinuousDataSourceRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousInputPartitionReader import org.apache.spark.util.NextIterator class ContinuousDataSourceRDDPartition( val index: Int, val inputPartition: InputPartition[InternalRow]) extends Partition with Serializable { // This is semantically a lazy val - it's initialized once the first time a call to // ContinuousDataSourceRDD.compute() needs to access it, so it can be shared across // all compute() calls for a partition. This ensures that one compute() picks up where the // previous one ended. // We don't make it actually a lazy val because it needs input which isn't available here. // This will only be initialized on the executors. private[continuous] var queueReader: ContinuousQueuedDataReader = _ } override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { // If attempt number isn't 0, this is a task retry, which we don't support. if (context.attemptNumber() != 0) { throw new ContinuousTaskRetryException() } val readerForPartition = { val partition = split.asInstanceOf[ContinuousDataSourceRDDPartition] if (partition.queueReader == null) { partition.queueReader = new ContinuousQueuedDataReader(partition, context, dataQueueSize, epochPollIntervalMs) } partition.queueReader } new NextIterator[InternalRow] { override def getNext(): InternalRow = { readerForPartition.next() match { case null => finished = true null case row => row } } override def close(): Unit = {} } } override def getPreferredLocations(split: Partition): Seq[String] = { split.asInstanceOf[ContinuousDataSourceRDDPartition].inputPartition.preferredLocations() } } object ContinuousDataSourceRDD { private[continuous] def getContinuousReader( reader: InputPartitionReader[InternalRow]): ContinuousInputPartitionReader[_] = { reader match { case r: ContinuousInputPartitionReader[InternalRow] => r case _ => throw new IllegalStateException(s"Unknown continuous reader type ${reader.getClass}") } } }
Example 19
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 20
Source File: ConsoleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get def createWriterFactory(): DataWriterFactory[InternalRow] = PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 // behavior. printRows(messages, schema, s"Batch: $epochId") } def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println println("-------------------------------------------") println(printMessage) println("-------------------------------------------") // scalastyle:off println Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows)) .show(numRowsToShow, isTruncated) } override def toString(): String = { s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } }
Example 21
Source File: ContinuousRecordEndpoint.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.SparkEnv import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.reader.streaming.PartitionOffset case class ContinuousRecordPartitionOffset(partitionId: Int, offset: Int) extends PartitionOffset case class GetRecord(offset: ContinuousRecordPartitionOffset) override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case GetRecord(ContinuousRecordPartitionOffset(partitionId, offset)) => lock.synchronized { val bufOffset = offset - startOffsets(partitionId) val buf = buckets(partitionId) val record = if (buf.size <= bufOffset) None else Some(buf(bufOffset)) context.reply(record.map(InternalRow(_))) } } }
Example 22
Source File: StreamingRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics} import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2} object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source, session: SparkSession): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes)(session) } }
Example 23
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
Example 24
Source File: CoGroupedIterator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 25
Source File: ReferenceSort.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 26
Source File: ColumnarTestUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 27
Source File: MemorySinkV2Suite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.scalatest.BeforeAndAfter import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.streaming.sources._ import org.apache.spark.sql.streaming.{OutputMode, StreamTest} import org.apache.spark.sql.types.StructType class MemorySinkV2Suite extends StreamTest with BeforeAndAfter { test("data writer") { val partition = 1234 val writer = new MemoryDataWriter( partition, OutputMode.Append(), new StructType().add("i", "int")) writer.write(InternalRow(1)) writer.write(InternalRow(2)) writer.write(InternalRow(44)) val msg = writer.commit() assert(msg.data.map(_.getInt(0)) == Seq(1, 2, 44)) assert(msg.partition == partition) // Buffer should be cleared, so repeated commits should give empty. assert(writer.commit().data.isEmpty) } test("streaming writer") { val sink = new MemorySinkV2 val writeSupport = new MemoryStreamWriter( sink, OutputMode.Append(), new StructType().add("i", "int")) writeSupport.commit(0, Array( MemoryWriterCommitMessage(0, Seq(Row(1), Row(2))), MemoryWriterCommitMessage(1, Seq(Row(3), Row(4))), MemoryWriterCommitMessage(2, Seq(Row(6), Row(7))) )) assert(sink.latestBatchId.contains(0)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7)) writeSupport.commit(19, Array( MemoryWriterCommitMessage(3, Seq(Row(11), Row(22))), MemoryWriterCommitMessage(0, Seq(Row(33))) )) assert(sink.latestBatchId.contains(19)) assert(sink.latestBatchData.map(_.getInt(0)).sorted == Seq(11, 22, 33)) assert(sink.allData.map(_.getInt(0)).sorted == Seq(1, 2, 3, 4, 6, 7, 11, 22, 33)) } }
Example 28
Source File: ExtraStrategiesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
Example 29
Source File: DataSourceTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String private[sql] abstract class DataSourceTest extends QueryTest { protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], enableRegex: Boolean = false) { test(sqlString) { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> enableRegex.toString) { checkAnswer(spark.sql(sqlString), expectedAnswer) } } } } class DDLScanSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { SimpleDDLScan( parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext.sparkSession) } } case class SimpleDDLScan( from: Int, to: Int, table: String)(@transient val sparkSession: SparkSession) extends BaseRelation with TableScan { override def sqlContext: SQLContext = sparkSession.sqlContext override def schema: StructType = StructType(Seq( StructField("intType", IntegerType, nullable = false).withComment(s"test comment $table"), StructField("stringType", StringType, nullable = false), StructField("dateType", DateType, nullable = false), StructField("timestampType", TimestampType, nullable = false), StructField("doubleType", DoubleType, nullable = false), StructField("bigintType", LongType, nullable = false), StructField("tinyintType", ByteType, nullable = false), StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false), StructField("fixedDecimalType", DecimalType(5, 1), nullable = false), StructField("binaryType", BinaryType, nullable = false), StructField("booleanType", BooleanType, nullable = false), StructField("smallIntType", ShortType, nullable = false), StructField("floatType", FloatType, nullable = false), StructField("mapType", MapType(StringType, StringType)), StructField("arrayType", ArrayType(StringType)), StructField("structType", StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil ) ) )) override def needConversion: Boolean = false override def buildScan(): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] sparkSession.sparkContext.parallelize(from to to).map { e => InternalRow(UTF8String.fromString(s"people$e"), e * 2) }.asInstanceOf[RDD[Row]] } }
Example 30
Source File: HierarchyRowFunctions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.types.Node private[hierarchy] case class HierarchyRowFunctions(inputTypes: Seq[DataType]) { private[hierarchy] def rowGet[K](i: Int): Row => K = (row: Row) => row.getAs[K](i) private[hierarchy] def rowInit[K](pk: Row => K, pathDataType: DataType): (Row, Option[Long]) => Row = { (row, myOrdKey) => myOrdKey match { case Some(x) => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType, ordPath = List(x))): _*) case None => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType)): _*) } } private[hierarchy] def rowModifyAndOrder[K](pk: Row => K, pathDataType: DataType): (Row, Row, Option[Long]) => Row = { (left, right, myord) => { val pathComponent: K = pk(right) // TODO(weidner): is myNode a ref/ptr or a copy of node?: val myNode: Node = left.getAs[Node](left.length - 1) val path: Seq[Any] = myNode.path ++ List(pathComponent) var node: Node = null // Node(path, ordPath = myOrdPath) myord match { case Some(ord) => val parentOrdPath = myNode.ordPath match { case x: Seq[Long] => x case _ => List() } node = Node(path, pathDataType, ordPath = parentOrdPath ++ List(ord)) case None => node = Node(path, pathDataType) } Row(right.toSeq :+ node: _*) } } private[hierarchy] def rowModify[K](pk: Row => K, pathDataType: DataType): (Row, Row) => Row = { (left, right) => val pathComponent: K = pk(right) val path: Seq[Any] = left.getAs[Node](left.length - 1).path ++ List(pathComponent) val node: Node = Node(path, pathDataType) Row(right.toSeq :+ node: _*) } private[hierarchy] def rowAppend[K](row: Row, node: Node): Row = { Row(row.toSeq :+ node: _*) } private[hierarchy] def rowStartWhere[K](exp: Expression): Row => Boolean = { row => val numColumns = inputTypes.length val converters = inputTypes.map(CatalystTypeConverters.createToCatalystConverter) val values = Stream.from(0).takeWhile(_ < numColumns).map({ i => converters(i)(row(i)) }) val newRow = InternalRow.fromSeq(values) exp.eval(newRow).asInstanceOf[Boolean] } private[hierarchy] def bindExpression(exp: Expression, attributes: Seq[Attribute]) : Expression = exp.transform { case a: AttributeReference => val index = attributes.indexWhere(_.name == a.name) BoundReference(index, a.dataType, a.nullable) } }
Example 31
Source File: ERPCurrencyConversionExpression.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.erp import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes} import org.apache.spark.sql.currency.CurrencyConversionException import org.apache.spark.sql.currency.erp.ERPConversionLoader.RConversionOptionsCurried import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.util.control.NonFatal case class ERPCurrencyConversionExpression( conversionFunction: RConversionOptionsCurried, children: Seq[Expression]) extends Expression with ImplicitCastInputTypes with CodegenFallback { protected val CLIENT_INDEX = 0 protected val CONVERSION_TYPE_INDEX = 1 protected val AMOUNT_INDEX = 2 protected val FROM_INDEX = 3 protected val TO_INDEX = 4 protected val DATE_INDEX = 5 protected val NUM_ARGS = 6 protected val errorMessage = "Currency conversion library encountered an internal error" override def eval(input: InternalRow): Any = { val inputArguments = children.map(_.eval(input)) require(inputArguments.length == NUM_ARGS, "wrong number of arguments") // parse arguments val client = Option(inputArguments(CLIENT_INDEX).asInstanceOf[UTF8String]).map(_.toString) val conversionType = Option(inputArguments(CONVERSION_TYPE_INDEX).asInstanceOf[UTF8String]).map(_.toString) val amount = Option(inputArguments(AMOUNT_INDEX).asInstanceOf[Decimal].toJavaBigDecimal) val sourceCurrency = Option(inputArguments(FROM_INDEX).asInstanceOf[UTF8String]).map(_.toString) val targetCurrency = Option(inputArguments(TO_INDEX).asInstanceOf[UTF8String]).map(_.toString) val date = Option(inputArguments(DATE_INDEX).asInstanceOf[UTF8String]).map(_.toString) // perform conversion val conversion = conversionFunction(client, conversionType, sourceCurrency, targetCurrency, date) val resultTry = conversion(amount) // If 'resultTry' holds a 'Failure', we have to propagate it because potential failure // handling already took place. We just wrap it in case it is a cryptic error. resultTry.recover { case NonFatal(err) => throw new CurrencyConversionException(errorMessage, err) }.get.map(Decimal.apply).orNull } override def dataType: DataType = DecimalType.forType(DoubleType) override def nullable: Boolean = true override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, DecimalType, StringType, StringType, StringType) def inputNames: Seq[String] = Seq("client", "conversion_type", "amount", "source", "target", "date") def getChild(name: String): Option[Expression] = { inputNames.zip(children).find { case (n, _) => name == n }.map(_._2) } }
Example 32
Source File: NodeType.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import java.sql.Date import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.unsafe.types.UTF8String class NodeType extends UserDefinedType[Node] { override val sqlType = StructType(Seq( StructField("path", ArrayType(StringType, containsNull = false), nullable = false), StructField("dataType", StringType, nullable = false), StructField("preRank", IntegerType, nullable = true), StructField("postRank", IntegerType, nullable = true), StructField("isLeaf", BooleanType, nullable = true), StructField("ordPath", ArrayType(LongType, containsNull=false), nullable = true) )) override def serialize(obj: Any): Any = obj match { case node: Node => InternalRow(new GenericArrayData(node.path.map { case null => null case p => UTF8String.fromString(p.toString) }), UTF8String.fromString(node.pathDataTypeJson), node.preRank, node.postRank, node.isLeaf, if (node.ordPath == null){ node.ordPath } else { new GenericArrayData(node.ordPath) }) case _ => throw new UnsupportedOperationException(s"Cannot serialize ${obj.getClass}") } // scalastyle:off cyclomatic.complexity override def deserialize(datum: Any): Node = datum match { case row: InternalRow => { val stringArray = row.getArray(0).toArray[UTF8String](StringType).map { case null => null case somethingElse => somethingElse.toString } val readDataTypeString: String = row.getString(1) val readDataType: DataType = DataType.fromJson(readDataTypeString) val path: Seq[Any] = readDataType match { case StringType => stringArray case LongType => stringArray.map(v => if (v != null) v.toLong else null) case IntegerType => stringArray.map(v => if (v != null) v.toInt else null) case DoubleType => stringArray.map(v => if (v != null) v.toDouble else null) case FloatType => stringArray.map(v => if (v != null) v.toFloat else null) case ByteType => stringArray.map(v => if (v != null) v.toByte else null) case BooleanType => stringArray.map(v => if (v != null) v.toBoolean else null) case TimestampType => stringArray.map(v => if (v != null) v.toLong else null) case dt: DataType => sys.error(s"Type $dt not supported for hierarchy path") } val preRank: Integer = if (row.isNullAt(2)) null else row.getInt(2) val postRank: Integer = if (row.isNullAt(3)) null else row.getInt(3) // scalastyle:off magic.number val isLeaf: java.lang.Boolean = if (row.isNullAt(4)) null else row.getBoolean(4) val ordPath: Seq[Long] = if (row.isNullAt(5)) null else row.getArray(5).toLongArray() // scalastyle:on magic.number Node( path, readDataTypeString, preRank, postRank, isLeaf, ordPath ) } case node: Node => node case _ => throw new UnsupportedOperationException(s"Cannot deserialize ${datum.getClass}") } // scalastyle:on override def userClass: java.lang.Class[Node] = classOf[Node] } case object NodeType extends NodeType
Example 33
Source File: AnnotationFilter.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.analysis.{UnresolvedException, UnresolvedAttribute} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.{InternalRow, trees} import org.apache.spark.sql.types._ case class AnnotationFilter(child: Expression)( val filters: Set[String] = Set.empty, val exprId: ExprId = NamedExpression.newExprId) extends UnaryExpression with NamedExpression with CodegenFallback { override def name: String = child match { case e:NamedExpression => e.name case _ => throw new UnresolvedException(this, "name of AnnotationFilter with non-named child") } override lazy val resolved = childrenResolved override def toAttribute: Attribute = { if (resolved) { child.transform ({ case a:Alias => a.copy(a.child, a.name)(a.exprId, qualifiers = a.qualifiers, explicitMetadata = Some(MetadataAccessor.filterMetadata(a.metadata, filters))) case a:AttributeReference => a.copy(a.name, a.dataType, a.nullable, metadata = MetadataAccessor.filterMetadata(a.metadata, filters))(a.exprId, a.qualifiers) case p => p }) match { case e: NamedExpression => e.toAttribute case _ => throw new UnresolvedException(this, "toAttribute of AnnotationFilter with " + "no-named child") } } else { UnresolvedAttribute(name) } } override def equals(other: Any): Boolean = other match { case aa: AnnotationFilter => child == aa.child && filters == aa.filters && exprId == aa.exprId case _ => false } // scalastyle:off magic.number override def hashCode:Int = { List[Int](child.hashCode, filters.hashCode, exprId.hashCode) .foldLeft(17)((l, r) => 31 * l + r) } override def metadata: Metadata = { child match { case named: NamedExpression => MetadataAccessor.filterMetadata(named.metadata, filters) case _ => Metadata.empty } } override def qualifiers: Seq[String] = Nil override def eval(input: InternalRow): Any = child.eval(input) override def nullable: Boolean = child.nullable override def dataType: DataType = child.dataType override protected final def otherCopyArgs: Seq[AnyRef] = filters :: exprId :: Nil }
Example 34
Source File: stringExpressions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ case class Replace(se: Expression, fe: Expression, pe: Expression) extends TernaryExpression with ImplicitCastInputTypes with CodegenFallback { override def inputTypes: Seq[AbstractDataType] = Seq.fill(3)(StringType) override def eval(input: InternalRow): Any = { val s = se.eval(input).asInstanceOf[UTF8String] val f = fe.eval(input).asInstanceOf[UTF8String] val p = pe.eval(input).asInstanceOf[UTF8String] (s, f, p) match { case (null, _, _) | (_, null, _) | (null, null, _) => null case (stre, strf, null) => UTF8String.fromString(stre.toString() .replaceAllLiterally(strf.toString(), "")) case (stre, strf, strp) => UTF8String.fromString(stre.toString() .replaceAllLiterally(strf.toString(), strp.toString())) case _ => sys.error(s"Unexpected input") } } override def nullable: Boolean = se.nullable override def dataType: DataType = StringType override def children: Seq[Expression] = se :: fe :: pe :: Nil }
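A minimal sketch of evaluating the Replace expression above directly, assuming Catalyst's Literal helper (string literals evaluate to UTF8String, so the casts inside eval line up):

import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

// replace every occurrence of "na" with "-"
val replaced = Replace(Literal("ananas"), Literal("na"), Literal("-"))
  .eval(null).asInstanceOf[UTF8String]
// replaced.toString == "a--s"

// a null replacement string strips the occurrences instead
val stripped = Replace(Literal("ananas"), Literal("na"), Literal(null, StringType))
  .eval(null).asInstanceOf[UTF8String]
// stripped.toString == "as"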
Example 35
Source File: GlobalSapSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import java.io.File import com.sap.spark.util.TestUtils import com.sap.spark.{GlobalSparkContext, WithSQLContext} import org.apache.spark.SparkContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast} import org.apache.spark.unsafe.types._ import org.apache.spark.sql.types._ import org.scalatest.Suite import scala.io.Source trait GlobalSapSQLContext extends GlobalSparkContext with WithSQLContext { self: Suite => override implicit def sqlContext: SQLContext = GlobalSapSQLContext._sqlc override protected def setUpSQLContext(): Unit = GlobalSapSQLContext.init(sc) override protected def tearDownSQLContext(): Unit = GlobalSapSQLContext.reset() def getDataFrameFromSourceFile(sparkSchema: StructType, path: File): DataFrame = { val conversions = sparkSchema.toSeq.zipWithIndex.map({ case (field, index) => Cast(BoundReference(index, StringType, nullable = true), field.dataType) }) val data = Source.fromFile(path) .getLines() .map({ line => val stringRow = InternalRow.fromSeq(line.split(",", -1).map(UTF8String.fromString)) Row.fromSeq(conversions.map({ c => c.eval(stringRow) })) }) val rdd = sc.parallelize(data.toSeq, numberOfSparkWorkers) sqlContext.createDataFrame(rdd, sparkSchema) } } object GlobalSapSQLContext { private var _sqlc: SQLContext = _ private def init(sc: SparkContext): Unit = if (_sqlc == null) { _sqlc = TestUtils.newSQLContext(sc) } private def reset(): Unit = { if (_sqlc != null) { _sqlc.catalog.unregisterAllTables() } } }
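getDataFrameFromSourceFile relies on a small Catalyst trick: every CSV field is read as a UTF8String, wrapped in an InternalRow, and then Cast(BoundReference(...), targetType) is evaluated per column. A minimal sketch of that per-column step (the two-argument Cast shown here is the classic form; newer Spark versions add an optional time-zone parameter):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, Cast}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val stringRow = InternalRow.fromSeq(Seq("42", "3.14").map(UTF8String.fromString))

val toInt    = Cast(BoundReference(0, StringType, nullable = true), IntegerType)
val toDouble = Cast(BoundReference(1, StringType, nullable = true), DoubleType)

// toInt.eval(stringRow) == 42; toDouble.eval(stringRow) == 3.14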
Example 36
Source File: ExpressionEvalHelper.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions // // Partially backported from Spark 1.5.2. // import org.apache.spark.sql.extension.OptimizerFactoryForTests import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.FunSuite import org.scalatest.prop.GeneratorDrivenPropertyChecks // scalastyle:off case _ => } expression.eval(inputRow) } protected def generateProject( generator: => Projection, expression: Expression): Projection = { try { generator } catch { case e: Throwable => fail( s""" |Code generation of $expression failed: |$e |${e.getStackTraceString} """.stripMargin) } } protected def checkEvaluationWithoutCodegen( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val actual = try evaluate(expression, inputRow) catch { case e: Exception => fail(s"Exception evaluating $expression", e) } if (!checkResult(actual, expected)) { val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect evaluation (codegen off): $expression, " + s"actual: $actual, " + s"expected: $expected$input") } } protected def checkEvaluationWithOptimization( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation) val optimizedPlan = OptimizerFactoryForTests.default().execute(plan) checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow) } }
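The listing above is abridged (the evaluate and checkResult helpers of the original file are not shown), but the entry points it does show are typically driven from a test suite. A hypothetical sketch, assuming the full helper trait is mixed into a FunSuite:

import org.apache.spark.sql.catalyst.expressions.{Add, Literal}

// evaluate the expression interpreted, then again after the optimizer has run
checkEvaluationWithoutCodegen(Add(Literal(1), Literal(2)), expected = 3)
checkEvaluationWithOptimization(Add(Literal(1), Literal(2)), expected = 3)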
Example 37
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.math.BigDecimal import java.sql.Date import java.sql.Timestamp import java.text.DateFormat import java.text.SimpleDateFormat import java.util.Calendar import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.NullWritable import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.mapreduce.RecordWriter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow } import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil import org.zuinnote.hadoop.office.format.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import java.util.Locale import java.text.DecimalFormat import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO import java.text.NumberFormat // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[excel] class ExcelOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext, options: Map[String, String]) extends OutputWriter { def write(row: Row): Unit = { // check useHeader if (useHeader) { val headers = row.schema.fieldNames var i = 0 for (x <- headers) { val headerColumnSCD = new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName) recordWriter.write(NullWritable.get(), headerColumnSCD) i += 1 } currentRowNum += 1 useHeader = false } // for each value in the row if (row.size>0) { var currentColumnNum = 0; val simpleObject = new Array[AnyRef](row.size) for (i <- 0 to row.size - 1) { // for each element of the row val obj = row.get(i) if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length==5)) { val formattedValue = obj.asInstanceOf[Seq[String]](0) val comment = obj.asInstanceOf[Seq[String]](1) val formula = obj.asInstanceOf[Seq[String]](2) val address = obj.asInstanceOf[Seq[String]](3) val sheetName = obj.asInstanceOf[Seq[String]](4) simpleObject(i) = new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName) } else { simpleObject(i)=obj.asInstanceOf[AnyRef] } } // convert row to spreadsheetcellDAO val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum) // write it for (x<- spreadSheetCellDAORow) { recordWriter.write(NullWritable.get(), x) } } currentRowNum += 1 } override def close(): Unit = { recordWriter.close(context) currentRowNum = 0; } }
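The write method above accepts two kinds of cells: a plain value handed to the simple converter, or a Seq of exactly five strings that is turned into a SpreadSheetCellDAO in the order formattedValue, comment, formula, address, sheetName. A hypothetical row mixing both forms (writer stands in for an ExcelOutputWriter instance and is an assumption here):

import org.apache.spark.sql.Row

val row = Row(
  42,                                              // plain cell, handled by the simple converter
  Seq("1000.5", "a comment", "", "B1", "Sheet1"))  // fully specified spreadsheet cell

// writer.write(row)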
Example 38
Source File: ZOrderCurveUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan.BoundingBox import magellan.index.ZOrderCurve import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class ZOrderCurveUDT extends UserDefinedType[ZOrderCurve] { override def sqlType: DataType = StructType( Seq( StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("precision", IntegerType, nullable = false), StructField("bits", LongType, nullable = false) )) override def serialize(obj: ZOrderCurve): Any = { val row = new GenericInternalRow(6) val BoundingBox(xmin, ymin, xmax, ymax) = obj.boundingBox row.setDouble(0, xmin) row.setDouble(1, ymin) row.setDouble(2, xmax) row.setDouble(3, ymax) row.setInt(4, obj.precision) row.setLong(5, obj.bits) row } override def deserialize(datum: Any): ZOrderCurve = { val row = datum.asInstanceOf[InternalRow] val boundingBox = BoundingBox(row.getDouble(0), row.getDouble(1), row.getDouble(2), row.getDouble(3)) new ZOrderCurve(boundingBox, row.getInt(4), row.getLong(5)) } override def userClass: Class[ZOrderCurve] = classOf[ZOrderCurve] }
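A minimal round-trip sketch for the UDT above, assuming ZOrderCurve and BoundingBox can be constructed exactly the way deserialize builds them:

import magellan.BoundingBox
import magellan.index.ZOrderCurve
import org.apache.spark.sql.types.ZOrderCurveUDT

val udt = new ZOrderCurveUDT
val curve = new ZOrderCurve(BoundingBox(-180.0, -90.0, 180.0, 90.0), 5, 21L)

// serialize packs the bounding box, precision and bits into a 6-field row
val restored = udt.deserialize(udt.serialize(curve))
// restored.precision == 5 && restored.bits == 21L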
Example 39
Source File: PointUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PointUDT extends UserDefinedType[Point] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("x", DoubleType, nullable = false), StructField("y", DoubleType, nullable = false) )) override def serialize(point: Point): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, point.getType()) row.setDouble(1, point.getX()) row.setDouble(2, point.getY()) row.setDouble(3, point.getX()) row.setDouble(4, point.getY()) row.setDouble(5, point.getX()) row.setDouble(6, point.getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Point]) override def userClass: Class[Point] = classOf[Point] override def deserialize(datum: Any): Point = { val row = datum.asInstanceOf[InternalRow] require(row.numFields == 7) Point(row.getDouble(5), row.getDouble(6)) } override def pyUDT: String = "magellan.types.PointUDT" def serialize(x: Double, y: Double): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, 1) row.setDouble(1, x) row.setDouble(2, y) row.setDouble(3, x) row.setDouble(4, y) row.setDouble(5, x) row.setDouble(6, y) row } override val geometryType = new Point().getType() }
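The extra serialize(x, y) overload above builds the 7-field row without allocating a Point first, while deserialize only reads fields 5 and 6 back. A minimal round-trip sketch:

import org.apache.spark.sql.types.PointUDT

val udt = new PointUDT
val row = udt.serialize(-122.3, 47.6)   // x, y
val p = udt.deserialize(row)
// p.getX() == -122.3 && p.getY() == 47.6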
Example 40
Source File: LineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class LineUDT extends UserDefinedType[Line] with GeometricUDT { override def sqlType: DataType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("startX", DoubleType, nullable = false), StructField("startY", DoubleType, nullable = false), StructField("endX", DoubleType, nullable = false), StructField("endY", DoubleType, nullable = false) )) override def serialize(line: Line): InternalRow = { val row = new GenericInternalRow(9) row.setInt(0, 2) val BoundingBox(xmin, ymin, xmax, ymax) = line.boundingBox row.setDouble(1, xmin) row.setDouble(2, ymin) row.setDouble(3, xmax) row.setDouble(4, ymax) row.setDouble(5, line.getStart().getX()) row.setDouble(6, line.getStart().getY()) row.setDouble(7, line.getEnd().getX()) row.setDouble(8, line.getEnd().getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Line]) override def userClass: Class[Line] = classOf[Line] override def deserialize(datum: Any): Line = { val row = datum.asInstanceOf[InternalRow] val startX = row.getDouble(5) val startY = row.getDouble(6) val endX = row.getDouble(7) val endY = row.getDouble(8) val line = new Line() val start = Point(startX, startY) val end = Point(endX, endY) line.setStart(start) line.setEnd(end) line } override def pyUDT: String = "magellan.types.LineUDT" override val geometryType = new Line().getType() }
Example 41
Source File: PolygonUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolygonUDT extends UserDefinedType[Polygon] with GeometricUDT { override val sqlType: StructType = StructType(Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polygon: Polygon): InternalRow = { polygon.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Polygon]) override def userClass: Class[Polygon] = classOf[Polygon] override def deserialize(datum: Any): Polygon = { val row = datum.asInstanceOf[InternalRow] val polygon = new Polygon() polygon.init(row) polygon } override val geometryType = new Polygon().getType() }
Example 42
Source File: PolyLineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolyLineUDT extends UserDefinedType[PolyLine] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polyLine: PolyLine): InternalRow = { polyLine.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[PolyLine]) override def userClass: Class[PolyLine] = classOf[PolyLine] override def deserialize(datum: Any): PolyLine = { val row = datum.asInstanceOf[InternalRow] val polyline = new PolyLine() polyline.init(row) polyline } override def pyUDT: String = "magellan.types.PolyLineUDT" override val geometryType = new PolyLine().getType() }
Example 43
Source File: serdes.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import magellan._ import magellan.catalyst.MagellanExpression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.types._ case class MagellanSerializer( override val child: Expression, _dataType: DataType) extends UnaryExpression with MagellanExpression with CodegenFallback with NonSQLExpression { override def nullable: Boolean = false override protected def nullSafeEval(input: Any): Any = { val shape = input.asInstanceOf[Shape] serialize(shape) } override def dataType: DataType = _dataType } case class MagellanDeserializer( override val child: Expression, klass: Class[_ <: Shape]) extends UnaryExpression with MagellanExpression with CodegenFallback with NonSQLExpression { override def nullable: Boolean = false override protected def nullSafeEval(input: Any): Any = { newInstance(input.asInstanceOf[InternalRow]) } override def dataType: DataType = ObjectType(klass) }
Example 44
Source File: MagellanExpression.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ trait MagellanExpression { private val SERIALIZERS = Map( new Point().getType() -> new PointUDT, new Line().getType() -> new LineUDT, new PolyLine().getType() -> new PolyLineUDT, new Polygon().getType() -> new PolygonUDT) def newInstance(row: InternalRow): Shape = { SERIALIZERS.get(row.getInt(0)).fold(NullShape.asInstanceOf[Shape])(_.deserialize(row)) } def serialize(shape: Shape): Any = { SERIALIZERS.get(shape.getType()).get.serialize(shape) } def sqlType(klass: Class[_ <: Shape]): DataType = { SERIALIZERS.get(klass.newInstance().getType()).get } }
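Because the SERIALIZERS map is keyed by the shape's type code (field 0 of the serialized row), newInstance can dispatch to the right UDT without knowing the concrete class up front. A minimal sketch, using a hypothetical object only to get access to the trait's methods:

import magellan.Point
import org.apache.spark.sql.catalyst.InternalRow

object ShapeCodec extends MagellanExpression   // hypothetical name

val serialized = ShapeCodec.serialize(Point(1.0, 2.0)).asInstanceOf[InternalRow]
val restored = ShapeCodec.newInstance(serialized)   // dispatches on serialized.getInt(0)
// restored.isInstanceOf[Point]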
Example 45
Source File: TransformerSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan.TestingUtils._ import magellan.{MockPointExpr, Point, TestSparkContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Transformer} import org.apache.spark.sql.magellan.dsl.expressions._ import org.scalatest.FunSuite class TransformerSuite extends FunSuite with TestSparkContext { test("transform") { val sqlCtx = this.sqlContext val path = this.getClass.getClassLoader.getResource("testpoint/").getPath val df = sqlCtx.read.format("magellan").load(path) import sqlCtx.implicits._ val dbl = (x: Point) => Point(2 * x.getX(), 2 * x.getY()) val point = df.withColumn("transformed", $"point".transform(dbl)) .select($"transformed") .first()(0).asInstanceOf[Point] assert(point.getX() ~== -199.0 absTol 1.0) } test("eval: transform") { val fn = (p: Point) => Point(2 * p.getX(), 2 * p.getY()) val expr = Transformer(MockPointExpr(Point(1.0, 2.0)), fn) val result = expr.eval(null).asInstanceOf[InternalRow] // skip the type assert(result.getDouble(1) === 2.0) assert(result.getDouble(2) === 4.0) } }
Example 46
Source File: ColumnarShuffledHashJoinExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import java.util.concurrent.TimeUnit._ import com.intel.sparkColumnarPlugin.vectorized._ import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, CodegenSupport, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} import scala.collection.mutable.ListBuffer import org.apache.arrow.vector.ipc.message.ArrowFieldNode import org.apache.arrow.vector.ipc.message.ArrowRecordBatch import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.arrow.vector.types.pojo.Schema import org.apache.arrow.gandiva.expression._ import org.apache.arrow.gandiva.evaluator._ import io.netty.buffer.ArrowBuf import com.google.common.collect.Lists; import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized.ExpressionEvaluator import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide} class ColumnarShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends ShuffledHashJoinExec( leftKeys, rightKeys, joinType, buildSide, condition, left, right) { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "joinTime" -> SQLMetrics.createTimingMetric(sparkContext, "join time"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def supportsColumnar = true //TODO() Disable code generation //override def supportCodegen: Boolean = false override def doExecuteColumnar(): RDD[ColumnarBatch] = { val numOutputRows = longMetric("numOutputRows") val joinTime = longMetric("joinTime") val buildTime = longMetric("buildTime") val resultSchema = this.schema streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) { (streamIter, buildIter) => //val hashed = buildHashedRelation(buildIter) //join(streamIter, hashed, numOutputRows) val vjoin = ColumnarShuffledHashJoin.create(leftKeys, rightKeys, resultSchema, joinType, buildSide, condition, left, right, buildTime, joinTime, numOutputRows) val vjoinResult = vjoin.columnarInnerJoin(streamIter, buildIter) TaskContext.get().addTaskCompletionListener[Unit](_ => { vjoin.close() }) new CloseableColumnBatchIterator(vjoinResult) } } }
Example 47
Source File: ColumnarSortExec.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.execution import com.intel.sparkColumnarPlugin.expression._ import com.intel.sparkColumnarPlugin.vectorized._ import java.util.concurrent.TimeUnit._ import org.apache.spark.{SparkEnv, TaskContext, SparkContext} import org.apache.spark.executor.TaskMetrics import org.apache.spark.sql.execution._ import org.apache.spark.sql.catalyst.expressions.SortOrder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.rdd.RDD import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector} class ColumnarSortExec( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends SortExec(sortOrder, global, child, testSpillFrequency) { override def supportsColumnar = true // Disable code generation override def supportCodegen: Boolean = false override lazy val metrics = Map( "totalSortTime" -> SQLMetrics .createTimingMetric(sparkContext, "time in sort + shuffle process"), "sortTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in sort process"), "shuffleTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in shuffle process"), "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches")) override def doExecuteColumnar(): RDD[ColumnarBatch] = { val elapse = longMetric("totalSortTime") val sortTime = longMetric("sortTime") val shuffleTime = longMetric("shuffleTime") val numOutputRows = longMetric("numOutputRows") val numOutputBatches = longMetric("numOutputBatches") child.executeColumnar().mapPartitions { iter => val hasInput = iter.hasNext val res = if (!hasInput) { Iterator.empty } else { val sorter = ColumnarSorter.create( sortOrder, true, child.output, sortTime, numOutputBatches, numOutputRows, shuffleTime, elapse) TaskContext .get() .addTaskCompletionListener[Unit](_ => { sorter.close() }) new CloseableColumnBatchIterator(sorter.createColumnarIterator(iter)) } res } } }
Example 48
Source File: ColumnarSubquery.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.expression import org.apache.arrow.gandiva.evaluator._ import org.apache.arrow.gandiva.exceptions.GandivaException import org.apache.arrow.gandiva.expression._ import org.apache.arrow.vector.types.pojo.ArrowType import org.apache.arrow.vector.types.pojo.Field import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.{expressions, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.execution.BaseSubqueryExec import org.apache.spark.sql.execution.ExecSubqueryExpression import org.apache.spark.sql.execution.ScalarSubquery import org.apache.spark.sql.types._ import scala.collection.mutable.ListBuffer class ColumnarScalarSubquery( query: ScalarSubquery) extends Expression with ColumnarExpression { override def dataType: DataType = query.dataType override def children: Seq[Expression] = Nil override def nullable: Boolean = true override def toString: String = query.toString override def eval(input: InternalRow): Any = query.eval(input) override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = query.doGenCode(ctx, ev) override def canEqual(that: Any): Boolean = query.canEqual(that) override def productArity: Int = query.productArity override def productElement(n: Int): Any = query.productElement(n) override def doColumnarCodeGen(args: java.lang.Object): (TreeNode, ArrowType) = { val value = query.eval(null) val resultType = CodeGeneration.getResultType(query.dataType) query.dataType match { case t: StringType => (TreeBuilder.makeStringLiteral(value.toString().asInstanceOf[String]), resultType) case t: IntegerType => (TreeBuilder.makeLiteral(value.asInstanceOf[Integer]), resultType) case t: LongType => (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Long]), resultType) case t: DoubleType => (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Double]), resultType) case d: DecimalType => val v = value.asInstanceOf[Decimal] (TreeBuilder.makeDecimalLiteral(v.toString, v.precision, v.scale), resultType) case d: DateType => throw new UnsupportedOperationException(s"DateType is not supported yet.") } } }
Example 49
Source File: OapOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OutputWriter, WriteResult} import org.apache.spark.sql.execution.datasources.oap.io.OapDataWriter import org.apache.spark.sql.types.StructType private[oap] class OapOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext) extends OutputWriter { private var rowCount = 0 private var partitionString: String = "" override def setPartitionString(ps: String): Unit = { partitionString = ps } private val writer: OapDataWriter = { val isCompressed = FileOutputFormat.getCompressOutput(context) val conf = context.getConfiguration val file: Path = new Path(path) val fs = file.getFileSystem(conf) val fileOut = fs.create(file, false) new OapDataWriter(isCompressed, fileOut, dataSchema, conf) } override def write(row: InternalRow): Unit = { rowCount += 1 writer.write(row) } override def close(): Unit = { writer.close() } override def writeStatus(): WriteResult = { OapWriteResult(dataFileName, rowCount, partitionString) } def dataFileName: String = new Path(path).getName }
Example 50
Source File: OapIndexOutputWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter // TODO: parameter name "path" is ambiguous private[index] class OapIndexOutputWriter( path: String, context: TaskAttemptContext ) extends OutputWriter { private val outputFormat = new OapIndexOutputFormat() { override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = { val outputPath = FileOutputFormat.getOutputPath(context) val configuration = ContextUtil.getConfiguration(context) IndexUtils.generateTempIndexFilePath( configuration, inputFileName, outputPath, path, extension) } } private var recordWriter: RecordWriter[Void, InternalRow] = _ private var inputFileName: String = _ private var rowCount: Long = 0 override def write(row: InternalRow): Unit = { checkStartOfNewFile() recordWriter.write(null, row) rowCount += 1 } override def close(): Unit = { closeWriter() } private def initWriter(): Unit = { inputFileName = InputFileNameHolderAdapter.getInputFileName().toString recordWriter = outputFormat.getRecordWriter(context) rowCount = 0 } private def closeWriter(): Unit = { if (recordWriter != null) { recordWriter.close(context) recordWriter = null } } private def checkStartOfNewFile(): Unit = { if (inputFileName != InputFileNameHolderAdapter.getInputFileName().toString) { closeWriter() initWriter() } } }
Example 51
Source File: OapIndexOutputFormat.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.parquet.format.CompressionCodec import org.apache.parquet.hadoop.util.ContextUtil import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.OapException import org.apache.spark.sql.execution.datasources.oap.index.OapIndexProperties.IndexVersion import org.apache.spark.sql.internal.oap.OapConf import org.apache.spark.sql.types.StructType private[index] class OapIndexOutputFormat extends FileOutputFormat[Void, InternalRow] { private val BTREE_WRITER_VERSION = OapConf.OAP_INDEX_BTREE_WRITER_VERSION.key private def getCodec(taskAttemptContext: TaskAttemptContext): CompressionCodec = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) CompressionCodec.valueOf( configuration.get( OapConf.OAP_INDEX_BTREE_COMPRESSION.key, OapConf.OAP_INDEX_BTREE_COMPRESSION.defaultValueString).toUpperCase) } private def getWriterVersion(taskAttemptContext: TaskAttemptContext) = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) val indexVersion = configuration.get(BTREE_WRITER_VERSION, OapIndexProperties.DEFAULT_WRITER_VERSION.toString) IndexVersion.fromString(indexVersion) } override def getRecordWriter( taskAttemptContext: TaskAttemptContext): RecordWriter[Void, InternalRow] = { val configuration = ContextUtil.getConfiguration(taskAttemptContext) def canBeSkipped(file: Path): Boolean = { val isAppend = configuration.get(OapIndexFileFormat.IS_APPEND).toBoolean if (isAppend) { val target = new Path(FileOutputFormat.getOutputPath(taskAttemptContext), file.getName) target.getFileSystem(configuration).exists(target) } else { false } } val codec = getCodec(taskAttemptContext) val writerVersion = getWriterVersion(taskAttemptContext) val extension = "." + configuration.get(OapIndexFileFormat.INDEX_TIME) + "." + configuration.get(OapIndexFileFormat.INDEX_NAME) + ".index" val file = getDefaultWorkFile(taskAttemptContext, extension) val schema = StructType.fromString(configuration.get(OapIndexFileFormat.ROW_SCHEMA)) val indexType = configuration.get(OapIndexFileFormat.INDEX_TYPE, "") if (canBeSkipped(file)) { new DummyIndexRecordWriter() } else if (indexType == "BTREE") { BTreeIndexRecordWriter(configuration, file, schema, codec, writerVersion) } else if (indexType == "BITMAP") { val writer = file.getFileSystem(configuration).create(file, true) new BitmapIndexRecordWriter(configuration, writer, schema) } else { throw new OapException("Unknown Index Type: " + indexType) } } }
Example 52
Source File: package.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering import org.apache.spark.sql.execution.datasources.oap.io.ColumnStatistics import org.apache.spark.sql.execution.datasources.oap.utils.OapUtils import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StructField, StructType} package object oap { type Key = InternalRow def order(sf: StructField): Ordering[Key] = GenerateOrdering.create(StructType(Array(sf))) // Return if the rowGroup or file can be skipped by min max statistics def isSkippedByStatistics( columnStats: Array[ColumnStatistics], filter: Filter, schema: StructType): Boolean = filter match { case Or(left, right) => isSkippedByStatistics(columnStats, left, schema) && isSkippedByStatistics(columnStats, right, schema) case And(left, right) => isSkippedByStatistics(columnStats, left, schema) || isSkippedByStatistics(columnStats, right, schema) case IsNotNull(attribute) => val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) !stat.hasNonNullValue case EqualTo(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) (OapUtils.keyFromBytes(stat.min, schema(idx).dataType), OapUtils.keyFromBytes( stat.max, schema(idx).dataType)) match { case (Some(v1), Some(v2)) => comp.gt(v1, key) || comp.lt(v2, key) case _ => false } case LessThan(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match { case Some(v) => comp.gteq(v, key) case None => false } case LessThanOrEqual(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.min, schema(idx).dataType) match { case Some(v) => comp.gt(v, key) case None => false } case GreaterThan(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match { case Some(v) => comp.lteq(v, key) case None => false } case GreaterThanOrEqual(attribute, handle) => val key = OapUtils.keyFromAny(handle) val idx = schema.fieldIndex(attribute) val stat = columnStats(idx) val comp = order(schema(idx)) OapUtils.keyFromBytes(stat.max, schema(idx).dataType) match { case Some(v) => comp.lt(v, key) case None => false } case _ => false } } class OapException(message: String, cause: Throwable) extends Exception(message, cause) { def this(message: String) = this(message, null) }
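The pruning logic above compares a filter key against per-column min/max statistics using a generated ordering over a single-field schema. A minimal sketch of the EqualTo branch, assuming integer statistics:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val comp = GenerateOrdering.create(StructType(Array(StructField("a", IntegerType))))

val min = InternalRow(10)
val max = InternalRow(90)
val key = InternalRow(5)

// EqualTo can skip the group when the key falls outside [min, max]
val skipped = comp.gt(min, key) || comp.lt(max, key)   // true here, since 5 < 10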
Example 53
Source File: OapDataReader.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.hadoop.fs.FSDataInputStream import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{OapException, PartitionedFile} import org.apache.spark.sql.execution.datasources.oap.INDEX_STAT._ import org.apache.spark.sql.execution.datasources.oap.OapFileFormat import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion import org.apache.spark.sql.execution.datasources.oap.io.OapDataFileProperties.DataFileVersion.DataFileVersion import org.apache.spark.unsafe.types.UTF8String abstract class OapDataReader { def read(file: PartitionedFile): Iterator[InternalRow] // The two following fields have to be defined by certain versions of OapDataReader for use in // [[OapMetricsManager]] def rowsReadByIndex: Option[Long] def indexStat: INDEX_STAT } object OapDataReader extends Logging { def readVersion(is: FSDataInputStream, fileLen: Long): DataFileVersion = { val MAGIC_VERSION_LENGTH = 4 val metaEnd = fileLen - 4 // seek to the position of data file meta length is.seek(metaEnd) val metaLength = is.readInt() // read all bytes of data file meta val magicBuffer = new Array[Byte](MAGIC_VERSION_LENGTH) is.readFully(metaEnd - metaLength, magicBuffer) val magic = UTF8String.fromBytes(magicBuffer).toString magic match { case m if ! m.contains("OAP") => throw new OapException("Not a valid Oap Data File") case m if m == "OAP1" => DataFileVersion.OAP_DATAFILE_V1 case _ => throw new OapException("Not a supported Oap Data File version") } } def getDataFileClassFor(dataReaderClassFromDataSourceMeta: String, reader: OapDataReader): String = { dataReaderClassFromDataSourceMeta match { case c if c == OapFileFormat.PARQUET_DATA_FILE_CLASSNAME => c case c if c == OapFileFormat.ORC_DATA_FILE_CLASSNAME => c case c if c == OapFileFormat.OAP_DATA_FILE_CLASSNAME => reader match { case r: OapDataReaderV1 => OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME case _ => throw new OapException(s"Undefined connection for $reader") } case _ => throw new OapException( s"Undefined data reader class name $dataReaderClassFromDataSourceMeta") } } }
Example 54
Source File: OapIndexWriteTaskStatsTracker.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.util.concurrent.atomic.LongAdder import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.index.IndexBuildResult import org.apache.spark.sql.oap.adapter.InputFileNameHolderAdapter case class IndexWriteTaskStats(writeStatus: Seq[IndexBuildResult]) extends WriteTaskStats class OapIndexWriteTaskStatsTracker extends WriteTaskStatsTracker with Logging { private[this] var curInputFileName: String = _ private[this] var statusMap: Map[String, LongAdder] = Map.empty[String, LongAdder] override def newPartition(partitionValues: InternalRow): Unit = { // currently unhandled } override def newBucket(bucketId: Int): Unit = { // currently unhandled } override def newFile(filePath: String): Unit = { // currently unhandled } override def newRow(row: InternalRow): Unit = { val inputFileName = InputFileNameHolderAdapter.getInputFileName().toString if (curInputFileName != inputFileName) { curInputFileName = inputFileName statusMap = statusMap + (inputFileName -> new LongAdder) } statusMap(curInputFileName).increment() } override def getFinalStats(): WriteTaskStats = { val results = statusMap.map { case (filePath, rowCount) => val path = new Path(filePath) IndexBuildResult(path.getName, rowCount.longValue(), "", path.getParent.toString) } IndexWriteTaskStats(results.toSeq) } } class OapIndexWriteJobStatsTracker extends WriteJobStatsTracker with Logging { private[this] var indexBuildResultSeq: Seq[IndexBuildResult] = _ override def newTaskInstance(): WriteTaskStatsTracker = new OapIndexWriteTaskStatsTracker override def processStats(stats: Seq[WriteTaskStats]): Unit = { indexBuildResultSeq = stats.flatMap(_.asInstanceOf[IndexWriteTaskStats].writeStatus) } def indexBuildResults: Seq[IndexBuildResult] = indexBuildResultSeq }
Example 55
Source File: ParquetReadSupportWrapper.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import java.util.{Map => JMap} import org.apache.hadoop.conf.Configuration import org.apache.parquet.hadoop.api.{InitContext, ReadSupport} import org.apache.parquet.hadoop.api.ReadSupport.ReadContext import org.apache.parquet.io.api.RecordMaterializer import org.apache.parquet.schema._ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow override def prepareForRead( conf: Configuration, keyValueMetaData: JMap[String, String], fileSchema: MessageType, readContext: ReadContext): RecordMaterializer[InternalRow] = { readSupport.prepareForRead(conf, keyValueMetaData, fileSchema, readContext) } } object ParquetReadSupportWrapper { // Proxy ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA value. val SPARK_ROW_REQUESTED_SCHEMA: String = ParquetReadSupport.SPARK_ROW_REQUESTED_SCHEMA }
Example 56
Source File: NonNullKeySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.ByteBufferOutputStream class NonNullKeySuite extends SparkFunSuite with Logging { private lazy val random = new Random(0) private lazy val values = { val booleans: Seq[Boolean] = Seq(true, false) val bytes: Seq[Byte] = Seq(Byte.MinValue, 0, 10, 30, Byte.MaxValue) val shorts: Seq[Short] = Seq(Short.MinValue, -100, 0, 10, 200, Short.MaxValue) val ints: Seq[Int] = Seq(Int.MinValue, -100, 0, 100, 12346, Int.MaxValue) val longs: Seq[Long] = Seq(Long.MinValue, -10000, 0, 20, Long.MaxValue) val floats: Seq[Float] = Seq(Float.MinValue, Float.MinPositiveValue, Float.MaxValue) val doubles: Seq[Double] = Seq(Double.MinValue, Double.MinPositiveValue, Double.MaxValue) val strings: Seq[UTF8String] = Seq("", "test", "b plus tree", "BTreeRecordReaderWriter").map(UTF8String.fromString) val binaries: Seq[Array[Byte]] = (0 until 20 by 5).map{ size => val buf = new Array[Byte](size) random.nextBytes(buf) buf } val values = booleans ++ bytes ++ shorts ++ ints ++ longs ++ floats ++ doubles ++ strings ++ binaries ++ Nil random.shuffle(values) } private def toSparkDataType(any: Any): DataType = { any match { case _: Boolean => BooleanType case _: Short => ShortType case _: Byte => ByteType case _: Int => IntegerType case _: Long => LongType case _: Float => FloatType case _: Double => DoubleType case _: UTF8String => StringType case _: Array[Byte] => BinaryType } } test("Read/Write Based On Schema") { values.grouped(10).foreach { valueSeq => val schema = StructType(valueSeq.zipWithIndex.map { case (v, i) => StructField(s"col$i", toSparkDataType(v)) }) val nnkw = new NonNullKeyWriter(schema) val nnkr = new NonNullKeyReader(schema) val row = InternalRow.fromSeq(valueSeq) val buf = new ByteBufferOutputStream() nnkw.writeKey(buf, row) val answerRow = nnkr.readKey(FiberCache(buf.toByteArray), 0)._1 assert(row.equals(answerRow)) } } }
Example 57
Source File: StatisticsTest.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.statistics import java.io.ByteArrayOutputStream import scala.collection.mutable.ArrayBuffer import org.scalatest.BeforeAndAfterEach import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.BaseOrdering import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache import org.apache.spark.sql.execution.datasources.oap.index.RangeInterval import org.apache.spark.sql.execution.datasources.oap.utils.{NonNullKeyReader, NonNullKeyWriter} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.unsafe.Platform import org.apache.spark.unsafe.memory.MemoryBlock import org.apache.spark.unsafe.types.UTF8String abstract class StatisticsTest extends SparkFunSuite with BeforeAndAfterEach { protected def rowGen(i: Int): InternalRow = InternalRow(i, UTF8String.fromString(s"test#$i")) protected lazy val schema: StructType = StructType(StructField("a", IntegerType) :: StructField("b", StringType) :: Nil) @transient protected lazy val nnkw: NonNullKeyWriter = new NonNullKeyWriter(schema) @transient protected lazy val nnkr: NonNullKeyReader = new NonNullKeyReader(schema) @transient protected lazy val ordering: BaseOrdering = GenerateOrdering.create(schema) @transient protected lazy val partialOrdering: BaseOrdering = GenerateOrdering.create(StructType(schema.dropRight(1))) protected var out: ByteArrayOutputStream = _ protected var intervalArray: ArrayBuffer[RangeInterval] = new ArrayBuffer[RangeInterval]() override def beforeEach(): Unit = { out = new ByteArrayOutputStream(8000) } override def afterEach(): Unit = { out.close() intervalArray.clear() } protected def generateInterval( start: InternalRow, end: InternalRow, startInclude: Boolean, endInclude: Boolean): Unit = { intervalArray.clear() intervalArray.append(new RangeInterval(start, end, startInclude, endInclude)) } protected def checkInternalRow(row1: InternalRow, row2: InternalRow): Unit = { val res = row1 == row2 // it works.. assert(res, s"row1: $row1 does not match $row2") } protected def wrapToFiberCache(out: ByteArrayOutputStream): FiberCache = { val bytes = out.toByteArray FiberCache(bytes) } }
Example 58
Source File: DeltaByteArrayEncoderSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAll import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String class DeltaByteArrayEncoderCheck extends Properties("DeltaByteArrayEncoder") { private val rowCountInEachGroup = Gen.choose(1, 1024) private val rowCountInLastGroup = Gen.choose(1, 1024) private val groupCount = Gen.choose(1, 100) property("Encoding/Decoding String Type") = forAll { (values: Array[String]) => forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) { (rowCount, lastCount, groupCount) => if (values.nonEmpty) { // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding' // Normally, the test case should be: // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class) // Validate if 'values' and 'decoded values' are identical. // But ColumnValues only support read value form DataFile. So, we have to use another way // to validate. val referenceFiberBuilder = StringFiberBuilder(rowCount, 0) val fiberBuilder = DeltaByteArrayFiberBuilder(rowCount, 0, StringType) val fiberParser = DeltaByteArrayDataFiberParser( new OapDataFileMetaV1(rowCountInEachGroup = rowCount), StringType) !(0 until groupCount).exists { group => // If lastCount > rowCount, assume lastCount = rowCount val count = if (group < groupCount - 1) { rowCount } else if (lastCount > rowCount) { rowCount } else { lastCount } (0 until count).foreach { row => fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length)))) referenceFiberBuilder .append(InternalRow(UTF8String.fromString(values(row % values.length)))) } val bytes = fiberBuilder.build().fiberData val parsedBytes = fiberParser.parse(bytes, count) val referenceBytes = referenceFiberBuilder.build().fiberData referenceFiberBuilder.clear() fiberBuilder.clear() assert(parsedBytes.length == referenceBytes.length) parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2) } } else true } } } class DeltaByteArrayEncoderSuite extends SparkFunSuite with Checkers { test("Check Encoding/Decoding") { check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck())) } }
Example 59
Source File: DictionaryBasedEncoderSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io import org.apache.parquet.bytes.BytesInput import org.apache.parquet.column.page.DictionaryPage import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary import org.scalacheck.{Arbitrary, Gen, Properties} import org.scalacheck.Prop.forAll import org.scalatest.prop.Checkers import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder import org.apache.spark.sql.types.StringType import org.apache.spark.unsafe.types.UTF8String class DictionaryBasedEncoderCheck extends Properties("DictionaryBasedEncoder") { private val rowCountInEachGroup = Gen.choose(1, 1024) private val rowCountInLastGroup = Gen.choose(1, 1024) private val groupCount = Gen.choose(1, 100) property("Encoding/Decoding String Type") = forAll { (values: Array[String]) => forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) { (rowCount, lastCount, groupCount) => if (values.nonEmpty) { // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding' // Normally, the test case should be: // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class) // Validate if 'values' and 'decoded values' are identical. // But ColumnValues only support read value form DataFile. So, we have to use another way // to validate. val referenceFiberBuilder = StringFiberBuilder(rowCount, 0) val fiberBuilder = PlainBinaryDictionaryFiberBuilder(rowCount, 0, StringType) !(0 until groupCount).exists { group => // If lastCount > rowCount, assume lastCount = rowCount val count = if (group < groupCount - 1) { rowCount } else if (lastCount > rowCount) { rowCount } else { lastCount } (0 until count).foreach { row => fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length)))) referenceFiberBuilder .append(InternalRow(UTF8String.fromString(values(row % values.length)))) } val bytes = fiberBuilder.build().fiberData val dictionary = new PlainBinaryDictionary( new DictionaryPage( BytesInput.from(fiberBuilder.buildDictionary), fiberBuilder.getDictionarySize, org.apache.parquet.column.Encoding.PLAIN)) val fiberParser = PlainDictionaryFiberParser( new OapDataFileMetaV1(rowCountInEachGroup = rowCount), dictionary, StringType) val parsedBytes = fiberParser.parse(bytes, count) val referenceBytes = referenceFiberBuilder.build().fiberData referenceFiberBuilder.clear() referenceFiberBuilder.resetDictionary() fiberBuilder.clear() fiberBuilder.resetDictionary() assert(parsedBytes.length == referenceBytes.length) parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2) } } else { true } } } } class DictionaryBasedEncoderSuite extends SparkFunSuite with Checkers { test("Check Encoding/Decoding") { check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck())) } }
Example 60
Source File: ArrowFileFormat.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.oap.spark.sql.execution.datasources.arrow import scala.collection.JavaConverters._ import com.intel.oap.spark.sql.execution.datasources.arrow.ArrowFileFormat.UnsafeItr import com.intel.oap.spark.sql.execution.datasources.v2.arrow.{ArrowFilters, ArrowOptions} import com.intel.oap.spark.sql.execution.datasources.v2.arrow.ArrowSQLConf._ import org.apache.arrow.dataset.scanner.ScanOptions import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileStatus import org.apache.hadoop.mapreduce.Job import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriterFactory, PartitionedFile} import org.apache.spark.sql.execution.datasources.v2.arrow.ArrowUtils import org.apache.spark.sql.sources.{DataSourceRegister, Filter} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap; class ArrowFileFormat extends FileFormat with DataSourceRegister with Serializable { val batchSize = 4096 def convert(files: Seq[FileStatus], options: Map[String, String]): Option[StructType] = { ArrowUtils.readSchema(files, new CaseInsensitiveStringMap(options.asJava)) } override def inferSchema( sparkSession: SparkSession, options: Map[String, String], files: Seq[FileStatus]): Option[StructType] = { convert(files, options) } override def prepareWrite( sparkSession: SparkSession, job: Job, options: Map[String, String], dataSchema: StructType): OutputWriterFactory = { throw new UnsupportedOperationException("Write is not supported for Arrow source") } override def supportBatch(sparkSession: SparkSession, dataSchema: StructType): Boolean = true override def buildReaderWithPartitionValues(sparkSession: SparkSession, dataSchema: StructType, partitionSchema: StructType, requiredSchema: StructType, filters: Seq[Filter], options: Map[String, String], hadoopConf: Configuration): PartitionedFile => Iterator[InternalRow] = { (file: PartitionedFile) => { val sqlConf = sparkSession.sessionState.conf; val enableFilterPushDown = sqlConf.arrowFilterPushDown val factory = ArrowUtils.makeArrowDiscovery( file.filePath, new ArrowOptions( new CaseInsensitiveStringMap( options.asJava).asScala.toMap)) // todo predicate validation / pushdown val dataset = factory.finish(); val filter = if (enableFilterPushDown) { ArrowFilters.translateFilters(filters) } else { org.apache.arrow.dataset.filter.Filter.EMPTY } val scanOptions = new ScanOptions(requiredSchema.map(f => f.name).toArray, filter, batchSize) val scanner = dataset.newScan(scanOptions) val itrList = scanner .scan() .iterator() .asScala .map(task => task.scan()) .toList val itr = itrList .toIterator .flatMap(itr => itr.asScala) .map(vsr => ArrowUtils.loadVsr(vsr, file.partitionValues, partitionSchema, dataSchema)) new UnsafeItr(itr).asInstanceOf[Iterator[InternalRow]] } } override def shortName(): String = "arrow" } object ArrowFileFormat { class UnsafeItr[T](delegate: Iterator[T]) extends Iterator[T] { override def hasNext: Boolean = delegate.hasNext override def next(): T = delegate.next() } }
Example 61
Source File: MatrixUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are // set as not nullable, except values since in the future, support for binary matrices might // be added for which values are not needed. // the sparse matrix needs colPtrs and rowIndices, which are set as // null, while building the dense matrix. StructType(Seq( StructField("type", ByteType, nullable = false), StructField("numRows", IntegerType, nullable = false), StructField("numCols", IntegerType, nullable = false), StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("isTransposed", BooleanType, nullable = false) )) } override def serialize(obj: Matrix): InternalRow = { val row = new GenericInternalRow(7) obj match { case sm: SparseMatrix => row.setByte(0, 0) row.setInt(1, sm.numRows) row.setInt(2, sm.numCols) row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs)) row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices)) row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values)) row.setBoolean(6, sm.isTransposed) case dm: DenseMatrix => row.setByte(0, 1) row.setInt(1, dm.numRows) row.setInt(2, dm.numCols) row.setNullAt(3) row.setNullAt(4) row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values)) row.setBoolean(6, dm.isTransposed) } row } override def deserialize(datum: Any): Matrix = { datum match { case row: InternalRow => require(row.numFields == 7, s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7") val tpe = row.getByte(0) val numRows = row.getInt(1) val numCols = row.getInt(2) val values = row.getArray(5).toDoubleArray() val isTransposed = row.getBoolean(6) tpe match { case 0 => val colPtrs = row.getArray(3).toIntArray() val rowIndices = row.getArray(4).toIntArray() new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) case 1 => new DenseMatrix(numRows, numCols, values, isTransposed) } } } override def userClass: Class[Matrix] = classOf[Matrix] override def equals(o: Any): Boolean = { o match { case v: MatrixUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode() override def typeName: String = "matrix" override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT" private[spark] override def asNullable: MatrixUDT = this }
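The serialize/deserialize pair above packs a Matrix into a 7-field InternalRow and rebuilds it from one. Below is a minimal round-trip sketch, not part of the original example: it assumes it is compiled inside the org.apache.spark.ml.linalg package (MatrixUDT is private[spark], so the sketch lives in that package, as Spark's own tests do), and the matrix values are arbitrary.

// Hypothetical round-trip check; compiled inside org.apache.spark.ml.linalg
// because MatrixUDT is private[spark]. Values are illustrative only.
package org.apache.spark.ml.linalg

object MatrixUDTRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val udt = new MatrixUDT
    val dense = new DenseMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val row = udt.serialize(dense)      // InternalRow with 7 fields, type byte = 1 for dense
    assert(row.numFields == 7 && row.getByte(0) == 1)
    val back = udt.deserialize(row)     // Matrix with the same shape and values
    assert(back.numRows == 2 && back.numCols == 2)
    assert(back.toArray.sameElements(dense.toArray))
  }
}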
Example 62
Source File: VectorUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse // vectors. The "values" field is nullable because we might want to add binary vectors later, // which uses "size" and "indices", but not "values". StructType(Seq( StructField("type", ByteType, nullable = false), StructField("size", IntegerType, nullable = true), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) } override def serialize(obj: Vector): InternalRow = { obj match { case SparseVector(size, indices, values) => val row = new GenericInternalRow(4) row.setByte(0, 0) row.setInt(1, size) row.update(2, UnsafeArrayData.fromPrimitiveArray(indices)) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row case DenseVector(values) => val row = new GenericInternalRow(4) row.setByte(0, 1) row.setNullAt(1) row.setNullAt(2) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row } } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 4, s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4") val tpe = row.getByte(0) tpe match { case 0 => val size = row.getInt(1) val indices = row.getArray(2).toIntArray() val values = row.getArray(3).toDoubleArray() new SparseVector(size, indices, values) case 1 => val values = row.getArray(3).toDoubleArray() new DenseVector(values) } } } override def pyUDT: String = "pyspark.ml.linalg.VectorUDT" override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case v: VectorUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[VectorUDT].getName.hashCode() override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this }
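A minimal sketch, not part of the original example, of the 4-field row layout described above; it again assumes compilation inside org.apache.spark.ml.linalg since VectorUDT is private[spark], and the vector values are arbitrary.

// Hypothetical layout check; values are illustrative only.
package org.apache.spark.ml.linalg

object VectorUDTLayoutSketch {
  def main(args: Array[String]): Unit = {
    val udt = new VectorUDT
    val sparse = new SparseVector(5, Array(1, 3), Array(10.0, 30.0))
    val row = udt.serialize(sparse)
    assert(row.getByte(0) == 0)                                              // type byte: 0 = sparse
    assert(row.getInt(1) == 5)                                               // size
    assert(row.getArray(2).toIntArray().sameElements(Array(1, 3)))           // indices
    assert(row.getArray(3).toDoubleArray().sameElements(Array(10.0, 30.0)))  // values
    assert(udt.deserialize(row).toArray.sameElements(sparse.toArray))
  }
}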
Example 63
Source File: LocalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes = (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
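fromExternalRows above converts external Rows into catalyst InternalRows before building the relation. A minimal usage sketch, not part of the original example, with illustrative attribute names:

// Hypothetical usage; attribute names and row values are illustrative only.
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val output = Seq(
  AttributeReference("id", IntegerType)(),
  AttributeReference("name", StringType)())
val relation = LocalRelation.fromExternalRows(output, Seq(Row(1, "a"), Row(2, "b")))
// relation.data is a Seq[InternalRow]; strings are stored internally as UTF8String.
assert(relation.data.length == 2)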
Example 64
Source File: MonotonicallyIncreasingID.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}

  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
}
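The generated id combines a partition mask with a per-partition counter: the partition index sits in the upper 31 bits and the record count within the partition in the lower 33 bits. A small arithmetic sketch of that layout (the values are arbitrary):

// Hypothetical values, shown only to illustrate the bit layout above.
val partitionIndex = 5
val recordsSeenSoFar = 7L
val partitionMask = partitionIndex.toLong << 33   // 42949672960L
val id = partitionMask + recordsSeenSoFar         // 42949672967L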
Example 65
Source File: misc.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen._
import org.apache.spark.sql.types._

@ExpressionDescription(
  usage = "_FUNC_() - Returns the current database.",
  extended = """
    Examples:
      > SELECT _FUNC_();
       default
  """)
case class CurrentDatabase() extends LeafExpression with Unevaluable {
  override def dataType: DataType = StringType
  override def foldable: Boolean = true
  override def nullable: Boolean = false
  override def prettyName: String = "current_database"
}
Example 66
Source File: BoundAttribute.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression { override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) { val oev = ctx.currentVars(ordinal) ev.isNull = oev.isNull ev.value = oev.value val code = oev.code oev.code = "" ev.copy(code = code) } else if (nullable) { ev.copy(code = s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);""") } else { ev.copy(code = s"""$javaType ${ev.value} = $value;""", isNull = "false") } } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: AttributeSeq, allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexOf(a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.attrs.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, input(ordinal).nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
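BoundReference reads one column of an InternalRow by ordinal through the type-specialized getters shown above. A minimal evaluation sketch (the row values are arbitrary):

// Hypothetical values, shown only to illustrate eval on an InternalRow.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val row = InternalRow(42, UTF8String.fromString("spark"))
BoundReference(0, IntegerType, nullable = false).eval(row)   // 42
BoundReference(1, StringType, nullable = true).eval(row)     // UTF8String "spark"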
Example 67
Source File: decimalExpressions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types._

case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression {

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val d = input.asInstanceOf[Decimal].clone()
    if (d.changePrecision(dataType.precision, dataType.scale)) {
      d
    } else {
      null
    }
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    nullSafeCodeGen(ctx, ev, eval => {
      val tmp = ctx.freshName("tmp")
      s"""
         | Decimal $tmp = $eval.clone();
         | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) {
         |   ${ev.value} = $tmp;
         | } else {
         |   ${ev.isNull} = true;
         | }
       """.stripMargin
    })
  }

  override def toString: String = s"CheckOverflow($child, $dataType)"

  override def sql: String = child.sql
}
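CheckOverflow returns the input decimal adjusted to the target precision and scale, or null when it cannot fit. A minimal sketch of that behaviour (the literal value and the target types are arbitrary):

// Hypothetical values, shown only to illustrate the null-on-overflow behaviour.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{CheckOverflow, Literal}
import org.apache.spark.sql.types.{Decimal, DecimalType}

val d = Literal(Decimal("123.456"), DecimalType(6, 3))
CheckOverflow(d, DecimalType(4, 1)).eval(InternalRow.empty)   // Decimal 123.5 -- fits precision 4
CheckOverflow(d, DecimalType(3, 1)).eval(InternalRow.empty)   // null -- 123.5 needs 4 digits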
Example 68
Source File: ReferenceToExpressions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable import org.apache.spark.sql.types.DataType case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) extends Expression { override def nullable: Boolean = result.nullable override def dataType: DataType = result.dataType override def checkInputDataTypes(): TypeCheckResult = { if (result.references.nonEmpty) { return TypeCheckFailure("The result expression cannot reference to any attributes.") } var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + s"there are only ${children.length} inputs.") } TypeCheckSuccess } private lazy val projection = UnsafeProjection.create(children) override def eval(input: InternalRow): Any = { result.eval(projection(input)) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val childrenGen = children.map(_.genCode(ctx)) val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map { case (childGen, child) => // SPARK-18125: The children vars are local variables. If the result expression uses // splitExpression, those variables cannot be accessed so compilation fails. // To fix it, we use class variables to hold those local variables. val classChildVarName = ctx.freshName("classChildVar") val classChildVarIsNull = ctx.freshName("classChildVarIsNull") ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "") ctx.addMutableState("boolean", classChildVarIsNull, "") val classChildVar = LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType) val initCode = s"${classChildVar.value} = ${childGen.value};\n" + s"${classChildVar.isNull} = ${childGen.isNull};" (classChildVar, initCode) }.unzip val resultGen = result.transform { case b: BoundReference => classChildrenVars(b.ordinal) }.genCode(ctx) ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\n") + resultGen.code, isNull = resultGen.isNull, value = resultGen.value) } }
Example 69
Source File: GeneratePredicate.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ object GeneratePredicate extends CodeGenerator[Expression, Predicate] { protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in) protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression = BindReferences.bindReference(in, inputSchema) protected def create(predicate: Expression): Predicate = { val ctx = newCodeGenContext() val eval = predicate.genCode(ctx) val codeBody = s""" public SpecificPredicate generate(Object[] references) { return new SpecificPredicate(references); } class SpecificPredicate extends ${classOf[Predicate].getName} { private final Object[] references; ${ctx.declareMutableStates()} public SpecificPredicate(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public void initialize(int partitionIndex) { ${ctx.initPartition()} } ${ctx.declareAddedFunctions()} public boolean eval(InternalRow ${ctx.INPUT_ROW}) { ${eval.code} return !${eval.isNull} && ${eval.value}; } }""" val code = CodeFormatter.stripOverlappingComments( new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") CodeGenerator.compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] } }
Example 70
Source File: ResolveInlineTables.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import scala.util.control.NonFatal

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Cast
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.types.{StructField, StructType}

  private[analysis] def convert(table: UnresolvedInlineTable): LocalRelation = {
    // For each column, traverse all the values and find a common data type and nullability.
    val fields = table.rows.transpose.zip(table.names).map { case (column, name) =>
      val inputTypes = column.map(_.dataType)
      val tpe = TypeCoercion.findWiderTypeWithoutStringPromotion(inputTypes).getOrElse {
        table.failAnalysis(s"incompatible types found in column $name for inline table")
      }
      StructField(name, tpe, nullable = column.exists(_.nullable))
    }
    val attributes = StructType(fields).toAttributes
    assert(fields.size == table.names.size)

    val newRows: Seq[InternalRow] = table.rows.map { row =>
      InternalRow.fromSeq(row.zipWithIndex.map { case (e, ci) =>
        val targetType = fields(ci).dataType
        try {
          if (e.dataType.sameType(targetType)) {
            e.eval()
          } else {
            Cast(e, targetType).eval()
          }
        } catch {
          case NonFatal(ex) =>
            table.failAnalysis(s"failed to evaluate expression ${e.sql}: ${ex.getMessage}")
        }
      })
    }

    LocalRelation(attributes, newRows)
  }
}
Example 71
Source File: DeclarativeAggregateEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateSafeProjection

case class DeclarativeAggregateEvaluator(function: DeclarativeAggregate, input: Seq[Attribute]) {

  lazy val initializer = GenerateSafeProjection.generate(function.initialValues)

  lazy val updater = GenerateSafeProjection.generate(
    function.updateExpressions,
    function.aggBufferAttributes ++ input)

  lazy val merger = GenerateSafeProjection.generate(
    function.mergeExpressions,
    function.aggBufferAttributes ++ function.inputAggBufferAttributes)

  lazy val evaluator = GenerateSafeProjection.generate(
    function.evaluateExpression :: Nil,
    function.aggBufferAttributes)

  def initialize(): InternalRow = initializer.apply(InternalRow.empty).copy()

  def update(values: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = values.foldLeft(initialize()) { (buffer, input) =>
      updater(joiner(buffer, input))
    }
    buffer.copy()
  }

  def merge(buffers: InternalRow*): InternalRow = {
    val joiner = new JoinedRow
    val buffer = buffers.foldLeft(initialize()) { (left, right) =>
      merger(joiner(left, right))
    }
    buffer.copy()
  }

  def eval(buffer: InternalRow): InternalRow = evaluator(buffer).copy()
}
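A minimal sketch of driving the evaluator above with a declarative aggregate such as Sum (the attribute name is illustrative, and the helper class above is assumed to be on the classpath):

// Hypothetical usage of the test helper defined above.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.aggregate.{DeclarativeAggregateEvaluator, Sum}
import org.apache.spark.sql.types.LongType

val input = AttributeReference("value", LongType)()
val evaluator = DeclarativeAggregateEvaluator(Sum(input), Seq(input))
val buffer = evaluator.update(InternalRow(1L), InternalRow(2L), InternalRow(3L))
evaluator.eval(buffer)   // InternalRow containing 6L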
Example 72
Source File: GeneratorExpressionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types._ class GeneratorExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { private def checkTuple(actual: Expression, expected: Seq[InternalRow]): Unit = { assert(actual.eval(null).asInstanceOf[TraversableOnce[InternalRow]].toSeq === expected) } private final val empty_array = CreateArray(Seq.empty) private final val int_array = CreateArray(Seq(1, 2, 3).map(Literal(_))) private final val str_array = CreateArray(Seq("a", "b", "c").map(Literal(_))) test("explode") { val int_correct_answer = Seq(create_row(1), create_row(2), create_row(3)) val str_correct_answer = Seq(create_row("a"), create_row("b"), create_row("c")) checkTuple(Explode(empty_array), Seq.empty) checkTuple(Explode(int_array), int_correct_answer) checkTuple(Explode(str_array), str_correct_answer) } test("posexplode") { val int_correct_answer = Seq(create_row(0, 1), create_row(1, 2), create_row(2, 3)) val str_correct_answer = Seq(create_row(0, "a"), create_row(1, "b"), create_row(2, "c")) checkTuple(PosExplode(CreateArray(Seq.empty)), Seq.empty) checkTuple(PosExplode(int_array), int_correct_answer) checkTuple(PosExplode(str_array), str_correct_answer) } test("inline") { val correct_answer = Seq(create_row(0, "a"), create_row(1, "b"), create_row(2, "c")) checkTuple( Inline(Literal.create(Array(), ArrayType(new StructType().add("id", LongType)))), Seq.empty) checkTuple( Inline(CreateArray(Seq( CreateStruct(Seq(Literal(0), Literal("a"))), CreateStruct(Seq(Literal(1), Literal("b"))), CreateStruct(Seq(Literal(2), Literal("c"))) ))), correct_answer) } test("stack") { checkTuple(Stack(Seq(1, 1).map(Literal(_))), Seq(create_row(1))) checkTuple(Stack(Seq(1, 1, 2).map(Literal(_))), Seq(create_row(1, 2))) checkTuple(Stack(Seq(2, 1, 2).map(Literal(_))), Seq(create_row(1), create_row(2))) checkTuple(Stack(Seq(2, 1, 2, 3).map(Literal(_))), Seq(create_row(1, 2), create_row(3, null))) checkTuple(Stack(Seq(3, 1, 2, 3).map(Literal(_))), Seq(1, 2, 3).map(create_row(_))) checkTuple(Stack(Seq(4, 1, 2, 3).map(Literal(_))), Seq(1, 2, 3, null).map(create_row(_))) checkTuple( Stack(Seq(3, 1, 1.0, "a", 2, 2.0, "b", 3, 3.0, "c").map(Literal(_))), Seq(create_row(1, 1.0, "a"), create_row(2, 2.0, "b"), create_row(3, 3.0, "c"))) assert(Stack(Seq(Literal(1))).checkInputDataTypes().isFailure) assert(Stack(Seq(Literal(1.0))).checkInputDataTypes().isFailure) assert(Stack(Seq(Literal(1), Literal(1), Literal(1.0))).checkInputDataTypes().isSuccess) assert(Stack(Seq(Literal(2), Literal(1), Literal(1.0))).checkInputDataTypes().isFailure) } }
Example 73
Source File: ExpressionEvalHelperSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, IntegerType}

case class BadCodegenExpression() extends LeafExpression {
  override def nullable: Boolean = false
  override def eval(input: InternalRow): Any = 10
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code =
      s"""
        |int some_variable = 11;
        |int ${ev.value} = 10;
      """.stripMargin)
  }
  override def dataType: DataType = IntegerType
}
Example 74
Source File: ObjectExpressionsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{IntegerType, ObjectType} class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-16622: The returned value of the called method in Invoke can be null") { val inputRow = InternalRow.fromSeq(Seq((false, null))) val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of unsafe-backed data") { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) val arrayExpected = new GenericArrayData( Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) // test UnsafeMap-backed data val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( new GenericArrayData(Array(3, 4)), new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } }
Example 75
Source File: GenerateUnsafeRowJoinerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val random = new Random() val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 76
Source File: ConvertToLocalRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor

class ConvertToLocalRelationSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("LocalRelation", FixedPoint(100),
        ConvertToLocalRelation) :: Nil
  }

  test("Project on LocalRelation should be turned into a single LocalRelation") {
    val testRelation = LocalRelation(
      LocalRelation('a.int, 'b.int).output,
      InternalRow(1, 2) :: InternalRow(4, 5) :: Nil)

    val correctAnswer = LocalRelation(
      LocalRelation('a1.int, 'b1.int).output,
      InternalRow(1, 3) :: InternalRow(4, 6) :: Nil)

    val projectOnLocal = testRelation.select(
      UnresolvedAttribute("a").as("a1"),
      (UnresolvedAttribute("b") + 1).as("b1"))

    val optimized = Optimize.execute(projectOnLocal.analyze)

    comparePlans(optimized, correctAnswer)
  }
}
Example 77
Source File: LocalTableScanExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 78
Source File: RowIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import java.util.NoSuchElementException

import org.apache.spark.sql.catalyst.InternalRow

  def toScala: Iterator[InternalRow] = new RowIteratorToScala(this)
}

object RowIterator {
  def fromScala(scalaIter: Iterator[InternalRow]): RowIterator = {
    scalaIter match {
      case wrappedRowIter: RowIteratorToScala => wrappedRowIter.rowIter
      case _ => new RowIteratorFromScala(scalaIter)
    }
  }
}

private final class RowIteratorToScala(val rowIter: RowIterator) extends Iterator[InternalRow] {
  private [this] var hasNextWasCalled: Boolean = false
  private [this] var _hasNext: Boolean = false

  override def hasNext: Boolean = {
    // Idempotency:
    if (!hasNextWasCalled) {
      _hasNext = rowIter.advanceNext()
      hasNextWasCalled = true
    }
    _hasNext
  }

  override def next(): InternalRow = {
    if (!hasNext) throw new NoSuchElementException
    hasNextWasCalled = false
    rowIter.getRow
  }
}

private final class RowIteratorFromScala(scalaIter: Iterator[InternalRow]) extends RowIterator {
  private[this] var _next: InternalRow = null

  override def advanceNext(): Boolean = {
    if (scalaIter.hasNext) {
      _next = scalaIter.next()
      true
    } else {
      _next = null
      false
    }
  }

  override def getRow: InternalRow = _next

  override def toScala: Iterator[InternalRow] = scalaIter
}
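A minimal sketch of the push-style RowIterator API and the Scala bridge shown above (the rows are arbitrary):

// Hypothetical rows, shown only to illustrate advanceNext/getRow and fromScala.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RowIterator

val rowIter = RowIterator.fromScala(Iterator(InternalRow(1), InternalRow(2)))
while (rowIter.advanceNext()) {
  println(rowIter.getRow)
}
// Converting back and forth never stacks wrappers: fromScala recognises an
// iterator produced by toScala and unwraps it instead of wrapping again.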
Example 79
Source File: ShuffledHashJoinExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffledHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "buildDataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size of build side"), "buildTime" -> SQLMetrics.createTimingMetric(sparkContext, "time to build hash map")) override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil private def buildHashedRelation(iter: Iterator[InternalRow]): HashedRelation = { val buildDataSize = longMetric("buildDataSize") val buildTime = longMetric("buildTime") val start = System.nanoTime() val context = TaskContext.get() val relation = HashedRelation(iter, buildKeys, taskMemoryManager = context.taskMemoryManager()) buildTime += (System.nanoTime() - start) / 1000000 buildDataSize += relation.estimatedSize // This relation is usually used until the end of task. context.addTaskCompletionListener(_ => relation.close()) relation } protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") streamedPlan.execute().zipPartitions(buildPlan.execute()) { (streamIter, buildIter) => val hashed = buildHashedRelation(buildIter) join(streamIter, hashed, numOutputRows) } } }
Example 80
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
Example 81
Source File: RecordReaderIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.Closeable import org.apache.hadoop.mapreduce.RecordReader import org.apache.spark.sql.catalyst.InternalRow class RecordReaderIterator[T]( private[this] var rowReader: RecordReader[_, T]) extends Iterator[T] with Closeable { private[this] var havePair = false private[this] var finished = false override def hasNext: Boolean = { if (!finished && !havePair) { finished = !rowReader.nextKeyValue if (finished) { // Close and release the reader here; close() will also be called when the task // completes, but for tasks that read from many files, it helps to release the // resources early. close() } havePair = !finished } !finished } override def next(): T = { if (!hasNext) { throw new java.util.NoSuchElementException("End of stream") } havePair = false rowReader.getCurrentValue } override def close(): Unit = { if (rowReader != null) { try { // Close the reader and release it. Note: it's very important that we don't close the // reader more than once, since that exposes us to MAPREDUCE-5918 when running against // older Hadoop 2.x releases. That bug can lead to non-deterministic corruption issues // when reading compressed input. rowReader.close() } finally { rowReader = null } } } }
Example 82
Source File: ParquetOutputWriter.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce._
import org.apache.parquet.hadoop.ParquetOutputFormat

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.OutputWriter

// NOTE: This class is instantiated and used on executor side only, no need to be serializable.
private[parquet] class ParquetOutputWriter(path: String, context: TaskAttemptContext)
  extends OutputWriter {

  private val recordWriter: RecordWriter[Void, InternalRow] = {
    new ParquetOutputFormat[InternalRow]() {
      override def getDefaultWorkFile(context: TaskAttemptContext, extension: String): Path = {
        new Path(path)
      }
    }.getRecordWriter(context)
  }

  override def write(row: Row): Unit = throw new UnsupportedOperationException("call writeInternal")

  override def writeInternal(row: InternalRow): Unit = recordWriter.write(null, row)

  override def close(): Unit = recordWriter.close(context)
}
Example 83
Source File: Exchange.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, UnaryExecNode}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType

case class ReuseExchange(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val exchanges = mutable.HashMap[StructType, ArrayBuffer[Exchange]]()
    plan.transformUp {
      case exchange: Exchange =>
        // the exchanges that have same results usually also have same schemas (same column names).
        val sameSchema = exchanges.getOrElseUpdate(exchange.schema, ArrayBuffer[Exchange]())
        val samePlan = sameSchema.find { e =>
          exchange.sameResult(e)
        }
        if (samePlan.isDefined) {
          // Keep the output of this exchange, the following plans require that to resolve
          // attributes.
          ReusedExchangeExec(exchange.output, samePlan.get)
        } else {
          sameSchema += exchange
          exchange
        }
    }
  }
}
Example 84
Source File: BoundOrdering.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Projection

private[window] final case class RangeBoundOrdering(
    ordering: Ordering[InternalRow],
    current: Projection,
    bound: Projection)
  extends BoundOrdering {

  override def compare(
      inputRow: InternalRow,
      inputIndex: Int,
      outputRow: InternalRow,
      outputIndex: Int): Int =
    ordering.compare(current(inputRow), bound(outputRow))
}
Example 85
Source File: NullableColumnAccessor.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 86
Source File: NullableColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 87
Source File: CompressibleColumnBuilder.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 88
Source File: CompressibleColumnAccessor.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor}
import org.apache.spark.sql.types.AtomicType

private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor {
  this: NativeColumnAccessor[T] =>

  private var decoder: Decoder[T] = _

  abstract override protected def initialize(): Unit = {
    super.initialize()
    decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType)
  }

  abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext

  override def extractSingle(row: InternalRow, ordinal: Int): Unit = {
    decoder.next(row, ordinal)
  }
}
Example 89
Source File: GenerateExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.metric.SQLMetrics case class GenerateExec( generator: Generator, join: Boolean, outer: Boolean, generatorOutput: Seq[Attribute], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { if (join) { child.output ++ generatorOutput } else { generatorOutput } } override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) override def producedAttributes: AttributeSet = AttributeSet(output) override def outputPartitioning: Partitioning = child.outputPartitioning val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[InternalRow] = { // boundGenerator.terminate() should be triggered after all of the rows in the partition val rows = if (join) { child.execute().mapPartitionsInternal { iter => val generatorNullRow = new GenericInternalRow(generator.elementSchema.length) val joinedRow = new JoinedRow iter.flatMap { row => // we should always set the left (child output) joinedRow.withLeft(row) val outputRows = boundGenerator.eval(row) if (outer && outputRows.isEmpty) { joinedRow.withRight(generatorNullRow) :: Nil } else { outputRows.map(joinedRow.withRight) } } ++ LazyIterator(boundGenerator.terminate).map { row => // we leave the left side as the last element of its child output // keep it the same as Hive does joinedRow.withRight(row) } } } else { child.execute().mapPartitionsInternal { iter => iter.flatMap(boundGenerator.eval) ++ LazyIterator(boundGenerator.terminate) } } val numOutputRows = longMetric("numOutputRows") rows.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(output, output) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } }
Example 90
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 91
Source File: StreamingRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LeafNode import org.apache.spark.sql.execution.LeafExecNode import org.apache.spark.sql.execution.datasources.DataSource object StreamingRelation { def apply(dataSource: DataSource): StreamingRelation = { StreamingRelation( dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes) } } case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode { override def toString: String = sourceName override protected def doExecute(): RDD[InternalRow] = { throw new UnsupportedOperationException("StreamingRelationExec cannot be executed") } } object StreamingExecutionRelation { def apply(source: Source): StreamingExecutionRelation = { StreamingExecutionRelation(source, source.schema.toAttributes) } }
Example 92
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends SparkPlan { val eventTimeStats = new EventTimeStatsAccum() sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delay.milliseconds) .build() a.withMetadata(updatedMetadata) } else { a } } override def children: Seq[SparkPlan] = child :: Nil }
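A short sketch of the event-time extraction above: an UnsafeProjection pulls the timestamp column out of each row, and the stored microsecond value is divided by 1000 to get milliseconds. The column names and the literal timestamp are made up for illustration.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection}
import org.apache.spark.sql.types.{StringType, TimestampType}
import org.apache.spark.unsafe.types.UTF8String
object EventTimeSketch {
  def main(args: Array[String]): Unit = {
    // Child output: (word: String, ts: Timestamp). TimestampType is stored as a Long in microseconds.
    val word = AttributeReference("word", StringType)()
    val ts = AttributeReference("ts", TimestampType)()
    val getEventTime = UnsafeProjection.create(ts :: Nil, Seq(word, ts))
    val row = InternalRow(UTF8String.fromString("a"), 1500000000000000L)
    // Same arithmetic as the operator above: microseconds / 1000 = milliseconds.
    println(getEventTime(row).getLong(0) / 1000) // 1500000000000
  }
}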
Example 93
Source File: CoGroupedIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
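A minimal sketch of the key comparison this iterator relies on: GenerateOrdering compiles a Seq[SortOrder] over the grouping attributes into an Ordering[InternalRow]. The single integer key and the object name are illustrative assumptions.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, AttributeReference, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.types.IntegerType
object KeyOrderingSketch {
  def main(args: Array[String]): Unit = {
    // One-column grouping key, ordered ascending, as in the iterator above.
    val key = AttributeReference("key", IntegerType)()
    val ordering = GenerateOrdering.generate(SortOrder(key, Ascending) :: Nil, key :: Nil)
    val smaller = InternalRow(1)
    val bigger = InternalRow(2)
    println(ordering.compare(smaller, bigger)) // negative: the left grouping key sorts first
    println(ordering.compare(bigger, bigger))  // 0: same grouping key, so both sides are consumed
  }
}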
Example 94
Source File: ReferenceSort.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 95
Source File: ColumnarTestUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
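The utilities above build rows with GenericInternalRow; a small self-contained sketch of that mutation and read-back API (values and names chosen arbitrarily):
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String
object GenericRowSketch {
  def main(args: Array[String]): Unit = {
    val row = new GenericInternalRow(3)
    row.update(0, 7)                            // same as row(0) = 7 in the utility above
    row.update(1, UTF8String.fromString("abc")) // strings live as UTF8String inside InternalRow
    row.setNullAt(2)
    println(row.getInt(0))          // 7
    println(row.getUTF8String(1))   // abc
    println(row.isNullAt(2))        // true
    println(row.get(1, StringType)) // the generic accessor needs the DataType
  }
}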
Example 96
Source File: ExtraStrategiesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
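The copy() after the UnsafeProjection call above matters because the projection reuses a single output buffer. A hedged sketch (made-up schema and strings) showing why the copy is needed when the result outlives the next call:
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String
object UnsafeCopySketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(StructField("a", StringType) :: Nil)
    val proj = UnsafeProjection.create(schema)
    val first = proj(new GenericInternalRow(Array[Any](UTF8String.fromString("so fast"))))
    val saved = first.copy() // detach from the projection's reused buffer
    proj(new GenericInternalRow(Array[Any](UTF8String.fromString("overwritten"))))
    println(saved.getUTF8String(0)) // still "so fast", because copy() materialised the row
  }
}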
Example 97
Source File: CarbonLoadParams.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.management import java.text.SimpleDateFormat import java.util import scala.collection.mutable import org.apache.hadoop.conf.Configuration import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.command.UpdateTableModel import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.carbondata.core.indexstore.PartitionSpec import org.apache.carbondata.core.statusmanager.SegmentStatus import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.events.OperationContext import org.apache.carbondata.processing.loading.model.CarbonLoadModel case class CarbonLoadParams( sparkSession: SparkSession, tableName: String, sizeInBytes: Long, isOverwriteTable: Boolean, carbonLoadModel: CarbonLoadModel, hadoopConf: Configuration, logicalPartitionRelation: LogicalRelation, dateFormat : SimpleDateFormat, timeStampFormat : SimpleDateFormat, optionsOriginal: Map[String, String], finalPartition : Map[String, Option[String]], currPartitions: util.List[PartitionSpec], partitionStatus : SegmentStatus, var dataFrame: Option[DataFrame], scanResultRDD : Option[RDD[InternalRow]], updateModel: Option[UpdateTableModel], operationContext: OperationContext) { }
Example 98
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge import java.sql.{Date, Timestamp} import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection} import org.apache.spark.sql.catalyst.util.DateTimeUtils case class MergeProjection( @transient tableCols: Seq[String], @transient statusCol : String, @transient ds: Dataset[Row], @transient rltn: CarbonDatasourceHadoopRelation, @transient sparkSession: SparkSession, @transient mergeAction: MergeAction) { private val cutOffDate = Integer.MAX_VALUE >> 1 val isUpdate = mergeAction.isInstanceOf[UpdateAction] val isDelete = mergeAction.isInstanceOf[DeleteAction] def apply(row: GenericRowWithSchema): InternalRow = { // TODO we can avoid these multiple conversions if this is added as a SparkPlan node. val values = row.values.map { case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s) case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d) case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b) case d: Date => DateTimeUtils.fromJavaDate(d) case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t) case value => value } projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow] } val (projection, output) = generateProjection private def generateProjection: (Projection, Array[Expression]) = { val existingDsOutput = rltn.carbonRelation.schema.toAttributes val colsMap = mergeAction match { case UpdateAction(updateMap) => updateMap case InsertAction(insertMap) => insertMap case _ => null } if (colsMap != null) { val output = new Array[Expression](tableCols.length) val expecOutput = new Array[Expression](tableCols.length) colsMap.foreach { case (k, v) => val tableIndex = tableCols.indexOf(k.toString().toLowerCase) if (tableIndex < 0) { throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap") } output(tableIndex) = v.expr.transform { case a: Attribute if !a.resolved => ds.queryExecution.analyzed.resolveQuoted(a.name, sparkSession.sessionState.analyzer.resolver).get } expecOutput(tableIndex) = existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get } if (output.contains(null)) { throw new CarbonMergeDataSetException(s"Not all columns are mapped") } (new InterpretedMutableProjection(output++Seq( ds.queryExecution.analyzed.resolveQuoted(statusCol, sparkSession.sessionState.analyzer.resolver).get), ds.queryExecution.analyzed.output), expecOutput) } else { (null, null) } } }
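A brief sketch of the per-value conversions this projection applies before wrapping values in a GenericInternalRow; the sample literals are invented, but the converter calls mirror the match above:
import java.sql.{Date, Timestamp}
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.Decimal
import org.apache.spark.unsafe.types.UTF8String
object ExternalToCatalystSketch {
  def main(args: Array[String]): Unit = {
    val values: Array[Any] = Array(
      UTF8String.fromString("name"),                          // String -> UTF8String
      Decimal(new java.math.BigDecimal("12.50")),             // java BigDecimal -> Decimal
      DateTimeUtils.fromJavaDate(Date.valueOf("2020-01-01")), // Date -> days since epoch (Int)
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2020-01-01 00:00:00"))) // -> microseconds (Long)
    val internal = new GenericInternalRow(values)
    println(internal.getInt(2))  // days since 1970-01-01
    println(internal.getLong(3)) // microseconds since the epoch
  }
}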
Example 99
Source File: SparkUnsafeRowReadSuport.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.readsupport import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.metadata.schema.table.CarbonTable import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport class SparkUnsafeRowReadSuport(requiredSchema: StructType) extends CarbonReadSupport[InternalRow] { private val unsafeProjection = UnsafeProjection.create(requiredSchema) override def initialize(carbonColumns: Array[CarbonColumn], carbonTable: CarbonTable): Unit = { } override def readRow(data: Array[AnyRef]): InternalRow = { unsafeProjection(new GenericInternalRow(data.asInstanceOf[Array[Any]])) } override def close(): Unit = { // Nothing to close } }
Example 100
Source File: CarbonTaskCompletionListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.tasklisteners import org.apache.hadoop.io.NullWritable import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.datasources.RecordReaderIterator import org.apache.spark.util.TaskCompletionListener import org.apache.carbondata.common.logging.LogServiceFactory import org.apache.carbondata.core.memory.UnsafeMemoryManager import org.apache.carbondata.core.util.{DataTypeUtil, ThreadLocalTaskInfo} import org.apache.carbondata.hadoop.internal.ObjectArrayWritable trait CarbonCompactionTaskCompletionListener extends TaskCompletionListener case class CarbonQueryTaskCompletionListenerImpl(iter: RecordReaderIterator[InternalRow], freeMemory: Boolean = false) extends CarbonQueryTaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = { if (iter != null) { try { iter.close() } catch { case e: Exception => LogServiceFactory.getLogService(this.getClass.getCanonicalName).error(e) } } if (freeMemory) { UnsafeMemoryManager.INSTANCE .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId) ThreadLocalTaskInfo.clearCarbonTaskInfo() } DataTypeUtil.clearFormatter() } } case class CarbonLoadTaskCompletionListenerImpl(recordWriter: RecordWriter[NullWritable, ObjectArrayWritable], taskAttemptContext: TaskAttemptContext) extends CarbonLoadTaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = { try { recordWriter.close(taskAttemptContext) } finally { UnsafeMemoryManager.INSTANCE .freeMemoryAll(ThreadLocalTaskInfo.getCarbonTaskInfo.getTaskId) ThreadLocalTaskInfo.clearCarbonTaskInfo() DataTypeUtil.clearFormatter() } } }
Example 101
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 102
Source File: CarbonDataSourceScan.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.strategy import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier} import org.apache.spark.sql.execution.FileSourceScanExec import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation} class CarbonDataSourceScan( override val output: Seq[Attribute], val rdd: RDD[InternalRow], @transient override val relation: HadoopFsRelation, val partitioning: Partitioning, val md: Map[String, String], identifier: Option[TableIdentifier], @transient private val logicalRelation: LogicalRelation) extends FileSourceScanExec( relation, output, relation.dataSchema, Seq.empty, None, Seq.empty, identifier) { // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val supportsBatch: Boolean = true // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val (outputPartitioning, outputOrdering): (Partitioning, Seq[SortOrder]) = (partitioning, Nil) // added lazy since spark 2.3.2 version (SPARK-PR#21815) override lazy val metadata: Map[String, String] = md override def inputRDDs(): Seq[RDD[InternalRow]] = rdd :: Nil }
Example 103
Source File: RowStreamParserImp.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.streaming.parser import java.text.SimpleDateFormat import java.util import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.processing.loading.ComplexDelimitersEnum import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants class RowStreamParserImp extends CarbonStreamParser { var configuration: Configuration = null var isVarcharTypeMapping: Array[Boolean] = null var structType: StructType = null var encoder: ExpressionEncoder[Row] = null var timeStampFormat: SimpleDateFormat = null var dateFormat: SimpleDateFormat = null var complexDelimiters: util.ArrayList[String] = new util.ArrayList[String]() var serializationNullFormat: String = null override def initialize(configuration: Configuration, structType: StructType, isVarcharTypeMapping: Array[Boolean]): Unit = { this.configuration = configuration this.structType = structType this.encoder = RowEncoder.apply(this.structType).resolveAndBind() this.isVarcharTypeMapping = isVarcharTypeMapping this.timeStampFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT)) this.dateFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_DATE_FORMAT)) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_1")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_2")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_3")) this.complexDelimiters.add(ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value()) this.serializationNullFormat = this.configuration.get(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT) } override def parserRow(value: InternalRow): Array[Object] = { this.encoder.fromRow(value).toSeq.zipWithIndex.map { case (x, i) => FieldConverter.objectToString( x, serializationNullFormat, complexDelimiters, timeStampFormat, dateFormat, isVarcharType = i < this.isVarcharTypeMapping.length && this.isVarcharTypeMapping(i), binaryCodec = null) } }.toArray override def close(): Unit = { } }
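A small sketch of the RowEncoder round trip the parser relies on, using the Spark 2.x encoder API (toRow/fromRow) that appears above; the schema and values are illustrative assumptions:
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
object RowEncoderSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
    val encoder = RowEncoder(schema).resolveAndBind()
    // External Row -> InternalRow and back, as parserRow does for each incoming InternalRow.
    val internal = encoder.toRow(Row(1, "abc"))
    val external = encoder.fromRow(internal)
    println(external.getInt(0) + " " + external.getString(1)) // 1 abc
  }
}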
Example 104
Source File: VectorUDT.scala From ann4s with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.nn import ann4s.{Vector0, Vector16, Vector8, Vector32, Vector64, Vector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ class VectorUDT extends UserDefinedType[Vector] { override def sqlType: DataType = _sqlType override def serialize(obj: Vector): InternalRow = { val row = new GenericInternalRow(5) row.setNullAt(1) row.setNullAt(2) row.setNullAt(3) row.setNullAt(4) obj match { case Vector0 => row.setByte(0, 0) case Vector8(values, w, b) => row.setByte(0, 1) row.update(1, UnsafeArrayData.fromPrimitiveArray(values)) row.update(3, UnsafeArrayData.fromPrimitiveArray(Array(w, b))) case Vector16(values) => row.setByte(0, 2) row.update(2, UnsafeArrayData.fromPrimitiveArray(values)) case Vector32(values) => row.setByte(0, 3) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) case Vector64(values) => row.setByte(0, 4) row.update(4, UnsafeArrayData.fromPrimitiveArray(values)) } row } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 5, s"nn.VectorUDT.deserialize given row with length ${row.numFields} but requires length == 5") val tpe = row.getByte(0) tpe match { case 0 => Vector0 case 1 => val wb = row.getArray(3).toFloatArray() Vector8(row.getArray(1).toByteArray(), wb(0), wb(1)) case 2 => Vector16(row.getArray(2).toShortArray()) case 3 => Vector32(row.getArray(3).toFloatArray()) case 4 => Vector64(row.getArray(4).toDoubleArray()) } } } override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case _: VectorUDT => true case _ => false } } override def hashCode(): Int = classOf[VectorUDT].getName.hashCode override def typeName: String = "nn.vector" private[spark] override def asNullable: VectorUDT = this private[this] val _sqlType = { StructType(Seq( StructField("type", ByteType, nullable = false), StructField("fixed8", ArrayType(ByteType, containsNull = false), nullable = true), StructField("fixed16", ArrayType(ShortType, containsNull = false), nullable = true), StructField("float32", ArrayType(FloatType, containsNull = false), nullable = true), StructField("float64", ArrayType(DoubleType, containsNull = false), nullable = true))) } } object VectorUDT { def register(): Unit = { UDTRegistration.register("ann4s.Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.EmptyVector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Fixed8Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Fixed16Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Float32Vector", "org.apache.spark.ml.nn.VectorUDT") UDTRegistration.register("ann4s.Float64Vector", "org.apache.spark.ml.nn.VectorUDT") } }
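A minimal sketch of the UnsafeArrayData packing used by this UDT's serialize/deserialize pair, mirroring the Vector32 branch with an invented float array:
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
object UnsafeArraySketch {
  def main(args: Array[String]): Unit = {
    // Primitive arrays are stored in rows as UnsafeArrayData, as the UDT above does.
    val arr = UnsafeArrayData.fromPrimitiveArray(Array(1.0f, 2.0f, 3.0f))
    val row = new GenericInternalRow(2)
    row.setByte(0, 3) // type tag, mirroring the Vector32 branch
    row.update(1, arr)
    println(row.getByte(0))                                // 3
    println(row.getArray(1).toFloatArray().mkString(","))  // 1.0,2.0,3.0
  }
}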
Example 105
Source File: ScanPartition.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.Item import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter import com.audienceproject.spark.dynamodb.connector.DynamoConnector import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.sources.v2.reader.{InputPartition, InputPartitionReader} import org.apache.spark.sql.types.{StructField, StructType} import scala.collection.JavaConverters._ class ScanPartition(schema: StructType, partitionIndex: Int, connector: DynamoConnector, filters: Array[Filter]) extends InputPartition[InternalRow] { private val requiredColumns = schema.map(_.name) @transient private lazy val typeConversions = schema.collect({ case StructField(name, dataType, _, _) => name -> TypeConversion(name, dataType) }).toMap override def createPartitionReader(): InputPartitionReader[InternalRow] = { if (connector.isEmpty) new EmptyReader else new PartitionReader } private class EmptyReader extends InputPartitionReader[InternalRow] { override def next(): Boolean = false override def get(): InternalRow = throw new IllegalStateException("Unable to call get() on empty iterator") override def close(): Unit = {} } private class PartitionReader extends InputPartitionReader[InternalRow] { private val pageIterator = connector.scan(partitionIndex, requiredColumns, filters).pages().iterator().asScala private val rateLimiter = RateLimiter.create(connector.readLimit) private var innerIterator: Iterator[InternalRow] = Iterator.empty private var currentRow: InternalRow = _ private var proceed = false override def next(): Boolean = { proceed = true innerIterator.hasNext || { if (pageIterator.hasNext) { nextPage() next() } else false } } override def get(): InternalRow = { if (proceed) { currentRow = innerIterator.next() proceed = false } currentRow } override def close(): Unit = {} private def nextPage(): Unit = { val page = pageIterator.next() val result = page.getLowLevelResult Option(result.getScanResult.getConsumedCapacity).foreach(cap => rateLimiter.acquire(cap.getCapacityUnits.toInt max 1)) innerIterator = result.getItems.iterator().asScala.map(itemToRow(requiredColumns)) } } private def itemToRow(requiredColumns: Seq[String])(item: Item): InternalRow = if (requiredColumns.nonEmpty) InternalRow.fromSeq(requiredColumns.map(columnName => typeConversions(columnName)(item))) else InternalRow.fromSeq(item.asMap().asScala.values.toSeq.map(_.toString)) }
Example 106
Source File: DynamoWriterFactory.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.audienceproject.spark.dynamodb.connector.{ColumnSchema, TableConnector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory} import org.apache.spark.sql.types.StructType class DynamoWriterFactory(connector: TableConnector, parameters: Map[String, String], schema: StructType) extends DataWriterFactory[InternalRow] { private val batchSize = parameters.getOrElse("writebatchsize", "25").toInt private val update = parameters.getOrElse("update", "false").toBoolean private val delete = parameters.getOrElse("delete", "false").toBoolean private val region = parameters.get("region") private val roleArn = parameters.get("rolearn") override def createDataWriter(partitionId: Int, taskId: Long, epochId: Long): DataWriter[InternalRow] = { val columnSchema = new ColumnSchema(connector.keySchema, schema) val client = connector.getDynamoDB(region, roleArn) if (update) { assert(!delete, "Please provide exactly one of 'update' or 'delete' options.") new DynamoUpdateWriter(columnSchema, connector, client) } else if (delete) { new DynamoBatchDeleteWriter(batchSize, columnSchema, connector, client) } else { new DynamoBatchWriter(batchSize, columnSchema, connector, client) } } }
Example 107
Source File: TypeConversion.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.{IncompatibleTypeException, Item} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ private[dynamodb] object TypeConversion { def apply(attrName: String, sparkType: DataType): Item => Any = sparkType match { case BooleanType => nullableGet(_.getBOOL)(attrName) case StringType => nullableGet(item => attrName => UTF8String.fromString(item.getString(attrName)))(attrName) case IntegerType => nullableGet(_.getInt)(attrName) case LongType => nullableGet(_.getLong)(attrName) case DoubleType => nullableGet(_.getDouble)(attrName) case FloatType => nullableGet(_.getFloat)(attrName) case BinaryType => nullableGet(_.getBinary)(attrName) case DecimalType() => nullableGet(_.getNumber)(attrName) case ArrayType(innerType, _) => nullableGet(_.getList)(attrName).andThen(extractArray(convertValue(innerType))) case MapType(keyType, valueType, _) => if (keyType != StringType) throw new IllegalArgumentException(s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") nullableGet(_.getRawMap)(attrName).andThen(extractMap(convertValue(valueType))) case StructType(fields) => val nestedConversions = fields.collect({ case StructField(name, dataType, _, _) => name -> convertValue(dataType) }) nullableGet(_.getRawMap)(attrName).andThen(extractStruct(nestedConversions)) case _ => throw new IllegalArgumentException(s"Spark DataType '${sparkType.typeName}' could not be mapped to a corresponding DynamoDB data type.") } private val stringConverter = (value: Any) => UTF8String.fromString(value.asInstanceOf[String]) private def convertValue(sparkType: DataType): Any => Any = sparkType match { case IntegerType => nullableConvert(_.intValue()) case LongType => nullableConvert(_.longValue()) case DoubleType => nullableConvert(_.doubleValue()) case FloatType => nullableConvert(_.floatValue()) case DecimalType() => nullableConvert(identity) case ArrayType(innerType, _) => extractArray(convertValue(innerType)) case MapType(keyType, valueType, _) => if (keyType != StringType) throw new IllegalArgumentException(s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") extractMap(convertValue(valueType)) case StructType(fields) => val nestedConversions = fields.collect({ case StructField(name, dataType, _, _) => name -> convertValue(dataType) }) extractStruct(nestedConversions) case BooleanType => { case boolean: Boolean => boolean case _ => null } case StringType => { case string: String => UTF8String.fromString(string) case _ => null } case BinaryType => { case byteArray: Array[Byte] => byteArray case _ => null } case _ => throw new IllegalArgumentException(s"Spark DataType '${sparkType.typeName}' could not be mapped to a corresponding DynamoDB data type.") } private def nullableGet(getter: Item => String => Any)(attrName: String): Item => Any = { case item if item.hasAttribute(attrName) => try getter(item)(attrName) catch { case _: NumberFormatException => null case _: IncompatibleTypeException => null } case _ => null } private def nullableConvert(converter: java.math.BigDecimal => Any): Any => Any = { case item: java.math.BigDecimal => converter(item) case _ => null } private def extractArray(converter: Any => Any): Any => Any = { case list: java.util.List[_] => new GenericArrayData(list.asScala.map(converter)) case set: java.util.Set[_] => new GenericArrayData(set.asScala.map(converter).toSeq) case _ => null } private def extractMap(converter: Any => Any): Any => Any = { case map: java.util.Map[_, _] => ArrayBasedMapData(map, stringConverter, converter) case _ => null } private def extractStruct(conversions: Seq[(String, Any => Any)]): Any => Any = { case map: java.util.Map[_, _] => InternalRow.fromSeq(conversions.map({ case (name, conv) => conv(map.get(name)) })) case _ => null } }
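A short sketch of the catalyst collection values these converters produce: GenericArrayData for extractArray and InternalRow.fromSeq for extractStruct. The literal values are made up for illustration.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.unsafe.types.UTF8String
object CatalystValueSketch {
  def main(args: Array[String]): Unit = {
    // Array value, as extractArray above builds from a java.util.List.
    val arr = new GenericArrayData(Array[Any](1, 2, 3))
    println(arr.getInt(2)) // 3
    // Struct value, as extractStruct above builds: one converted value per field, in field order.
    val struct = InternalRow.fromSeq(Seq(UTF8String.fromString("key-1"), 42L, null))
    println(struct.getUTF8String(0)) // key-1
    println(struct.isNullAt(2))      // true
  }
}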
Example 108
Source File: DynamoBatchWriter.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import com.amazonaws.services.dynamodbv2.document.DynamoDB import com.audienceproject.shaded.google.common.util.concurrent.RateLimiter import com.audienceproject.spark.dynamodb.connector.{ColumnSchema, TableConnector} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage} import scala.collection.mutable.ArrayBuffer class DynamoBatchWriter(batchSize: Int, columnSchema: ColumnSchema, connector: TableConnector, client: DynamoDB) extends DataWriter[InternalRow] { protected val buffer: ArrayBuffer[InternalRow] = new ArrayBuffer[InternalRow](batchSize) protected val rateLimiter: RateLimiter = RateLimiter.create(connector.writeLimit) override def write(record: InternalRow): Unit = { buffer += record.copy() if (buffer.size == batchSize) { flush() } } override def commit(): WriterCommitMessage = { flush() new WriterCommitMessage {} } override def abort(): Unit = {} protected def flush(): Unit = { if (buffer.nonEmpty) { connector.putItems(columnSchema, buffer)(client, rateLimiter) buffer.clear() } } }
Example 109
Source File: JavaConverter.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.catalyst import java.util import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String import scala.collection.JavaConverters._ object JavaConverter { def convertRowValue(row: InternalRow, index: Int, elementType: DataType): Any = { elementType match { case ArrayType(innerType, _) => convertArray(row.getArray(index), innerType) case MapType(keyType, valueType, _) => convertMap(row.getMap(index), keyType, valueType) case StructType(fields) => convertStruct(row.getStruct(index, fields.length), fields) case StringType => row.getString(index) case _ => row.get(index, elementType) } } def convertArray(array: ArrayData, elementType: DataType): Any = { elementType match { case ArrayType(innerType, _) => array.toSeq[ArrayData](elementType).map(convertArray(_, innerType)).asJava case MapType(keyType, valueType, _) => array.toSeq[MapData](elementType).map(convertMap(_, keyType, valueType)).asJava case structType: StructType => array.toSeq[InternalRow](structType).map(convertStruct(_, structType.fields)).asJava case StringType => convertStringArray(array).asJava case _ => array.toSeq[Any](elementType).asJava } } def convertMap(map: MapData, keyType: DataType, valueType: DataType): util.Map[String, Any] = { if (keyType != StringType) throw new IllegalArgumentException( s"Invalid Map key type '${keyType.typeName}'. DynamoDB only supports String as Map key type.") val keys = convertStringArray(map.keyArray()) val values = valueType match { case ArrayType(innerType, _) => map.valueArray().toSeq[ArrayData](valueType).map(convertArray(_, innerType)) case MapType(innerKeyType, innerValueType, _) => map.valueArray().toSeq[MapData](valueType).map(convertMap(_, innerKeyType, innerValueType)) case structType: StructType => map.valueArray().toSeq[InternalRow](structType).map(convertStruct(_, structType.fields)) case StringType => convertStringArray(map.valueArray()) case _ => map.valueArray().toSeq[Any](valueType) } val kvPairs = for (i <- 0 until map.numElements()) yield keys(i) -> values(i) Map(kvPairs: _*).asJava } def convertStruct(row: InternalRow, fields: Seq[StructField]): util.Map[String, Any] = { val kvPairs = for (i <- 0 until row.numFields) yield if (row.isNullAt(i)) fields(i).name -> null else fields(i).name -> convertRowValue(row, i, fields(i).dataType) Map(kvPairs: _*).asJava } def convertStringArray(array: ArrayData): Seq[String] = array.toSeq[UTF8String](StringType).map(_.toString) }
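The converters above read nested values back out of an InternalRow with the typed accessors; a small sketch (invented row shape) of those accessors:
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.unsafe.types.UTF8String
object NestedReadSketch {
  def main(args: Array[String]): Unit = {
    // A row of shape (name: String, scores: Array[Int], address: Struct(city: String)).
    val row = InternalRow(
      UTF8String.fromString("bob"),
      new GenericArrayData(Array[Any](1, 2)),
      InternalRow(UTF8String.fromString("berlin")))
    println(row.getString(0))                            // bob (getString delegates to getUTF8String)
    println(row.getArray(1).toIntArray().mkString(","))  // 1,2
    println(row.getStruct(2, 1).getString(0))            // berlin; the nested struct has 1 field
  }
}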
Example 110
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.rdd.{EmptyRDD, RDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow]) extends SparkPlan with StreamPlan { def children = Nil override def doExecute() = { assert(validTime != null) Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime)) .asInstanceOf[Option[RDD[InternalRow]]] .getOrElse(new EmptyRDD[InternalRow](sparkContext)) } }
Example 111
Source File: SocketTextSource.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider} import org.apache.spark.sql.streaming.StreamPlan import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.dstream.DStream class SocketTextSource extends SchemaRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { require(parameters.contains("host") && parameters.contains("port") && parameters.contains("messageToRow")) val messageToRow = { try { val clz = Class.forName(parameters("messageToRow")) clz.newInstance().asInstanceOf[MessageToRowConverter] } catch { case e: Exception => sys.error(s"Failed to load class : ${e.toString}") } } new SocketTextRelation( parameters("host"), parameters("port").toInt, messageToRow, schema, sqlContext) } } case class SocketTextRelation( host: String, port: Int, messageToRowConverter: MessageToRowConverter, val schema: StructType, @transient val sqlContext: SQLContext) extends StreamBaseRelation with StreamPlan { // Currently only support Kafka with String messages @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream( host, port) @transient val stream: DStream[InternalRow] = socketStream.map(messageToRowConverter.toRow(_, schema)) }
Example 112
Source File: MessageDelimiter.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.Logging import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal} import org.apache.spark.sql.types.StructType class MessageDelimiter extends MessageToRowConverter with Logging { val delimiter = " " def toRow(msg: String, schema: StructType): InternalRow = { val splitted = msg.split(delimiter).map(Literal(_)) val casted = splitted.indices.map(i => Cast(splitted(i), schema(i).dataType).eval(EmptyRow)) InternalRow.fromSeq(casted) } def toMessage(row: Row): String = row.mkString(delimiter) } trait MessageToRowConverter extends Serializable { def toRow(message: String, schema: StructType): InternalRow def toMessage(row: Row): String }
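A compact sketch of the Literal-plus-Cast parsing idiom used in toRow above, with an invented three-field schema and message:
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Cast, EmptyRow, Literal}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StringType, StructField, StructType}
object DelimiterParseSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("word", StringType),
      StructField("count", IntegerType),
      StructField("score", DoubleType)))
    // Same idea as toRow above: wrap each token in a Literal and Cast it to the field's type.
    val tokens = "spark 3 0.75".split(" ").map(Literal(_))
    val casted = tokens.indices.map(i => Cast(tokens(i), schema(i).dataType).eval(EmptyRow))
    val row = InternalRow.fromSeq(casted)
    println(row.getInt(1) + row.getDouble(2)) // 3.75
  }
}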
Example 113
Source File: StreamPlan.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream private[streaming] object StreamPlan { val currentContext = new ThreadLocal[StreamSQLContext]() } trait StreamPlan { protected var validTime: Time = null def streamSqlContext = StreamPlan.currentContext.get() def stream: DStream[InternalRow] def setValidTime(time: Time): Unit = { validTime = time } }
Example 114
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 115
Source File: JCUDACodegenIterator.scala From GPUEnabler with Apache License 2.0 | 5 votes |
package com.ibm.gpuenabler import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Encoder abstract class JCUDACodegenIterator extends Iterator[InternalRow] { def hasNext() : Boolean def next() : InternalRow def init[T](itr : java.util.Iterator[InternalRow], args: Array[Any], size : Int, cached: Int, gpuPtrs: java.util.List[java.util.Map[String, CachedGPUMeta]], blockID: Int, userGridSizes: Array[Array[Int]], userBlockSizes: Array[Array[Int]], stages: Int, smSize: Int, inpEnc: Encoder[T]) }
Example 116
Source File: ArrowEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.StructType case class ArrowEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { private val batchSize = conf.arrowMaxRecordsPerBatch private val sessionLocalTimeZone = conf.sessionLocalTimeZone private val pandasRespectSessionTimeZone = conf.pandasRespectSessionTimeZone protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { val outputTypes = output.drop(child.output.length).map(_.dataType) // DO NOT use iter.grouped(). See BatchIterator. val batchIter = if (batchSize > 0) new BatchIterator(iter, batchSize) else Iterator(iter) val columnarBatchIter = new ArrowPythonRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_SCALAR_PANDAS_UDF, argOffsets, schema, sessionLocalTimeZone, pandasRespectSessionTimeZone) .compute(batchIter, context.partitionId(), context) new Iterator[InternalRow] { private var currentIter = if (columnarBatchIter.hasNext) { val batch = columnarBatchIter.next() val actualDataTypes = (0 until batch.numCols()).map(i => batch.column(i).dataType()) assert(outputTypes == actualDataTypes, "Invalid schema from pandas_udf: " + s"expected ${outputTypes.mkString(", ")}, got ${actualDataTypes.mkString(", ")}") batch.rowIterator.asScala } else { Iterator.empty } override def hasNext: Boolean = currentIter.hasNext || { if (columnarBatchIter.hasNext) { currentIter = columnarBatchIter.next().rowIterator.asScala hasNext } else { false } } override def next(): InternalRow = currentIter.next() } } }
Example 117
Source File: BatchEvalPythonExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import scala.collection.JavaConverters._ import net.razorvine.pickle.{Pickler, Unpickler} import org.apache.spark.TaskContext import org.apache.spark.api.python.{ChainedPythonFunctions, PythonEvalType} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.types.{StructField, StructType} case class BatchEvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan) extends EvalPythonExec(udfs, output, child) { protected override def evaluate( funcs: Seq[ChainedPythonFunctions], bufferSize: Int, reuseWorker: Boolean, argOffsets: Array[Array[Int]], iter: Iterator[InternalRow], schema: StructType, context: TaskContext): Iterator[InternalRow] = { EvaluatePython.registerPicklers() // register pickler for Row val dataTypes = schema.map(_.dataType) val needConversion = dataTypes.exists(EvaluatePython.needConversionInPython) // enable memo iff we serialize the row with schema (schema and class should be memorized) val pickle = new Pickler(needConversion) // Input iterator to Python: input rows are grouped so we send them in batches to Python. // For each row, add it to the queue. val inputIterator = iter.map { row => if (needConversion) { EvaluatePython.toJava(row, schema) } else { // fast path for these types that does not need conversion in Python val fields = new Array[Any](row.numFields) var i = 0 while (i < row.numFields) { val dt = dataTypes(i) fields(i) = EvaluatePython.toJava(row.get(i, dt), dt) i += 1 } fields } }.grouped(100).map(x => pickle.dumps(x.toArray)) // Output iterator for results from Python. val outputIterator = new PythonUDFRunner( funcs, bufferSize, reuseWorker, PythonEvalType.SQL_BATCHED_UDF, argOffsets) .compute(inputIterator, context.partitionId(), context) val unpickle = new Unpickler val mutableRow = new GenericInternalRow(1) val resultType = if (udfs.length == 1) { udfs.head.dataType } else { StructType(udfs.map(u => StructField("", u.dataType, u.nullable))) } val fromJava = EvaluatePython.makeFromJava(resultType) outputIterator.flatMap { pickedResult => val unpickledBatch = unpickle.loads(pickedResult) unpickledBatch.asInstanceOf[java.util.ArrayList[Any]].asScala }.map { result => if (udfs.length == 1) { // fast path for single UDF mutableRow(0) = fromJava(result) mutableRow } else { fromJava(result).asInstanceOf[InternalRow] } } } }
Example 118
Source File: BoundOrdering.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.window import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Projection private[window] final case class RangeBoundOrdering( ordering: Ordering[InternalRow], current: Projection, bound: Projection) extends BoundOrdering { override def compare( inputRow: InternalRow, inputIndex: Int, outputRow: InternalRow, outputIndex: Int): Int = ordering.compare(current(inputRow), bound(outputRow)) }
Example 119
Source File: NullableColumnAccessor.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnAccessor extends ColumnAccessor { private var nullsBuffer: ByteBuffer = _ private var nullCount: Int = _ private var seenNulls: Int = 0 private var nextNullIndex: Int = _ private var pos: Int = 0 abstract override protected def initialize(): Unit = { nullsBuffer = underlyingBuffer.duplicate().order(ByteOrder.nativeOrder()) nullCount = ByteBufferHelper.getInt(nullsBuffer) nextNullIndex = if (nullCount > 0) ByteBufferHelper.getInt(nullsBuffer) else -1 pos = 0 underlyingBuffer.position(underlyingBuffer.position() + 4 + nullCount * 4) super.initialize() } abstract override def extractTo(row: InternalRow, ordinal: Int): Unit = { if (pos == nextNullIndex) { seenNulls += 1 if (seenNulls < nullCount) { nextNullIndex = ByteBufferHelper.getInt(nullsBuffer) } row.setNullAt(ordinal) } else { super.extractTo(row, ordinal) } pos += 1 } abstract override def hasNext: Boolean = seenNulls < nullCount || super.hasNext }
Example 120
Source File: NullableColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 121
Source File: CompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 122
Source File: CompressibleColumnAccessor.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnAccessor, NativeColumnAccessor} import org.apache.spark.sql.execution.vectorized.WritableColumnVector import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnAccessor[T <: AtomicType] extends ColumnAccessor { this: NativeColumnAccessor[T] => private var decoder: Decoder[T] = _ abstract override protected def initialize(): Unit = { super.initialize() decoder = CompressionScheme(underlyingBuffer.getInt()).decoder(buffer, columnType) } abstract override def hasNext: Boolean = super.hasNext || decoder.hasNext override def extractSingle(row: InternalRow, ordinal: Int): Unit = { decoder.next(row, ordinal) } def decompress(columnVector: WritableColumnVector, capacity: Int): Unit = decoder.decompress(columnVector, capacity) }
Example 123
Source File: MicroBatchWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, SupportsWriteInternalRow, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter class MicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWriter { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) override def createWriterFactory(): DataWriterFactory[Row] = writer.createWriterFactory() } class InternalRowMicroBatchWriter(batchId: Long, writer: StreamWriter) extends DataSourceWriter with SupportsWriteInternalRow { override def commit(messages: Array[WriterCommitMessage]): Unit = { writer.commit(batchId, messages) } override def abort(messages: Array[WriterCommitMessage]): Unit = writer.abort(batchId, messages) override def createInternalRowWriterFactory(): DataWriterFactory[InternalRow] = writer match { case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory() case _ => throw new IllegalStateException( "InternalRowMicroBatchWriter should only be created with base writer support") } }
Example 124
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.types.MetadataBuilder import org.apache.spark.unsafe.types.CalendarInterval import org.apache.spark.util.AccumulatorV2 case class EventTimeWatermarkExec( eventTime: Attribute, delay: CalendarInterval, child: SparkPlan) extends UnaryExecNode { val eventTimeStats = new EventTimeStatsAccum() val delayMs = EventTimeWatermark.getDelayMs(delay) sparkContext.register(eventTimeStats) override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output) iter.map { row => eventTimeStats.add(getEventTime(row).getLong(0) / 1000) row } } } // Update the metadata on the eventTime column to include the desired delay. override val output: Seq[Attribute] = child.output.map { a => if (a semanticEquals eventTime) { val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .putLong(EventTimeWatermark.delayKey, delayMs) .build() a.withMetadata(updatedMetadata) } else if (a.metadata.contains(EventTimeWatermark.delayKey)) { // Remove existing watermark val updatedMetadata = new MetadataBuilder() .withMetadata(a.metadata) .remove(EventTimeWatermark.delayKey) .build() a.withMetadata(updatedMetadata) } else { a } } }
Example 125
Source File: CoGroupedIterator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, Attribute, SortOrder} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
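A minimal usage sketch for CoGroupedIterator, assuming it runs inside a Spark application (the class and its Catalyst dependencies live in Spark-internal packages). Both inputs must already be grouped and sorted by the grouping key, here a single int column:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.execution.CoGroupedIterator
import org.apache.spark.sql.types.IntegerType

val key = AttributeReference("key", IntegerType, nullable = false)()

val left = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 10), InternalRow(1, 11))),
  (InternalRow(3), Iterator(InternalRow(3, 30))))
val right = Iterator(
  (InternalRow(1), Iterator(InternalRow(1, 100))),
  (InternalRow(2), Iterator(InternalRow(2, 200))))

// Emits one entry per key: 1 (both sides), 2 (right only), 3 (left only).
new CoGroupedIterator(left, right, Seq(key)).foreach {
  case (groupKey, leftRows, rightRows) =>
    println(s"key=${groupKey.getInt(0)} left=${leftRows.size} right=${rightRows.size}")
}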
Example 126
Source File: ReferenceSort.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryExecNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = child.outputPartitioning }
Example 127
Source File: ColumnarTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
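A short usage sketch for the helpers above. It has to sit in the same package because INT, STRING and the other column types are private[columnar], and it assumes the Spark SQL test sources are on the classpath:

package org.apache.spark.sql.execution.columnar

object ColumnarTestUtilsDemo {
  def main(args: Array[String]): Unit = {
    // One random row with an int and a string column, plus an all-null row.
    val randomRow = ColumnarTestUtils.makeRandomRow(INT, STRING)
    val nullRow   = ColumnarTestUtils.makeNullRow(2)

    println(randomRow.getInt(0))         // random Int
    println(randomRow.getUTF8String(1))  // random UTF8String
    println(nullRow.isNullAt(0))         // true
  }
}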
Example 128
Source File: ExtraStrategiesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.test.SharedSQLContext case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) val unsafeProj = UnsafeProjection.create(schema) val unsafeRow = unsafeProj(row).copy() sparkContext.parallelize(Seq(unsafeRow)) } override def producedAttributes: AttributeSet = outputSet override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { spark.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { spark.experimental.extraStrategies = Nil } } }
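Besides assigning spark.experimental.extraStrategies at runtime, as the test does, a strategy can also be injected when the session is built. A hedged sketch using SparkSessionExtensions (available since Spark 2.2), assuming TestStrategy from the example above is in scope:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .master("local[*]")
  .appName("extra-strategy-demo")
  // Register the planner strategy once, instead of mutating experimental.extraStrategies.
  .withExtensions(_.injectPlannerStrategy(_ => TestStrategy))
  .getOrCreate()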
Example 129
Source File: DataSourceTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String private[sql] abstract class DataSourceTest extends QueryTest { protected def sqlTest(sqlString: String, expectedAnswer: Seq[Row], enableRegex: Boolean = false) { test(sqlString) { withSQLConf(SQLConf.SUPPORT_QUOTED_REGEX_COLUMN_NAME.key -> enableRegex.toString) { checkAnswer(spark.sql(sqlString), expectedAnswer) } } } } class DDLScanSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { SimpleDDLScan( parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext.sparkSession) } } case class SimpleDDLScan( from: Int, to: Int, table: String)(@transient val sparkSession: SparkSession) extends BaseRelation with TableScan { override def sqlContext: SQLContext = sparkSession.sqlContext override def schema: StructType = StructType(Seq( StructField("intType", IntegerType, nullable = false).withComment(s"test comment $table"), StructField("stringType", StringType, nullable = false), StructField("dateType", DateType, nullable = false), StructField("timestampType", TimestampType, nullable = false), StructField("doubleType", DoubleType, nullable = false), StructField("bigintType", LongType, nullable = false), StructField("tinyintType", ByteType, nullable = false), StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false), StructField("fixedDecimalType", DecimalType(5, 1), nullable = false), StructField("binaryType", BinaryType, nullable = false), StructField("booleanType", BooleanType, nullable = false), StructField("smallIntType", ShortType, nullable = false), StructField("floatType", FloatType, nullable = false), StructField("mapType", MapType(StringType, StringType)), StructField("arrayType", ArrayType(StringType)), StructField("structType", StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil ) ) )) override def needConversion: Boolean = false override def buildScan(): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] sparkSession.sparkContext.parallelize(from to to).map { e => InternalRow(UTF8String.fromString(s"people$e"), e * 2) }.asInstanceOf[RDD[Row]] } }
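SimpleDDLScan above sets needConversion = false and smuggles InternalRows through an RDD[Row] via type erasure. A sketch of the conventional alternative, returning external Rows and letting Spark do the conversion; SimpleExternalRowScan is a hypothetical name, not part of the suite:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// needConversion stays at its default (true): return ordinary Rows and
// Spark converts them to InternalRows behind the scenes.
case class SimpleExternalRowScan(from: Int, to: Int)(@transient val spark: SparkSession)
  extends BaseRelation with TableScan {

  override def sqlContext: SQLContext = spark.sqlContext

  override def schema: StructType = StructType(Seq(
    StructField("name", StringType, nullable = false),
    StructField("doubled", IntegerType, nullable = false)))

  override def buildScan(): RDD[Row] =
    spark.sparkContext.parallelize(from to to).map(e => Row(s"people$e", e * 2))
}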
Example 130
Source File: UnivocityGenerator.scala From mimir with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.ubodin.csv import java.io.Writer import com.univocity.parsers.csv.CsvWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.types._ private[csv] class UnivocityGenerator( schema: StructType, writer: Writer, options: CSVOptions) { private val writerSettings = options.asWriterSettings writerSettings.setHeaders(schema.fieldNames: _*) private val gen = new CsvWriter(writer, writerSettings) private var printHeader = options.headerFlag // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`. // When the value is null, this converter should not be called. private type ValueConverter = (InternalRow, Int) => String // `ValueConverter`s for all values in the fields of the schema private val valueConverters: Array[ValueConverter] = schema.map(_.dataType).map(makeConverter).toArray private def makeConverter(dataType: DataType): ValueConverter = dataType match { case DateType => (row: InternalRow, ordinal: Int) => options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal))) case TimestampType => (row: InternalRow, ordinal: Int) => options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal))) case udt: UserDefinedType[_] => makeConverter(udt.sqlType) case dt: DataType => (row: InternalRow, ordinal: Int) => row.get(ordinal, dt).toString } private def convertRow(row: InternalRow): Seq[String] = { var i = 0 val values = new Array[String](row.numFields) while (i < row.numFields) { if (!row.isNullAt(i)) { values(i) = valueConverters(i).apply(row, i) } else { values(i) = options.nullValue } i += 1 } values } def write(row: InternalRow): Unit = { if (printHeader) { gen.writeHeaders() } gen.writeRow(convertRow(row): _*) printHeader = false } def close(): Unit = gen.close() def flush(): Unit = gen.flush() }
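The Date and Timestamp converters above depend on Catalyst's internal encodings: DateType is stored as days since 1970-01-01 (an Int) and TimestampType as microseconds since the epoch (a Long). A small sketch of those conversions; the literal values are arbitrary:

import org.apache.spark.sql.catalyst.util.DateTimeUtils

val days   = 18000                    // an internal DateType value (days since epoch)
val micros = 1555200000L * 1000000L   // an internal TimestampType value (microseconds)

println(DateTimeUtils.toJavaDate(days))        // java.sql.Date
println(DateTimeUtils.toJavaTimestamp(micros)) // java.sql.Timestamp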
Example 131
Source File: PDFDataSource.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.datasource.pdf import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.types._ import org.apache.spark.sql.sources.v2.reader._ import scala.collection.JavaConverters._ import org.apache.spark.sql.catalyst.InternalRow import java.util.{Collections, List => JList, Optional} import org.apache.spark.unsafe.types.UTF8String import mimir.exec.spark.datasource.csv.CSVDataSourceReader class DefaultSource extends DataSourceV2 with ReadSupport { def createReader(options: DataSourceOptions) = { val path = options.get("path").get val pages = options.get("pages").orElse("all") val area = Option(options.get("area").orElse(null)) val hasGrid = options.get("gridLines").orElse("false").toBoolean val pdfExtractor = new PDFTableExtractor() val outPath = s"${path}.csv" pdfExtractor.defaultExtract(path, pages, area, Some(outPath), hasGrid) //println(s"------PDFDataSource----$path -> $outPath") //println({scala.io.Source.fromFile(outPath).mkString}) new CSVDataSourceReader(outPath, options.asMap().asScala.toMap) } }
Example 132
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf import java.io.ByteArrayOutputStream import org.apache.hadoop.fs.Path import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext} import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types.StructType def write(row: Row): Unit = { val labelColumnName = options.getOrElse("labelColumnName", "label") val featuresColumnName = options.getOrElse("featuresColumnName", "features") val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName)) record.writeTo(byteArrayOutputStream) recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray)) byteArrayOutputStream.reset() } override def close(): Unit = { recordWriter.close(context) } }
Example 133
Source File: StarryLocalTableScanExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class StarryLocalTableScanExec( tableName: String, output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val rdd = new StarryRDD(sparkContext, tableName, unsafeRows) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.length) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.length) taken } }
Example 134
Source File: StarryHashJoinExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.JoinType import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} case class StarryHashJoinExec( leftKeys: Seq[Expression], rightKeys: Seq[Expression], joinType: JoinType, buildSide: BuildSide, condition: Option[Expression], left: SparkPlan, right: SparkPlan) extends BinaryExecNode with HashJoin { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"), "avgHashProbe" -> SQLMetrics.createAverageMetric(sparkContext, "avg hash probe")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val avgHashProbe = longMetric("avgHashProbe") val rows = SparkPlanExecutor.doExec(buildPlan) val hashed = HashedRelation(rows.iterator, buildKeys, rows.length, null) streamedPlan.execute().mapPartitions { streamedIter => join(streamedIter, hashed, numOutputRows, avgHashProbe) } } }
Example 135
Source File: StarryTakeOrderedAndProjectExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.util.Utils case class StarryTakeOrderedAndProjectExec( limit: Int, sortOrder: Seq[SortOrder], projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { projectList.map(_.toAttribute) } override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) data.map(r => proj(r).copy()) } else { data } } protected override def doExecute(): RDD[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val localTopK: RDD[InternalRow] = { child.execute().map(_.copy()).mapPartitions { iter => org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord) } } localTopK.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) topK.map(r => proj(r)) } else { topK } } } override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = SinglePartition override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" } }
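The operator above builds its per-partition top-K with LazilyGeneratedOrdering. A standalone sketch of that ordering on plain InternalRows; it substitutes Scala's sorted for the private[spark] takeOrdered utility used in the operator:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Ascending, BoundReference, SortOrder}
import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering
import org.apache.spark.sql.types.IntegerType

// Order rows by their first (int) column, ascending.
val ord = new LazilyGeneratedOrdering(
  Seq(SortOrder(BoundReference(0, IntegerType, nullable = false), Ascending)))

val rows = Seq(InternalRow(3), InternalRow(1), InternalRow(2))
val top2 = rows.sorted(ord).take(2)

println(top2.map(_.getInt(0))) // List(1, 2)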
Example 136
Source File: StarryUnionExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import com.github.passionke.starry.SparkPlanExecutor import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class StarryUnionExec(children: Seq[SparkPlan]) extends SparkPlan { override def output: Seq[Attribute] = children.map(_.output).transpose.map(attrs => attrs.head.withNullability(attrs.exists(_.nullable))) protected override def doExecute(): RDD[InternalRow] = { val b = children.flatMap(child => { SparkPlanExecutor.doExec(child) }) new StarryRDD(sparkContext, b) } }
Example 137
Source File: StarryLocalRelation.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.logical import org.apache.spark.sql.catalyst.{InternalRow, analysis} import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LocalRelation, Statistics} override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
Example 138
Source File: SparkPlanExecutor.scala From starry with Apache License 2.0 | 5 votes |
package com.github.passionke.starry import org.apache.spark.{Partition, StarryTaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.{ReuseSubquery, SparkPlan} object SparkPlanExecutor { def exec(plan: SparkPlan, sparkSession: SparkSession) = { val newPlan = Seq( ReuseSubquery(sparkSession.sessionState.conf)) .foldLeft(plan) { case (sp, rule) => rule.apply(sp) } doExec(newPlan) } def firstPartition(rdd: RDD[InternalRow]): Partition = { rdd.partitions.head } def doExec(sparkPlan: SparkPlan): List[InternalRow] = { val rdd = sparkPlan.execute().map(ite => ite.copy()) val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } def rddCompute(rdd: RDD[InternalRow]): List[InternalRow] = { val partition = firstPartition(rdd) rdd.compute(partition, new StarryTaskContext).toList } }
Example 139
Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import java.nio.ByteBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} import org.apache.spark.sql.types.{BinaryType, StringType} private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], inputSchema: Seq[Attribute]) extends Logging { private var producer: KinesisProducer = _ private val projection = createProjection private val streamName = producerConfiguration.getOrElse( KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") def execute(iterator: Iterator[InternalRow]): Unit = { producer = CachedKinesisProducer.getOrCreate(producerConfiguration) while (iterator.hasNext) { val currentRow = iterator.next() val projectedRow = projection(currentRow) val partitionKey = projectedRow.getString(0) val data = projectedRow.getBinary(1) sendData(partitionKey, data) } } def sendData(partitionKey: String, data: Array[Byte]): String = { var sentSeqNumbers = new String val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = { logError(s"Writing to $streamName failed due to ${t.getCause}") } override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId sentSeqNumbers = result.getSequenceNumber } } Futures.addCallback(future, kinesisCallBack) producer.flushSync() sentSeqNumbers } def close(): Unit = { if (producer != null) { producer.flush() producer = null } } private def createProjection: UnsafeProjection = { val partitionKeyExpression = inputSchema .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) partitionKeyExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") ) dataExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } UnsafeProjection.create( Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) } }
Example 140
Source File: relations.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.sql.execution.LeafNode import org.apache.spark.sql.catalyst.InternalRow abstract class RecursiveRelation(name: String, output: Seq[Attribute], partitioning: Seq[Int]) extends LeafNode { @transient final val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] override def simpleString = s"$nodeName " + output.mkString("[", ",", "]") + "(" + name + ")" override def outputPartitioning: Partitioning = { if (partitioning == null || partitioning.isEmpty) UnknownPartitioning(0) else new HashPartitioning(partitioning.zip(output).filter(_._1 == 1).map(_._2), bigDatalogContext.conf.numShufflePartitions) } override def outputsUnsafeRows: Boolean = true override def doExecute(): RDD[InternalRow] = { bigDatalogContext.getRDD(name) } } case class LinearRecursiveRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation(name, output, partitioning) case class NonLinearRecursiveRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation("all_" + name, output, partitioning) case class AggregateRelation(name : String, output : Seq[Attribute], partitioning: Seq[Int]) extends RecursiveRelation(name, output, partitioning)
Example 141
Source File: AggregateSetRDDMinMaxPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd

import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo
import edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates.{AggregateSetRDDPartition, KeyValueToInternalRowIterator, MonotonicAggregate}
import edu.ucla.cs.wis.bigdatalog.spark.storage.map.UnsafeFixedWidthMonotonicAggregationMap
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow

class AggregateSetRDDMinMaxPartition(aggregateStore: UnsafeFixedWidthMonotonicAggregationMap,
                                     schemaInfo: SchemaInfo,
                                     monotonicAggregate: MonotonicAggregate)
  extends AggregateSetRDDPartition(aggregateStore, schemaInfo, monotonicAggregate) with Serializable {

  def size: Int = aggregateStore.numElements()

  def iterator: Iterator[InternalRow] = {
    KeyValueToInternalRowIterator(aggregateStore.iterator(), monotonicAggregate.generateResultProjection())
  }

  // update() merges the results produced during the iteration into this partition.
  // During update():
  //  - the underlying aggregateSetRDDPartition storage is updated.
  //  - a 2nd aggregateSetRDDPartition is produced, indicating the rows that changed during the merge.
  // This is similar to regular aggregation, except that we re-use the same hashmap each iteration.
  def update(iter: Iterator[InternalRow],
             monotonicAggregate: MonotonicAggregate): (AggregateSetRDDPartition, SetRDDPartition[InternalRow]) = {
    val start = System.currentTimeMillis()
    val before = aggregateStore.numElements()

    // this is going to perform the aggregation and return an iterator over the output
    val maIter = monotonicAggregate.getAggregationIterator(iter, aggregateStore)

    logInfo("Update deltaSPrime set size before %s after %s, delta set size %s took %s ms"
      .format(before, aggregateStore.numElements(), maIter.deltaSet.size, System.currentTimeMillis() - start))

    val hashMapIter = new JavaHashMapIterator(maIter.deltaSet, monotonicAggregate.generateResultProjection())

    (new AggregateSetRDDMinMaxPartition(aggregateStore, schemaInfo, monotonicAggregate),
      SetRDDHashSetPartition(hashMapIter, schemaInfo))
  }
}

class JavaHashMapIterator(hashMap: java.util.HashMap[UnsafeRow, UnsafeRow],
                          resultProjection: (UnsafeRow, UnsafeRow) => UnsafeRow) extends Iterator[InternalRow] {
  val iterator = hashMap.entrySet().iterator()

  override def hasNext: Boolean = iterator.hasNext

  override def next: InternalRow = {
    val entry = iterator.next()
    resultProjection(entry.getKey, entry.getValue)
  }
}
Example 142
Source File: AggregateSetRDDPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo import edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd.SetRDDPartition import edu.ucla.cs.wis.bigdatalog.spark.storage.map.UnsafeFixedWidthMonotonicAggregationMap import org.apache.spark.sql.catalyst.InternalRow abstract class AggregateSetRDDPartition(val aggregateStore: UnsafeFixedWidthMonotonicAggregationMap, val schemaInfo: SchemaInfo, val monotonicAggregate: MonotonicAggregate) extends Serializable with org.apache.spark.Logging { def this() = this(null, null, null) def size: Int def iterator: Iterator[InternalRow] def update(iter: Iterator[InternalRow], monotonicAggregate: MonotonicAggregate): (AggregateSetRDDPartition, SetRDDPartition[InternalRow]) }
Example 143
Source File: SetRDDHashSetPartition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.setrdd import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo import edu.ucla.cs.wis.bigdatalog.spark.storage.HashSetManager import edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset.{HashSet, HashSetRowIterator} import org.apache.spark.sql.catalyst.InternalRow import scala.reflect.ClassTag class SetRDDHashSetPartition(val set: HashSet, schemaInfo: SchemaInfo, numFactsGenerated: Long = 0, numFactsDerived: Long = 0) (implicit val cTag: ClassTag[InternalRow]) extends SetRDDPartition[InternalRow](numFactsGenerated, numFactsDerived) with Serializable { def this(set: HashSet, schemaInfo: SchemaInfo) = this(set, schemaInfo, 0, 0) override def size: Long = set.size override def iterator: Iterator[InternalRow] = HashSetRowIterator.create(set) override def union(otherPart: SetRDDPartition[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() val newPartition = otherPart match { case otherPart: SetRDDHashSetPartition => { val set : HashSet = this.set.union(otherPart.set) new SetRDDHashSetPartition(set, schemaInfo) } case other => union(otherPart.iterator, rddId) } logInfo("Union set size %s for rdd %s took %s ms".format(this.set.size, rddId, System.currentTimeMillis() - start)) newPartition } override def union(iter: Iterator[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() // add items to the existing set val newSet = this.set while (iter.hasNext) newSet.insert(iter.next()) logInfo("Union set size %s for rdd %s took %s ms".format(this.set.size, rddId, System.currentTimeMillis() - start)) new SetRDDHashSetPartition(newSet, schemaInfo) } override def diff(iter: Iterator[InternalRow], rddId: Int): SetRDDHashSetPartition = { val start = System.currentTimeMillis() val diffSet = HashSetManager.create(schemaInfo) //var row: InternalRow = null var numFactsGenerated: Long = 0 while (iter.hasNext) { //row = iter.next() numFactsGenerated += 1 this.set.ifNotExistsInsert(iter.next(), diffSet) //if (!this.set.exists(row)) // diffSet.insert(row) } logInfo("Diff set size %s for rdd %s took %s ms".format(diffSet.size, rddId, System.currentTimeMillis() - start)) new SetRDDHashSetPartition(diffSet, schemaInfo, numFactsGenerated, diffSet.size) } } object SetRDDHashSetPartition { def apply(iter: Iterator[InternalRow], schemaInfo: SchemaInfo): SetRDDHashSetPartition = { val set = HashSetManager.create(schemaInfo) while (iter.hasNext) set.insert(iter.next()) new SetRDDHashSetPartition(set, schemaInfo) } }
Example 144
Source File: ShuffleHashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution import edu.ucla.cs.wis.bigdatalog.spark.BigDatalogContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning, PartitioningCollection} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashJoin, HashedRelation} import org.apache.spark.sql.execution.metric.SQLMetrics case class ShuffleHashJoin(leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { @transient final protected val bigDatalogContext = SQLContext.getActive().getOrElse(null).asInstanceOf[BigDatalogContext] val cacheBuildSide = bigDatalogContext.getConf.getBoolean("spark.datalog.shufflehashjoin.cachebuildside", true) override lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) var cachedBuildPlan: RDD[HashedRelation] = null override def output: Seq[Attribute] = left.output ++ right.output override def outputPartitioning: Partitioning = PartitioningCollection(Seq(left.outputPartitioning, right.outputPartitioning)) override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected override def doExecute(): RDD[InternalRow] = { val numStreamedRows = buildSide match { case BuildLeft => longMetric("numRightRows") case BuildRight => longMetric("numLeftRows") } val numOutputRows = longMetric("numOutputRows") if (cacheBuildSide) { if (cachedBuildPlan == null) { cachedBuildPlan = buildPlan.execute() .mapPartitionsInternal(iter => Iterator(HashedRelation(iter, SQLMetrics.nullLongMetric, buildSideKeyGenerator))) .persist() } cachedBuildPlan.zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => hashJoin(streamedIter, numStreamedRows, buildIter.next(), numOutputRows)} } else { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamedIter) => val hashedRelation = HashedRelation(buildIter, SQLMetrics.nullLongMetric, buildSideKeyGenerator) hashJoin(streamedIter, numStreamedRows, hashedRelation, numOutputRows) } } } }
Example 145
Source File: RelationCatalog.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import scala.collection.mutable.HashMap class RelationCatalog extends Serializable { val directory = HashMap.empty[String, RelationInfo] def addRelation(name : String, schema : StructType) : Unit = { val relationInfo = new RelationInfo().setSchema(schema) directory.get(name) match { case Some(oldRelationInfo) => // update rdd if already present. Schema should not change oldRelationInfo.setRDD(relationInfo.getRDD()) case None => directory.put(name, relationInfo) } } def setRDD(name : String, rdd : RDD[InternalRow]) : Unit = { directory.get(name) match { case Some(oldRelationInfo) => oldRelationInfo.setRDD(rdd) case None => directory.put(name, new RelationInfo().setRDD(rdd)) } } def getRelationInfo(name : String) : RelationInfo = { if (directory.contains(name)) directory(name) else null } def removeRDD(name : String) : Unit = { directory.remove(name) } def clear() : Unit = { directory.clear() } override def toString(): String = { val output = new StringBuilder() directory.iterator.foreach(f => output.append(f.toString())) output.toString() } } class RelationInfo() extends Serializable { private var schema : StructType = _ private var rdd : RDD[InternalRow] = _ def getSchema() : StructType = schema def setSchema(schema : StructType) : RelationInfo = { this.schema = schema this } def getRDD() : RDD[InternalRow] = rdd def setRDD(rdd : RDD[InternalRow]) : RelationInfo = { this.rdd = rdd this } override def toString() : String = { "schema: " + this.schema + (if (rdd != null) " RDD") } }
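A minimal usage sketch for RelationCatalog: register a schema, attach an RDD of InternalRows, then look the relation up. It assumes an existing SparkContext named sc:

import edu.ucla.cs.wis.bigdatalog.spark.RelationCatalog
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

val catalog = new RelationCatalog()
val schema = StructType(StructField("arc", IntegerType, nullable = false) :: Nil)

catalog.addRelation("edges", schema)
catalog.setRDD("edges", sc.parallelize(Seq(InternalRow(1), InternalRow(2)))) // sc: SparkContext

val info = catalog.getRelationInfo("edges")
println(info.getSchema())       // the schema registered above
println(info.getRDD().count())  // 2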
Example 146
Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter} import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { rawIter.next() } } class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() val uRow = new UnsafeRow() val bufferHolder = new BufferHolder() val rowWriter = new UnsafeRowWriter() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { bufferHolder.reset() rowWriter.initialize(bufferHolder, 1) rowWriter.write(0, rawIter.next()) uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize()) uRow } } class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] { val rawIter = set.iterator() val numFields = set.schemaInfo.arity val uRow = new UnsafeRow() val bufferHolder = new BufferHolder() val rowWriter = new UnsafeRowWriter() override final def hasNext(): Boolean = { rawIter.hasNext } override final def next(): InternalRow = { bufferHolder.reset() rowWriter.initialize(bufferHolder, numFields) val value = rawIter.nextLong() if (numFields == 2) { rowWriter.write(0, (value >> 32).toInt) rowWriter.write(1, value.toInt) } else { rowWriter.write(0, value) } uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize()) uRow } } object HashSetRowIterator { def create(set: HashSet): Iterator[InternalRow] = { set match { //case set: UnsafeFixedWidthSet => set.iterator().asScala case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set) case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set) case set: ObjectHashSet => new ObjectHashSetRowIterator(set) } } }
Example 147
Source File: LocalRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs = Iterator(output) override def sameResult(plan: LogicalPlan): Boolean = plan match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } override lazy val statistics = Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length) }
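A sketch of building this (BigDatalog / Spark 1.6 style) LocalRelation from external Rows with the fromExternalRows helper above; the converter turns each Row into an InternalRow, so strings end up stored as UTF8String:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{IntegerType, StringType}

val output = Seq(
  AttributeReference("name", StringType, nullable = false)(),
  AttributeReference("age", IntegerType, nullable = false)())

val relation = LocalRelation.fromExternalRows(output, Seq(Row("ann", 30), Row("bob", 25)))

println(relation.data.head) // an InternalRow holding [ann,30]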
Example 148
Source File: MonotonicallyIncreasingID.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
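The generated id packs the partition id into the bits above position 33 and a per-partition record counter into the low 33 bits, so ids are unique across partitions and increasing within one. A small worked sketch of that packing (plain arithmetic, no Spark needed):

// partitionMask = partitionId.toLong << 33; the counter occupies the low 33 bits.
def monotonicId(partitionId: Int, recordIndex: Long): Long =
  (partitionId.toLong << 33) + recordIndex

println(monotonicId(0, 0)) // 0
println(monotonicId(0, 2)) // 2
println(monotonicId(1, 0)) // 8589934592 (= 1L << 33)
println(monotonicId(3, 5)) // 25769803781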
Example 149
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian(); """ } }
Example 150
Source File: BoundAttribute.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression with NamedExpression { override def toString: String = s"input[$ordinal, $dataType]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def name: String = s"i[$ordinal]" override def toAttribute: Attribute = throw new UnsupportedOperationException override def qualifiers: Seq[String] = throw new UnsupportedOperationException override def exprId: ExprId = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value); """ } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: Seq[Attribute], allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexWhere(_.exprId == a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, a.nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
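A short sketch of how BindReferences and BoundReference fit together: bind an AttributeReference against an input schema, then evaluate the resulting bound expression against an InternalRow (strings are stored internally as UTF8String):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, BindReferences, BoundReference, Expression}
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

val a = AttributeReference("a", IntegerType, nullable = false)()
val b = AttributeReference("b", StringType, nullable = false)()

// Bind `b` against the input schema [a, b]: it resolves to ordinal 1.
val bound: Expression = BindReferences.bindReference[Expression](b, Seq(a, b))
println(bound) // input[1, StringType]

val row = InternalRow(42, UTF8String.fromString("hello"))
println(BoundReference(1, StringType, nullable = false).eval(row)) // hello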
Example 151
Source File: decimalExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types._ case class CheckOverflow(child: Expression, dataType: DecimalType) extends UnaryExpression { override def nullable: Boolean = true override def nullSafeEval(input: Any): Any = { val d = input.asInstanceOf[Decimal].clone() if (d.changePrecision(dataType.precision, dataType.scale)) { d } else { null } } override protected def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { nullSafeCodeGen(ctx, ev, eval => { val tmp = ctx.freshName("tmp") s""" | Decimal $tmp = $eval.clone(); | if ($tmp.changePrecision(${dataType.precision}, ${dataType.scale})) { | ${ev.value} = $tmp; | } else { | ${ev.isNull} = true; | } """.stripMargin }) } override def toString: String = s"CheckOverflow($child, $dataType)" }
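CheckOverflow delegates to Decimal.changePrecision, which returns false when the value cannot be represented with the target precision and scale; that false is what becomes a null result above. A small sketch of the underlying calls; the literal values are arbitrary:

import org.apache.spark.sql.types.Decimal

val fits = Decimal("123.456")
println(fits.changePrecision(6, 3)) // true: 123.456 fits DecimalType(6, 3)
println(fits)                       // 123.456

val overflows = Decimal("123.456")
println(overflows.changePrecision(4, 2)) // false: rounding to 123.46 needs precision 5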
Example 152
Source File: GeneratePredicate.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ object GeneratePredicate extends CodeGenerator[Expression, (InternalRow) => Boolean] { protected def canonicalize(in: Expression): Expression = ExpressionCanonicalizer.execute(in) protected def bind(in: Expression, inputSchema: Seq[Attribute]): Expression = BindReferences.bindReference(in, inputSchema) protected def create(predicate: Expression): ((InternalRow) => Boolean) = { val ctx = newCodeGenContext() val eval = predicate.gen(ctx) val code = s""" public SpecificPredicate generate($exprType[] expr) { return new SpecificPredicate(expr); } class SpecificPredicate extends ${classOf[Predicate].getName} { private final $exprType[] expressions; ${declareMutableStates(ctx)} ${declareAddedFunctions(ctx)} public SpecificPredicate($exprType[] expr) { expressions = expr; ${initMutableStates(ctx)} } public boolean eval(InternalRow ${ctx.INPUT_ROW}) { ${eval.code} return !${eval.isNull} && ${eval.value}; } }""" logDebug(s"Generated predicate '$predicate':\n${CodeFormatter.format(code)}") val p = compile(code).generate(ctx.references.toArray).asInstanceOf[Predicate] (r: InternalRow) => p.eval(r) } }
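A usage sketch for the generator above: compile the predicate input[0] > 10 into an (InternalRow) => Boolean and apply it to a couple of rows. It assumes a Spark runtime where Catalyst code generation (Janino) is available:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, GreaterThan, Literal}
import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate
import org.apache.spark.sql.types.IntegerType

val predicate = GeneratePredicate.generate(
  GreaterThan(BoundReference(0, IntegerType, nullable = false), Literal(10)))

println(predicate(InternalRow(42))) // true
println(predicate(InternalRow(7)))  // false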
Example 153
Source File: SortOrder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types._ import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.BinaryPrefixComparator import org.apache.spark.util.collection.unsafe.sort.PrefixComparators.DoublePrefixComparator abstract sealed class SortDirection case object Ascending extends SortDirection case object Descending extends SortDirection case class SortPrefix(child: SortOrder) extends UnaryExpression { override def eval(input: InternalRow): Any = throw new UnsupportedOperationException override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val childCode = child.child.gen(ctx) val input = childCode.value val BinaryPrefixCmp = classOf[BinaryPrefixComparator].getName val DoublePrefixCmp = classOf[DoublePrefixComparator].getName val (nullValue: Long, prefixCode: String) = child.child.dataType match { case BooleanType => (Long.MinValue, s"$input ? 1L : 0L") case _: IntegralType => (Long.MinValue, s"(long) $input") case DateType | TimestampType => (Long.MinValue, s"(long) $input") case FloatType | DoubleType => (DoublePrefixComparator.computePrefix(Double.NegativeInfinity), s"$DoublePrefixCmp.computePrefix((double)$input)") case StringType => (0L, s"$input.getPrefix()") case BinaryType => (0L, s"$BinaryPrefixCmp.computePrefix($input)") case dt: DecimalType if dt.precision - dt.scale <= Decimal.MAX_LONG_DIGITS => val prefix = if (dt.precision <= Decimal.MAX_LONG_DIGITS) { s"$input.toUnscaledLong()" } else { // reduce the scale to fit in a long val p = Decimal.MAX_LONG_DIGITS val s = p - (dt.precision - dt.scale) s"$input.changePrecision($p, $s) ? $input.toUnscaledLong() : ${Long.MinValue}L" } (Long.MinValue, prefix) case dt: DecimalType => (DoublePrefixComparator.computePrefix(Double.NegativeInfinity), s"$DoublePrefixCmp.computePrefix($input.toDouble())") case _ => (0L, "0L") } childCode.code + s""" |long ${ev.value} = ${nullValue}L; |boolean ${ev.isNull} = false; |if (!${childCode.isNull}) { | ${ev.value} = $prefixCode; |} """.stripMargin } override def dataType: DataType = LongType }
Example 154
Source File: RowTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericRow, GenericRowWithSchema} import org.apache.spark.sql.types._ import org.scalatest.{Matchers, FunSpec} class RowTest extends FunSpec with Matchers { val schema = StructType( StructField("col1", StringType) :: StructField("col2", StringType) :: StructField("col3", IntegerType) :: Nil) val values = Array("value1", "value2", 1) val valuesWithoutCol3 = Array[Any](null, "value2", null) val sampleRow: Row = new GenericRowWithSchema(values, schema) val sampleRowWithoutCol3: Row = new GenericRowWithSchema(valuesWithoutCol3, schema) val noSchemaRow: Row = new GenericRow(values) describe("Row (without schema)") { it("throws an exception when accessing by fieldName") { intercept[UnsupportedOperationException] { noSchemaRow.fieldIndex("col1") } intercept[UnsupportedOperationException] { noSchemaRow.getAs("col1") } } } describe("Row (with schema)") { it("fieldIndex(name) returns field index") { sampleRow.fieldIndex("col1") shouldBe 0 sampleRow.fieldIndex("col3") shouldBe 2 } it("getAs[T] retrieves a value by fieldname") { sampleRow.getAs[String]("col1") shouldBe "value1" sampleRow.getAs[Int]("col3") shouldBe 1 } it("Accessing non existent field throws an exception") { intercept[IllegalArgumentException] { sampleRow.getAs[String]("non_existent") } } it("getValuesMap() retrieves values of multiple fields as a Map(field -> value)") { val expected = Map( "col1" -> "value1", "col2" -> "value2" ) sampleRow.getValuesMap(List("col1", "col2")) shouldBe expected } it("getValuesMap() retrieves null value on non AnyVal Type") { val expected = Map( "col1" -> null, "col2" -> "value2" ) sampleRowWithoutCol3.getValuesMap[String](List("col1", "col2")) shouldBe expected } it("getAs() on type extending AnyVal throws an exception when accessing field that is null") { intercept[NullPointerException] { sampleRowWithoutCol3.getInt(sampleRowWithoutCol3.fieldIndex("col3")) } } it("getAs() on type extending AnyVal does not throw exception when value is null"){ sampleRowWithoutCol3.getAs[String](sampleRowWithoutCol3.fieldIndex("col1")) shouldBe null } } describe("row equals") { val externalRow = Row(1, 2) val externalRow2 = Row(1, 2) val internalRow = InternalRow(1, 2) val internalRow2 = InternalRow(1, 2) it("equality check for external rows") { externalRow shouldEqual externalRow2 } it("equality check for internal rows") { internalRow shouldEqual internalRow2 } } }
Example 155
Source File: GenerateUnsafeRowJoinerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
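Usage note (a minimal sketch over two tiny fixed schemas, assuming these Catalyst helpers are visible from the calling package): the external-Row-to-UnsafeRow conversion and the generated joiner can be exercised like this.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.types._

val schema1 = StructType(Seq(StructField("a", IntegerType)))
val schema2 = StructType(Seq(StructField("b", StringType)))

// External Row -> InternalRow -> UnsafeRow, the same two-step conversion the test uses.
val row1 = UnsafeProjection.create(schema1)(
  CatalystTypeConverters.createToCatalystConverter(schema1)(Row(1)).asInstanceOf[InternalRow])
val row2 = UnsafeProjection.create(schema2)(
  CatalystTypeConverters.createToCatalystConverter(schema2)(Row("x")).asInstanceOf[InternalRow])

// Concatenate the two UnsafeRows into one row laid out as schema1 ++ schema2.
val joiner = GenerateUnsafeRowJoiner.create(schema1, schema2)
val joined = joiner.join(row1, row2)
// joined.getInt(0) == 1 && joined.getUTF8String(1).toString == "x"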
Example 156
Source File: ConvertToLocalRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor class ConvertToLocalRelationSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("LocalRelation", FixedPoint(100), ConvertToLocalRelation) :: Nil } test("Project on LocalRelation should be turned into a single LocalRelation") { val testRelation = LocalRelation( LocalRelation('a.int, 'b.int).output, InternalRow(1, 2) :: InternalRow(4, 5) :: Nil) val correctAnswer = LocalRelation( LocalRelation('a1.int, 'b1.int).output, InternalRow(1, 3) :: InternalRow(4, 6) :: Nil) val projectOnLocal = testRelation.select( UnresolvedAttribute("a").as("a1"), (UnresolvedAttribute("b") + 1).as("b1")) val optimized = Optimize.execute(projectOnLocal.analyze) comparePlans(optimized, correctAnswer) } }
Example 157
Source File: RowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import java.util.NoSuchElementException

import org.apache.spark.sql.catalyst.InternalRow

abstract class RowIterator {
  // Advance to the next row; returns false once the iterator is exhausted.
  def advanceNext(): Boolean

  // The current row; only valid after advanceNext() has returned true.
  def getRow: InternalRow

  def toScala: Iterator[InternalRow] = new RowIteratorToScala(this)
}

object RowIterator {
  def fromScala(scalaIter: Iterator[InternalRow]): RowIterator = {
    scalaIter match {
      case wrappedRowIter: RowIteratorToScala => wrappedRowIter.rowIter
      case _ => new RowIteratorFromScala(scalaIter)
    }
  }
}

private final class RowIteratorToScala(val rowIter: RowIterator) extends Iterator[InternalRow] {
  private[this] var hasNextWasCalled: Boolean = false
  private[this] var _hasNext: Boolean = false
  override def hasNext: Boolean = {
    // Idempotency:
    if (!hasNextWasCalled) {
      _hasNext = rowIter.advanceNext()
      hasNextWasCalled = true
    }
    _hasNext
  }
  override def next(): InternalRow = {
    if (!hasNext) throw new NoSuchElementException
    hasNextWasCalled = false
    rowIter.getRow
  }
}

private final class RowIteratorFromScala(scalaIter: Iterator[InternalRow]) extends RowIterator {
  private[this] var _next: InternalRow = null
  override def advanceNext(): Boolean = {
    if (scalaIter.hasNext) {
      _next = scalaIter.next()
      true
    } else {
      _next = null
      false
    }
  }
  override def getRow: InternalRow = _next
  override def toScala: Iterator[InternalRow] = scalaIter
}
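Usage note (a minimal sketch): the advanceNext()/getRow protocol replaces the usual hasNext/next pair of calls per row; fromScala and toScala convert between the two styles.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RowIterator

def firstRows(iter: Iterator[InternalRow], n: Int): Seq[InternalRow] = {
  val rowIter = RowIterator.fromScala(iter)
  val out = scala.collection.mutable.ArrayBuffer.empty[InternalRow]
  while (out.size < n && rowIter.advanceNext()) {
    out += rowIter.getRow.copy()   // the underlying row may be reused, so copy before buffering
  }
  out
}
// RowIterator.fromScala(rowIter.toScala) unwraps rather than double-wrapping.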
Example 158
Source File: SeqScanNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class SeqScanNode(conf: SQLConf, output: Seq[Attribute], data: Seq[InternalRow]) extends LeafLocalNode(conf) { private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { iterator = data.iterator } override def next(): Boolean = { if (iterator.hasNext) { currentRow = iterator.next() true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = { // Do nothing } }
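Usage note (a minimal sketch; LocalNode and its subclasses are internal classes, so this assumes they are visible from the calling package): SeqScanNode and the other local nodes in the following examples all share the same open/next/fetch/close protocol.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.local.LocalNode

def collectRows(node: LocalNode): Seq[InternalRow] = {
  node.open()
  val out = scala.collection.mutable.ArrayBuffer.empty[InternalRow]
  while (node.next()) {
    out += node.fetch().copy()   // fetch() may return a reused mutable row, so copy
  }
  node.close()
  out
}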
Example 159
Source File: FilterNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.catalyst.expressions.codegen.GeneratePredicate case class FilterNode(conf: SQLConf, condition: Expression, child: LocalNode) extends UnaryLocalNode(conf) { private[this] var predicate: (InternalRow) => Boolean = _ override def output: Seq[Attribute] = child.output override def open(): Unit = { child.open() predicate = GeneratePredicate.generate(condition, child.output) } override def next(): Boolean = { var found = false while (!found && child.next()) { found = predicate.apply(child.fetch()) } found } override def fetch(): InternalRow = child.fetch() override def close(): Unit = child.close() }
Example 160
Source File: ExpandNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Projection} case class ExpandNode( conf: SQLConf, projections: Seq[Seq[Expression]], output: Seq[Attribute], child: LocalNode) extends UnaryLocalNode(conf) { assert(projections.size > 0) private[this] var result: InternalRow = _ private[this] var idx: Int = _ private[this] var input: InternalRow = _ private[this] var groups: Array[Projection] = _ override def open(): Unit = { child.open() groups = projections.map(ee => newProjection(ee, child.output)).toArray idx = groups.length } override def next(): Boolean = { if (idx >= groups.length) { if (child.next()) { input = child.fetch() idx = 0 } else { return false } } result = groups(idx)(input) idx += 1 true } override def fetch(): InternalRow = result override def close(): Unit = child.close() }
Example 161
Source File: IntersectNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import scala.collection.mutable import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class IntersectNode(conf: SQLConf, left: LocalNode, right: LocalNode) extends BinaryLocalNode(conf) { override def output: Seq[Attribute] = left.output private[this] var leftRows: mutable.HashSet[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { left.open() leftRows = mutable.HashSet[InternalRow]() while (left.next()) { leftRows += left.fetch().copy() } left.close() right.open() } override def next(): Boolean = { currentRow = null while (currentRow == null && right.next()) { currentRow = right.fetch() if (!leftRows.contains(currentRow)) { currentRow = null } } currentRow != null } override def fetch(): InternalRow = currentRow override def close(): Unit = { left.close() right.close() } }
Example 162
Source File: BinaryHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{HashedRelation, BuildLeft, BuildRight, BuildSide} case class BinaryHashJoinNode( conf: SQLConf, leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: LocalNode, right: LocalNode) extends BinaryLocalNode(conf) with HashJoinNode { protected override val (streamedNode, streamedKeys) = buildSide match { case BuildLeft => (right, rightKeys) case BuildRight => (left, leftKeys) } private val (buildNode, buildKeys) = buildSide match { case BuildLeft => (left, leftKeys) case BuildRight => (right, rightKeys) } override def output: Seq[Attribute] = left.output ++ right.output private def buildSideKeyGenerator: Projection = { // We are expecting the data types of buildKeys and streamedKeys are the same. assert(buildKeys.map(_.dataType) == streamedKeys.map(_.dataType)) UnsafeProjection.create(buildKeys, buildNode.output) } protected override def doOpen(): Unit = { buildNode.open() val hashedRelation = HashedRelation(buildNode, buildSideKeyGenerator) // We have built the HashedRelation. So, close buildNode. buildNode.close() streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation) } override def close(): Unit = { // Please note that we do not need to call the close method of our buildNode because // it has been called in this.open. streamedNode.close() } }
Example 163
Source File: HashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins._ private def fetchNextMatch(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamedNode.next()) { currentStreamedRow = streamedNode.fetch() val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashed.get(key) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } override def fetch(): InternalRow = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } resultProjection(ret) } }
Example 164
Source File: SampleNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.util.random.{BernoulliCellSampler, PoissonSampler} case class SampleNode( conf: SQLConf, lowerBound: Double, upperBound: Double, withReplacement: Boolean, seed: Long, child: LocalNode) extends UnaryLocalNode(conf) { override def output: Seq[Attribute] = child.output private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def open(): Unit = { child.open() val sampler = if (withReplacement) { // Disable gap sampling since the gap sampling method buffers two rows internally, // requiring us to copy the row, which is more expensive than the random number generator. new PoissonSampler[InternalRow](upperBound - lowerBound, useGapSamplingIfPossible = false) } else { new BernoulliCellSampler[InternalRow](lowerBound, upperBound) } sampler.setSeed(seed) iterator = sampler.sample(child.asIterator) } override def next(): Boolean = { if (iterator.hasNext) { currentRow = iterator.next() true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = child.close() }
Example 165
Source File: UnionNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute case class UnionNode(conf: SQLConf, children: Seq[LocalNode]) extends LocalNode(conf) { override def output: Seq[Attribute] = children.head.output private[this] var currentChild: LocalNode = _ private[this] var nextChildIndex: Int = _ override def open(): Unit = { currentChild = children.head currentChild.open() nextChildIndex = 1 } private def advanceToNextChild(): Boolean = { var found = false var exit = false while (!exit && !found) { if (currentChild != null) { currentChild.close() } if (nextChildIndex >= children.size) { found = false exit = true } else { currentChild = children(nextChildIndex) nextChildIndex += 1 currentChild.open() found = currentChild.next() } } found } override def close(): Unit = { if (currentChild != null) { currentChild.close() } } override def fetch(): InternalRow = currentChild.fetch() override def next(): Boolean = { if (currentChild.next()) { true } else { advanceToNextChild() } } }
Example 166
Source File: BroadcastHashJoinNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.broadcast.Broadcast import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.joins.{BuildLeft, BuildRight, BuildSide, HashedRelation} case class BroadcastHashJoinNode( conf: SQLConf, streamedKeys: Seq[Expression], streamedNode: LocalNode, buildSide: BuildSide, buildOutput: Seq[Attribute], hashedRelation: Broadcast[HashedRelation]) extends UnaryLocalNode(conf) with HashJoinNode { override val child = streamedNode // Because we do not pass in the buildNode, we take the output of buildNode to // create the inputSet properly. override def inputSet: AttributeSet = AttributeSet(child.output ++ buildOutput) override def output: Seq[Attribute] = buildSide match { case BuildRight => streamedNode.output ++ buildOutput case BuildLeft => buildOutput ++ streamedNode.output } protected override def doOpen(): Unit = { streamedNode.open() // Set the HashedRelation used by the HashJoinNode. withHashedRelation(hashedRelation.value) } override def close(): Unit = { streamedNode.close() } }
Example 167
Source File: TakeOrderedAndProjectNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.util.BoundedPriorityQueue case class TakeOrderedAndProjectNode( conf: SQLConf, limit: Int, sortOrder: Seq[SortOrder], projectList: Option[Seq[NamedExpression]], child: LocalNode) extends UnaryLocalNode(conf) { private[this] var projection: Option[Projection] = _ private[this] var ord: InterpretedOrdering = _ private[this] var iterator: Iterator[InternalRow] = _ private[this] var currentRow: InternalRow = _ override def output: Seq[Attribute] = { val projectOutput = projectList.map(_.map(_.toAttribute)) projectOutput.getOrElse(child.output) } override def open(): Unit = { child.open() projection = projectList.map(new InterpretedProjection(_, child.output)) ord = new InterpretedOrdering(sortOrder, child.output) // Priority keeps the largest elements, so let's reverse the ordering. val queue = new BoundedPriorityQueue[InternalRow](limit)(ord.reverse) while (child.next()) { queue += child.fetch() } // Close it eagerly since we don't need it. child.close() iterator = queue.toArray.sorted(ord).iterator } override def next(): Boolean = { if (iterator.hasNext) { val _currentRow = iterator.next() currentRow = projection match { case Some(p) => p(_currentRow) case None => _currentRow } true } else { false } } override def fetch(): InternalRow = currentRow override def close(): Unit = child.close() }
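Semantics note (a minimal sketch on plain Ints; BoundedPriorityQueue is package-private, so this assumes it is used from an org.apache.spark package): the top-N trick above keeps the queue's largest elements under a reversed ordering, then sorts the survivors back into the requested order.

import org.apache.spark.util.BoundedPriorityQueue

val ord = Ordering.Int
val queue = new BoundedPriorityQueue[Int](3)(ord.reverse)
Seq(9, 1, 7, 3, 5).foreach(queue += _)
val smallestThree = queue.toArray.sorted(ord)   // Array(1, 3, 5)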
Example 168
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 169
Source File: SortPrefixUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.util.collection.unsafe.sort.{PrefixComparators, PrefixComparator} object SortPrefixUtils { def createPrefixGenerator(schema: StructType): UnsafeExternalRowSorter.PrefixComputer = { if (schema.nonEmpty) { val boundReference = BoundReference(0, schema.head.dataType, nullable = true) val prefixProjection = UnsafeProjection.create( SortPrefix(SortOrder(boundReference, Ascending))) new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = { prefixProjection.apply(row).getLong(0) } } } else { new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = 0 } } } }
Example 170
Source File: LeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, Distribution, ClusteredDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") right.execute().zipPartitions(left.execute()) { (buildIter, streamIter) => if (condition.isEmpty) { val hashSet = buildKeyHashSet(buildIter, numRightRows) hashSemiJoin(streamIter, numLeftRows, hashSet, numOutputRows) } else { val hashRelation = HashedRelation(buildIter, numRightRows, rightKeyGenerator) hashSemiJoin(streamIter, numLeftRows, hashRelation, numOutputRows) } } } }
Example 171
Source File: HashSemiJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashSemiJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val left: SparkPlan val right: SparkPlan val condition: Option[Expression] override def output: Seq[Attribute] = left.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def leftKeyGenerator: Projection = UnsafeProjection.create(leftKeys, left.output) protected def rightKeyGenerator: Projection = UnsafeProjection.create(rightKeys, right.output) @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected def buildKeyHashSet( buildIter: Iterator[InternalRow], numBuildRows: LongSQLMetric): java.util.Set[InternalRow] = { val hashSet = new java.util.HashSet[InternalRow]() // Create a Hash set of buildKeys val rightKey = rightKeyGenerator while (buildIter.hasNext) { val currentRow = buildIter.next() numBuildRows += 1 val rowKey = rightKey(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey.copy()) } } } hashSet } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashSet: java.util.Set[InternalRow], numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator streamIter.filter(current => { numStreamRows += 1 val key = joinKeys(current) val r = !key.anyNull && hashSet.contains(key) if (r) numOutputRows += 1 r }) } protected def hashSemiJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { val joinKeys = leftKeyGenerator val joinedRow = new JoinedRow streamIter.filter { current => numStreamRows += 1 val key = joinKeys(current) lazy val rowBuffer = hashedRelation.get(key) val r = !key.anyNull && rowBuffer != null && rowBuffer.exists { (row: InternalRow) => boundCondition(joinedRow(current, row)) } if (r) numOutputRows += 1 r } } }
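Semantics note (a minimal sketch over plain Scala collections, not InternalRows): both hashSemiJoin variants implement left-semi semantics, i.e. each streamed (left) row is emitted at most once, and no columns from the build (right) side appear in the output.

val left = Seq(("a", 1), ("b", 2), ("c", 3))
val rightKeys = Set("a", "c")   // stands in for the build-side key set / hashed relation
val semiJoined = left.filter { case (k, _) => rightKeys.contains(k) }
// Seq(("a", 1), ("c", 3))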
Example 172
Source File: BroadcastLeftSemiJoinHash.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryNode with HashSemiJoin { override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val input = right.execute().map { row => numRightRows += 1 row.copy() }.collect() if (condition.isEmpty) { val hashSet = buildKeyHashSet(input.toIterator, SQLMetrics.nullLongMetric) val broadcastedRelation = sparkContext.broadcast(hashSet) left.execute().mapPartitionsInternal { streamIter => hashSemiJoin(streamIter, numLeftRows, broadcastedRelation.value, numOutputRows) } } else { val hashRelation = HashedRelation(input.toIterator, SQLMetrics.nullLongMetric, rightKeyGenerator, input.size) val broadcastedRelation = sparkContext.broadcast(hashRelation) left.execute().mapPartitionsInternal { streamIter => val hashedRelation = broadcastedRelation.value hashedRelation match { case unsafe: UnsafeHashedRelation => TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(unsafe.getUnsafeSize) case _ => } hashSemiJoin(streamIter, numLeftRows, hashedRelation, numOutputRows) } } } }
Example 173
Source File: LeftSemiJoinBNL.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map { row => numRightRows += 1 row.copy() }.collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { numLeftRows += 1 var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } if (matched) { numOutputRows += 1 } matched }) } } }
Example 174
Source File: HashJoin.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.metric.LongSQLMetric trait HashJoin { self: SparkPlan => val leftKeys: Seq[Expression] val rightKeys: Seq[Expression] val buildSide: BuildSide val left: SparkPlan val right: SparkPlan protected lazy val (buildPlan, streamedPlan) = buildSide match { case BuildLeft => (left, right) case BuildRight => (right, left) } protected lazy val (buildKeys, streamedKeys) = buildSide match { case BuildLeft => (leftKeys, rightKeys) case BuildRight => (rightKeys, leftKeys) } override def output: Seq[Attribute] = left.output ++ right.output override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false protected def buildSideKeyGenerator: Projection = UnsafeProjection.create(buildKeys, buildPlan.output) protected def streamSideKeyGenerator: Projection = UnsafeProjection.create(streamedKeys, streamedPlan.output) protected def hashJoin( streamIter: Iterator[InternalRow], numStreamRows: LongSQLMetric, hashedRelation: HashedRelation, numOutputRows: LongSQLMetric): Iterator[InternalRow] = { new Iterator[InternalRow] { private[this] var currentStreamedRow: InternalRow = _ private[this] var currentHashMatches: Seq[InternalRow] = _ private[this] var currentMatchPosition: Int = -1 // Mutable per row objects. private[this] val joinRow = new JoinedRow private[this] val resultProjection: (InternalRow) => InternalRow = UnsafeProjection.create(self.schema) private[this] val joinKeys = streamSideKeyGenerator override final def hasNext: Boolean = (currentMatchPosition != -1 && currentMatchPosition < currentHashMatches.size) || (streamIter.hasNext && fetchNext()) override final def next(): InternalRow = { val ret = buildSide match { case BuildRight => joinRow(currentStreamedRow, currentHashMatches(currentMatchPosition)) case BuildLeft => joinRow(currentHashMatches(currentMatchPosition), currentStreamedRow) } currentMatchPosition += 1 numOutputRows += 1 resultProjection(ret) } private final def fetchNext(): Boolean = { currentHashMatches = null currentMatchPosition = -1 while (currentHashMatches == null && streamIter.hasNext) { currentStreamedRow = streamIter.next() numStreamRows += 1 val key = joinKeys(currentStreamedRow) if (!key.anyNull) { currentHashMatches = hashedRelation.get(key) } } if (currentHashMatches == null) { false } else { currentMatchPosition = 0 true } } } } }
Example 175
Source File: CartesianProduct.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output override private[sql] lazy val metrics = Map( "numLeftRows" -> SQLMetrics.createLongMetric(sparkContext, "number of left rows"), "numRightRows" -> SQLMetrics.createLongMetric(sparkContext, "number of right rows"), "numOutputRows" -> SQLMetrics.createLongMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numLeftRows = longMetric("numLeftRows") val numRightRows = longMetric("numRightRows") val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().map { row => numLeftRows += 1 row.copy() } val rightResults = right.execute().map { row => numRightRows += 1 row.copy() } leftResults.cartesian(rightResults).mapPartitionsInternal { iter => val joinedRow = new JoinedRow iter.map { r => numOutputRows += 1 joinedRow(r._1, r._2) } } } }
Example 176
Source File: JacksonGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils} import scala.collection.Map import com.fasterxml.jackson.core._ import org.apache.spark.sql.Row import org.apache.spark.sql.types._ private[sql] object JacksonGenerator { def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = { def valWriter: (DataType, Any) => Unit = { case (_, null) | (NullType, _) => gen.writeNull() case (StringType, v) => gen.writeString(v.toString) case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString) case (IntegerType, v: Int) => gen.writeNumber(v) case (ShortType, v: Short) => gen.writeNumber(v) case (FloatType, v: Float) => gen.writeNumber(v) case (DoubleType, v: Double) => gen.writeNumber(v) case (LongType, v: Long) => gen.writeNumber(v) case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal) case (ByteType, v: Byte) => gen.writeNumber(v.toInt) case (BinaryType, v: Array[Byte]) => gen.writeBinary(v) case (BooleanType, v: Boolean) => gen.writeBoolean(v) case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString) // For UDT values, they should be in the SQL type's corresponding value type. // We should not see values in the user-defined class at here. // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is // an ArrayData at here, instead of a Vector. case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v) case (ArrayType(ty, _), v: ArrayData) => gen.writeStartArray() v.foreach(ty, (_, value) => valWriter(ty, value)) gen.writeEndArray() case (MapType(kt, vt, _), v: MapData) => gen.writeStartObject() v.foreach(kt, vt, { (k, v) => gen.writeFieldName(k.toString) valWriter(vt, v) }) gen.writeEndObject() case (StructType(ty), v: InternalRow) => gen.writeStartObject() var i = 0 while (i < ty.length) { val field = ty(i) val value = v.get(i, field.dataType) if (value != null) { gen.writeFieldName(field.name) valWriter(field.dataType, value) } i += 1 } gen.writeEndObject() case (dt, v) => sys.error( s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.") } valWriter(rowSchema, row) } }
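Usage note (a minimal sketch with a made-up schema and values; the object is private[sql], so this assumes it is called from within the org.apache.spark.sql package): one InternalRow is rendered to a JSON object field by field.

import java.io.StringWriter
import com.fasterxml.jackson.core.JsonFactory
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.json.JacksonGenerator
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
val writer = new StringWriter()
val gen = new JsonFactory().createGenerator(writer)
val row = InternalRow(UTF8String.fromString("alice"), 30)

JacksonGenerator(schema, gen)(row)
gen.flush()
writer.toString   // {"name":"alice","age":30}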
Example 177
Source File: QueryExecution.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan lazy val toRdd: RDD[InternalRow] = executedPlan.execute() protected def stringOrError[A](f: => A): String = try f.toString catch { case e: Throwable => e.toString } def simpleString: String = { s"""== Physical Plan == |${stringOrError(executedPlan)} """.stripMargin.trim } override def toString: String = { def output = analyzed.output.map(o => s"${o.name}: ${o.dataType.simpleString}").mkString(", ") s"""== Parsed Logical Plan == |${stringOrError(logical)} |== Analyzed Logical Plan == |${stringOrError(output)} |${stringOrError(analyzed)} |== Optimized Logical Plan == |${stringOrError(optimizedPlan)} |== Physical Plan == |${stringOrError(executedPlan)} """.stripMargin.trim } }
Example 178
Source File: rowFormatConverters.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.rules.Rule case class ConvertToSafe(child: SparkPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def outputOrdering: Seq[SortOrder] = child.outputOrdering override def outputsUnsafeRows: Boolean = false override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false override protected def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => val convertToSafe = FromUnsafeProjection(child.output.map(_.dataType)) iter.map(convertToSafe) } } } //private[sql] object EnsureRowFormats extends Rule[SparkPlan] { private def onlyHandlesSafeRows(operator: SparkPlan): Boolean = operator.canProcessSafeRows && !operator.canProcessUnsafeRows private def onlyHandlesUnsafeRows(operator: SparkPlan): Boolean = operator.canProcessUnsafeRows && !operator.canProcessSafeRows private def handlesBothSafeAndUnsafeRows(operator: SparkPlan): Boolean = operator.canProcessSafeRows && operator.canProcessUnsafeRows override def apply(operator: SparkPlan): SparkPlan = operator.transformUp { case operator: SparkPlan if onlyHandlesSafeRows(operator) => if (operator.children.exists(_.outputsUnsafeRows)) { operator.withNewChildren { operator.children.map { c => if (c.outputsUnsafeRows) ConvertToSafe(c) else c } } } else { operator } case operator: SparkPlan if onlyHandlesUnsafeRows(operator) => if (operator.children.exists(!_.outputsUnsafeRows)) { operator.withNewChildren { operator.children.map { c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c } } } else { operator } case operator: SparkPlan if handlesBothSafeAndUnsafeRows(operator) => if (operator.children.map(_.outputsUnsafeRows).toSet.size != 1) { // If this operator's children produce both unsafe and safe rows, // convert everything unsafe rows. operator.withNewChildren { operator.children.map { c => if (!c.outputsUnsafeRows) ConvertToUnsafe(c) else c } } } else { operator } } }
Example 179
Source File: Sort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, SparkEnv, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Distribution, OrderedDistribution, UnspecifiedDistribution} import org.apache.spark.sql.execution.metric.SQLMetrics case class Sort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan, testSpillFrequency: Int = 0) extends UnaryNode { override def outputsUnsafeRows: Boolean = true override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = false override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil override private[sql] lazy val metrics = Map( "dataSize" -> SQLMetrics.createSizeMetric(sparkContext, "data size"), "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size")) protected override def doExecute(): RDD[InternalRow] = { val schema = child.schema val childOutput = child.output val dataSize = longMetric("dataSize") val spillSize = longMetric("spillSize") child.execute().mapPartitionsInternal { iter => val ordering = newOrdering(sortOrder, childOutput) // The comparator for comparing prefix val boundSortExpression = BindReferences.bindReference(sortOrder.head, childOutput) val prefixComparator = SortPrefixUtils.getPrefixComparator(boundSortExpression) // The generator for prefix val prefixProjection = UnsafeProjection.create(Seq(SortPrefix(boundSortExpression))) val prefixComputer = new UnsafeExternalRowSorter.PrefixComputer { override def computePrefix(row: InternalRow): Long = { prefixProjection.apply(row).getLong(0) } } val pageSize = SparkEnv.get.memoryManager.pageSizeBytes val sorter = new UnsafeExternalRowSorter( schema, ordering, prefixComparator, prefixComputer, pageSize) if (testSpillFrequency > 0) { sorter.setTestSpillFrequency(testSpillFrequency) } // Remember spill data size of this task before execute this operator so that we can // figure out how many bytes we spilled for this operator. val spillSizeBefore = TaskContext.get().taskMetrics().memoryBytesSpilled val sortedIterator = sorter.sort(iter.asInstanceOf[Iterator[UnsafeRow]]) dataSize += sorter.getPeakMemoryUsage spillSize += TaskContext.get().taskMetrics().memoryBytesSpilled - spillSizeBefore TaskContext.get().internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.getPeakMemoryUsage) sortedIterator } } }
Example 180
Source File: Expand.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} case class Expand( projections: Seq[Seq[Expression]], output: Seq[Attribute], child: SparkPlan) extends UnaryNode { // The GroupExpressions can output data with arbitrary partitioning, so set it // as UNKNOWN partitioning override def outputPartitioning: Partitioning = UnknownPartitioning(0) override def outputsUnsafeRows: Boolean = child.outputsUnsafeRows override def canProcessUnsafeRows: Boolean = true override def canProcessSafeRows: Boolean = true override def references: AttributeSet = AttributeSet(projections.flatten.flatMap(_.references)) private[this] val projection = { if (outputsUnsafeRows) { (exprs: Seq[Expression]) => UnsafeProjection.create(exprs, child.output) } else { (exprs: Seq[Expression]) => newMutableProjection(exprs, child.output)() } } protected override def doExecute(): RDD[InternalRow] = attachTree(this, "execute") { child.execute().mapPartitions { iter => val groups = projections.map(projection).toArray new Iterator[InternalRow] { private[this] var result: InternalRow = _ private[this] var idx = -1 // -1 means the initial state private[this] var input: InternalRow = _ override final def hasNext: Boolean = (-1 < idx && idx < groups.length) || iter.hasNext override final def next(): InternalRow = { if (idx <= 0) { // in the initial (-1) or beginning(0) of a new input row, fetch the next input tuple input = iter.next() idx = 0 } result = groups(idx)(input) idx += 1 if (idx == groups.length && iter.hasNext) { idx = 0 } result } } } } }
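Semantics note (a minimal sketch over plain collections): Expand emits every input row once per projection, which is how GROUPING SETS / ROLLUP style plans materialize their extra grouping combinations.

val input = Seq((1, "x"), (2, "y"))
val projections = Seq[((Int, String)) => Any](
  { case (a, b) => (a, b) },      // the original grouping
  { case (a, _) => (a, null) }    // the same row with the second column nulled out
)
val expanded = input.flatMap(row => projections.map(p => p(row)))
// Seq((1,"x"), (1,null), (2,"y"), (2,null))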
Example 181
Source File: NullableColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow private[columnar] trait NullableColumnBuilder extends ColumnBuilder { protected var nulls: ByteBuffer = _ protected var nullCount: Int = _ private var pos: Int = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { nulls = ByteBuffer.allocate(1024) nulls.order(ByteOrder.nativeOrder()) pos = 0 nullCount = 0 super.initialize(initialSize, columnName, useCompression) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { columnStats.gatherStats(row, ordinal) if (row.isNullAt(ordinal)) { nulls = ColumnBuilder.ensureFreeSpace(nulls, 4) nulls.putInt(pos) nullCount += 1 } else { super.appendFrom(row, ordinal) } pos += 1 } abstract override def build(): ByteBuffer = { val nonNulls = super.build() val nullDataLen = nulls.position() nulls.limit(nullDataLen) nulls.rewind() val buffer = ByteBuffer .allocate(4 + nullDataLen + nonNulls.remaining()) .order(ByteOrder.nativeOrder()) .putInt(nullCount) .put(nulls) .put(nonNulls) buffer.rewind() buffer } protected def buildNonNulls(): ByteBuffer = { nulls.limit(nulls.position()).rewind() super.build() } }
Example 182
Source File: CompressibleColumnBuilder.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } protected def isWorthCompressing(encoder: Encoder[T]) = { encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { var i = 0 while (i < compressionEncoders.length) { compressionEncoders(i).gatherCompressibilityStats(row, ordinal) i += 1 } } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } }
Example 183
Source File: CompressionScheme.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.MutableRow import org.apache.spark.sql.execution.columnar.{ColumnType, NativeColumnType} import org.apache.spark.sql.types.AtomicType private[columnar] trait Encoder[T <: AtomicType] { def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = {} def compressedSize: Int def uncompressedSize: Int def compressionRatio: Double = { if (uncompressedSize > 0) compressedSize.toDouble / uncompressedSize else 1.0 } def compress(from: ByteBuffer, to: ByteBuffer): ByteBuffer } private[columnar] trait Decoder[T <: AtomicType] { def next(row: MutableRow, ordinal: Int): Unit def hasNext: Boolean } private[columnar] trait CompressionScheme { def typeId: Int def supports(columnType: ColumnType[_]): Boolean def encoder[T <: AtomicType](columnType: NativeColumnType[T]): Encoder[T] def decoder[T <: AtomicType](buffer: ByteBuffer, columnType: NativeColumnType[T]): Decoder[T] } private[columnar] trait WithCompressionSchemes { def schemes: Seq[CompressionScheme] } private[columnar] trait AllCompressionSchemes extends WithCompressionSchemes { override val schemes: Seq[CompressionScheme] = CompressionScheme.all } private[columnar] object CompressionScheme { val all: Seq[CompressionScheme] = Seq(PassThrough, RunLengthEncoding, DictionaryEncoding, BooleanBitSet, IntDelta, LongDelta) private val typeIdToScheme = all.map(scheme => scheme.typeId -> scheme).toMap def apply(typeId: Int): CompressionScheme = { typeIdToScheme.getOrElse(typeId, throw new UnsupportedOperationException( s"Unrecognized compression scheme type ID: $typeId")) } def columnHeaderSize(columnBuffer: ByteBuffer): Int = { val header = columnBuffer.duplicate().order(ByteOrder.nativeOrder) val nullCount = header.getInt() // null count + null positions 4 + 4 * nullCount } }
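Layout note (a minimal sketch of the buffer layout columnHeaderSize assumes): a 4-byte null count, followed by one 4-byte null position per null, followed by the (possibly compressed) column data.

import java.nio.{ByteBuffer, ByteOrder}

val buf = ByteBuffer.allocate(64).order(ByteOrder.nativeOrder)
buf.putInt(2)   // null count
buf.putInt(0)   // position of the first null
buf.putInt(3)   // position of the second null
// ... column data follows ...
buf.rewind()
// CompressionScheme.columnHeaderSize(buf) would report 4 + 4 * 2 = 12 bytes here.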
Example 184
Source File: Generate.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions._ case class Generate( generator: Generator, join: Boolean, outer: Boolean, output: Seq[Attribute], child: SparkPlan) extends UnaryNode { val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[InternalRow] = { // boundGenerator.terminate() should be triggered after all of the rows in the partition if (join) { child.execute().mapPartitionsInternal { iter => val generatorNullRow = InternalRow.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null)) val joinedRow = new JoinedRow iter.flatMap { row => // we should always set the left (child output) joinedRow.withLeft(row) val outputRows = boundGenerator.eval(row) if (outer && outputRows.isEmpty) { joinedRow.withRight(generatorNullRow) :: Nil } else { outputRows.map(or => joinedRow.withRight(or)) } } ++ LazyIterator(() => boundGenerator.terminate()).map { row => // we leave the left side as the last element of its child output // keep it the same as Hive does joinedRow.withRight(row) } } } else { child.execute().mapPartitionsInternal { iter => iter.flatMap(row => boundGenerator.eval(row)) ++ LazyIterator(() => boundGenerator.terminate()) } } } }
Example 185
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericMutableRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } //private[sql] case class PhysicalRDD( output: Seq[Attribute], rdd: RDD[InternalRow], override val nodeName: String, override val metadata: Map[String, String] = Map.empty, override val outputsUnsafeRows: Boolean = false) extends LeafNode { protected override def doExecute(): RDD[InternalRow] = rdd override def simpleString: String = { val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value" s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}" } } private[sql] object PhysicalRDD { // Metadata keys val INPUT_PATHS = "InputPaths" val PUSHED_FILTERS = "PushedFilters" def createFromDataSource( output: Seq[Attribute], rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty): PhysicalRDD = { // All HadoopFsRelations output UnsafeRows val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation] PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows) } }
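Usage note (a minimal sketch; the case class and SparkContext are illustrative assumptions, and RDDConversions is package-private, so this assumes visibility from within org.apache.spark.sql): productToRowRdd takes one Catalyst DataType per Product element, in positional order.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RDDConversions
import org.apache.spark.sql.types.{IntegerType, StringType}

case class Person(name: String, age: Int)

def personsToInternalRows(sc: SparkContext): RDD[InternalRow] = {
  val people = sc.parallelize(Seq(Person("alice", 30), Person("bob", 25)))
  RDDConversions.productToRowRdd(people, Seq(StringType, IntegerType))
}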
Example 186
Source File: CoGroupedIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Ascending, SortOrder, Attribute} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering class CoGroupedIterator( left: Iterator[(InternalRow, Iterator[InternalRow])], right: Iterator[(InternalRow, Iterator[InternalRow])], groupingSchema: Seq[Attribute]) extends Iterator[(InternalRow, Iterator[InternalRow], Iterator[InternalRow])] { private val keyOrdering = GenerateOrdering.generate(groupingSchema.map(SortOrder(_, Ascending)), groupingSchema) private var currentLeftData: (InternalRow, Iterator[InternalRow]) = _ private var currentRightData: (InternalRow, Iterator[InternalRow]) = _ override def hasNext: Boolean = { if (currentLeftData == null && left.hasNext) { currentLeftData = left.next() } if (currentRightData == null && right.hasNext) { currentRightData = right.next() } currentLeftData != null || currentRightData != null } override def next(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { assert(hasNext) if (currentLeftData.eq(null)) { // left is null, right is not null, consume the right data. rightOnly() } else if (currentRightData.eq(null)) { // left is not null, right is null, consume the left data. leftOnly() } else if (currentLeftData._1 == currentRightData._1) { // left and right have the same grouping key, consume both of them. val result = (currentLeftData._1, currentLeftData._2, currentRightData._2) currentLeftData = null currentRightData = null result } else { val compare = keyOrdering.compare(currentLeftData._1, currentRightData._1) assert(compare != 0) if (compare < 0) { // the grouping key of left is smaller, consume the left data. leftOnly() } else { // the grouping key of right is smaller, consume the right data. rightOnly() } } } private def leftOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentLeftData._1, currentLeftData._2, Iterator.empty) currentLeftData = null result } private def rightOnly(): (InternalRow, Iterator[InternalRow], Iterator[InternalRow]) = { val result = (currentRightData._1, Iterator.empty, currentRightData._2) currentRightData = null result } }
Example 187
Source File: DummyNode.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.local import org.apache.spark.sql.SQLConf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LocalRelation private[local] case class DummyNode( output: Seq[Attribute], relation: LocalRelation, conf: SQLConf) extends LocalNode(conf) { import DummyNode._ private var index: Int = CLOSED private val input: Seq[InternalRow] = relation.data def this(output: Seq[Attribute], data: Seq[Product], conf: SQLConf = new SQLConf) { this(output, LocalRelation.fromProduct(output, data), conf) } def isOpen: Boolean = index != CLOSED override def children: Seq[LocalNode] = Seq.empty override def open(): Unit = { index = -1 } override def next(): Boolean = { index += 1 index < input.size } override def fetch(): InternalRow = { assert(index >= 0 && index < input.size) input(index) } override def close(): Unit = { index = CLOSED } } private object DummyNode { val CLOSED: Int = Int.MinValue }
Example 188
Source File: ReferenceSort.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.{InternalAccumulator, TaskContext} import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} import org.apache.spark.sql.catalyst.plans.physical._ import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter case class ReferenceSort( sortOrder: Seq[SortOrder], global: Boolean, child: SparkPlan) extends UnaryNode { override def requiredChildDistribution: Seq[Distribution] = if (global) OrderedDistribution(sortOrder) :: Nil else UnspecifiedDistribution :: Nil protected override def doExecute(): RDD[InternalRow] = attachTree(this, "sort") { child.execute().mapPartitions( { iterator => val ordering = newOrdering(sortOrder, child.output) val sorter = new ExternalSorter[InternalRow, Null, InternalRow]( TaskContext.get(), ordering = Some(ordering)) sorter.insertAll(iterator.map(r => (r.copy(), null))) val baseIterator = sorter.iterator.map(_._1) val context = TaskContext.get() context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.internalMetricsToAccumulators( InternalAccumulator.PEAK_EXECUTION_MEMORY).add(sorter.peakMemoryUsedBytes) CompletionIterator[InternalRow, Iterator[InternalRow]](baseIterator, sorter.stop()) }, preservesPartitioning = true) } override def output: Seq[Attribute] = child.output override def outputOrdering: Seq[SortOrder] = sortOrder }
Example 189
Source File: ColumnarTestUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericMutableRow} import org.apache.spark.sql.catalyst.util.{GenericArrayData, ArrayBasedMapData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
Example 190
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  override def children: Seq[SparkPlan] = Nil
}

object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    try {
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 191
Source File: DDLTestSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

class DDLScanSource extends RelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    SimpleDDLScan(parameters("from").toInt, parameters("TO").toInt, parameters("Table"))(sqlContext)
  }
}

case class SimpleDDLScan(from: Int, to: Int, table: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override def schema: StructType =
    StructType(Seq(
      StructField("intType", IntegerType, nullable = false,
        new MetadataBuilder().putString("comment", s"test comment $table").build()),
      StructField("stringType", StringType, nullable = false),
      StructField("dateType", DateType, nullable = false),
      StructField("timestampType", TimestampType, nullable = false),
      StructField("doubleType", DoubleType, nullable = false),
      StructField("bigintType", LongType, nullable = false),
      StructField("tinyintType", ByteType, nullable = false),
      StructField("decimalType", DecimalType.USER_DEFAULT, nullable = false),
      StructField("fixedDecimalType", DecimalType(5, 1), nullable = false),
      StructField("binaryType", BinaryType, nullable = false),
      StructField("booleanType", BooleanType, nullable = false),
      StructField("smallIntType", ShortType, nullable = false),
      StructField("floatType", FloatType, nullable = false),
      StructField("mapType", MapType(StringType, StringType)),
      StructField("arrayType", ArrayType(StringType)),
      StructField("structType",
        StructType(StructField("f1", StringType) :: StructField("f2", IntegerType) :: Nil)
      )
    ))

  override def needConversion: Boolean = false

  override def buildScan(): RDD[Row] = {
    // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row]
    sqlContext.sparkContext.parallelize(from to to).map { e =>
      InternalRow(UTF8String.fromString(s"people$e"), e * 2)
    }.asInstanceOf[RDD[Row]]
  }
}

class DDLTestSuite extends DataSourceTest with SharedSQLContext {
  protected override lazy val sql = caseInsensitiveContext.sql _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql(
      """
        |CREATE TEMPORARY TABLE ddlPeople
        |USING org.apache.spark.sql.sources.DDLScanSource
        |OPTIONS (
        |  From '1',
        |  To '10',
        |  Table 'test1'
        |)
      """.stripMargin)
  }

  sqlTest(
    "describe ddlPeople",
    Seq(
      Row("intType", "int", "test comment test1"),
      Row("stringType", "string", ""),
      Row("dateType", "date", ""),
      Row("timestampType", "timestamp", ""),
      Row("doubleType", "double", ""),
      Row("bigintType", "bigint", ""),
      Row("tinyintType", "tinyint", ""),
      Row("decimalType", "decimal(10,0)", ""),
      Row("fixedDecimalType", "decimal(5,1)", ""),
      Row("binaryType", "binary", ""),
      Row("booleanType", "boolean", ""),
      Row("smallIntType", "smallint", ""),
      Row("floatType", "float", ""),
      Row("mapType", "map<string,string>", ""),
      Row("arrayType", "array<string>", ""),
      Row("structType", "struct<f1:string,f2:int>", "")
    ))

  test("SPARK-7686 DescribeCommand should have correct physical plan output attributes") {
    val attributes = sql("describe ddlPeople")
      .queryExecution.executedPlan.output
    assert(attributes.map(_.name) === Seq("col_name", "data_type", "comment"))
    assert(attributes.map(_.dataType).toSet === Set(StringType))
  }
}
Example 192
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy}
import org.apache.spark.unsafe.types.UTF8String

case class GIntervalRow(contigName: String, start: Int, end: Int)

class GenomicIntervalStrategy(spark: SparkSession) extends Strategy with Serializable {

  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case GenomicInterval(contigName, start, end, output) =>
      GenomicIntervalPlan(plan, spark, GIntervalRow(contigName, start, end), output) :: Nil
    case _ => Nil
  }
}

case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession,
                               interval: GIntervalRow, output: Seq[Attribute])
  extends SparkPlan with Serializable {

  def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = {
    import spark.implicits._

    lazy val genomicInterval = spark.createDataset(Seq(interval))
    genomicInterval
      .rdd
      .map(r => {
        val proj = UnsafeProjection.create(schema)
        proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName), r.start, r.end)))
      })
  }

  def children: Seq[SparkPlan] = Nil
}
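Like any extra Strategy, this rule only participates in planning once it is registered with the session's experimental methods (SeQuiLa normally does this during its own session setup). A minimal manual-registration sketch, assuming a local SparkSession:

// Sketch only: register the strategy so GenomicInterval logical nodes are planned by it.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("giSketch").getOrCreate()
spark.experimental.extraStrategies =
  new GenomicIntervalStrategy(spark) +: spark.experimental.extraStrategies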
Example 193
Source File: Pileup.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import htsjdk.samtools.SAMRecord
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.{InternalParams, TableFuncs}
import org.seqdoop.hadoop_bam.CRAMBDGInputFormat
import org.slf4j.LoggerFactory

import scala.reflect.ClassTag

class Pileup[T <: BDGAlignInputFormat](spark: SparkSession)(implicit c: ClassTag[T])
  extends BDGAlignFileReaderWriter[T] {

  val logger = LoggerFactory.getLogger(this.getClass.getCanonicalName)

  def handlePileup(tableName: String, sampleId: String, refPath: String,
                   output: Seq[Attribute]): RDD[InternalRow] = {
    logger.info("Calculating pileup on table: {}", tableName)

    lazy val allAlignments = readTableFile(name = tableName, sampleId)

    if (logger.isDebugEnabled()) logger.debug("Processing {} reads in total", allAlignments.count())

    val alignments = filterAlignments(allAlignments)

    PileupMethods.calculatePileup(alignments, spark, refPath)
  }

  private def filterAlignments(alignments: RDD[SAMRecord]): RDD[SAMRecord] = {
    // any other filtering conditions should go here
    val filterFlag = spark.conf.get(InternalParams.filterReadsByFlag, "1796").toInt
    val cleaned = alignments.filter(read => read.getContig != null && (read.getFlags & filterFlag) == 0)
    if (logger.isDebugEnabled()) logger.debug("Processing {} cleaned reads in total", cleaned.count())
    cleaned
  }

  private def readTableFile(name: String, sampleId: String): RDD[SAMRecord] = {
    val metadata = TableFuncs.getTableMetadata(spark, name)
    val path = metadata.location.toString

    val samplePathTemplate = (
      path
        .split('/')
        .dropRight(1) ++ Array(s"$sampleId*.{{fileExtension}}"))
      .mkString("/")

    metadata.provider match {
      case Some(f) =>
        if (f == InputDataType.BAMInputDataType)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "bam"), refPath = None)
        else if (f == InputDataType.CRAMInputDataType) {
          val refPath = spark.sqlContext
            .sparkContext
            .hadoopConfiguration
            .get(CRAMBDGInputFormat.REFERENCE_SOURCE_PATH_PROPERTY)
          readBAMFile(spark.sqlContext, samplePathTemplate.replace("{{fileExtension}}", "cram"), Some(refPath))
        }
        else throw new Exception("Only BAM and CRAM file formats are supported in bdg_coverage.")
      case None =>
        throw new Exception("Wrong file extension - only BAM and CRAM file formats are supported in bdg_coverage.")
    }
  }
}
Example 194
Source File: PileupStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{PileupTemplate, SparkSession, Strategy}
import org.biodatageeks.sequila.datasources.BAM.BDGAlignFileReaderWriter
import org.biodatageeks.sequila.datasources.InputDataType
import org.biodatageeks.sequila.inputformats.BDGAlignInputFormat
import org.biodatageeks.sequila.utils.TableFuncs
import org.seqdoop.hadoop_bam.{BAMBDGInputFormat, CRAMBDGInputFormat}

import scala.reflect.ClassTag

class PileupStrategy(spark: SparkSession) extends Strategy with Serializable {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
    plan match {
      case PileupTemplate(tableName, sampleId, refPath, output) =>
        val inputFormat = TableFuncs.getTableMetadata(spark, tableName).provider
        inputFormat match {
          case Some(f) =>
            if (f == InputDataType.BAMInputDataType)
              PileupPlan[BAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else if (f == InputDataType.CRAMInputDataType)
              PileupPlan[CRAMBDGInputFormat](plan, spark, tableName, sampleId, refPath, output) :: Nil
            else Nil
          case None =>
            throw new RuntimeException("Only BAM and CRAM file formats are supported in pileup function.")
        }
      case _ => Nil
    }
  }
}

case class PileupPlan[T <: BDGAlignInputFormat](plan: LogicalPlan,
                                                spark: SparkSession,
                                                tableName: String,
                                                sampleId: String,
                                                refPath: String,
                                                output: Seq[Attribute])(implicit c: ClassTag[T])
  extends SparkPlan with Serializable with BDGAlignFileReaderWriter[T] {

  override def children: Seq[SparkPlan] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    new Pileup(spark).handlePileup(tableName, sampleId, refPath, output)
  }
}
Example 195
Source File: PileupMethods.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.pileup

import htsjdk.samtools.SAMRecord
import org.apache.spark.rdd.MetricsContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.biodatageeks.sequila.pileup.model.{Reference, _}
import org.biodatageeks.sequila.pileup.timers.PileupTimers._
import org.biodatageeks.sequila.utils.InternalParams
import org.slf4j.{Logger, LoggerFactory}

import AggregateRDDOperations.implicits._
import AlignmentsRDDOperations.implicits._

// The enclosing object declaration is truncated in this excerpt; it is restored here
// based on the file name and the PileupMethods.calculatePileup call in PileupPlan above.
object PileupMethods {

  def calculatePileup(alignments: RDD[SAMRecord], spark: SparkSession, refPath: String): RDD[InternalRow] = {
    Reference.init(refPath)

    val enableInstrumentation = spark
      .sqlContext
      .getConf(InternalParams.EnableInstrumentation).toBoolean
    val alignmentsInstr = if (enableInstrumentation) alignments.instrument() else alignments

    val aggregates = ContigAggrTimer.time {
      alignmentsInstr.assembleContigAggregates()
        .persist(StorageLevel.MEMORY_AND_DISK) // FIXME: Add automatic unpersist
    }
    val accumulator = AccumulatorTimer.time { aggregates.accumulateTails(spark) }

    val broadcast = BroadcastTimer.time {
      spark.sparkContext.broadcast(accumulator.value().prepareOverlaps())
    }
    val adjustedEvents = AdjustedEventsTimer.time { aggregates.adjustWithOverlaps(broadcast) }
    val pileup = EventsToPileupTimer.time { adjustedEvents.toPileup }
    pileup
  }
}
Example 196
Source File: NCListsJoin.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{SparkPlan, _}

@DeveloperApi
case class NCListsJoin(left: SparkPlan,
                       right: SparkPlan,
                       condition: Seq[Expression],
                       context: SparkSession) extends BinaryExecNode {

  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1)),
    List(condition(2), condition(3)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v2 = right.execute()

    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      (new Interval[Int](v1Key.getInt(0), v1Key.getInt(1)), x.copy())
    })
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      (new Interval[Int](v2Key.getInt(0), v2Key.getInt(1)), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (l: InternalRow, r: InternalRow) =>
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] // resultProj(joinedRow(l, r)) joiner.joiner
      }
    } else {
      val v3 = NCListsJoinImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
        .flatMap(l => l._2.map(r => (l._1, r)))
      v3.map {
        case (r: InternalRow, l: InternalRow) =>
          val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
          joiner.join(l.asInstanceOf[UnsafeRow], r.asInstanceOf[UnsafeRow]).asInstanceOf[InternalRow] // resultProj(joinedRow(l, r)) joiner.joiner
      }
    }
  }
}
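Both branches stitch the matching row pairs together with GenerateUnsafeRowJoiner, which code-generates a joiner for a fixed pair of schemas. A tiny standalone sketch of what such a joiner does; the schemas and rows below are made up for illustration:

// Sketch only: concatenate two UnsafeRows with a generated joiner.
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.types._

val leftSchema  = StructType(Seq(StructField("start", IntegerType), StructField("end", IntegerType)))
val rightSchema = StructType(Seq(StructField("score", IntegerType)))

val leftRow  = UnsafeProjection.create(leftSchema).apply(InternalRow(100, 200))
val rightRow = UnsafeProjection.create(rightSchema).apply(InternalRow(42))

val joiner = GenerateUnsafeRowJoiner.create(leftSchema, rightSchema)
val joined = joiner.join(leftRow, rightRow)   // a 3-field UnsafeRow: start, end, score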
Example 197
Source File: NCListsJoinImpl.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.NCList

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.rdd.MetricsContext._
import org.biodatageeks.sequila.rangejoins.common.performance.timers.NCListTimer._

object NCListsJoinImpl extends Serializable {

  // The overlapJoin signature below is inferred from the call site in NCListsJoin (Example 196);
  // localIntervals and indexedRdd1 are built from rdd1 in a part of the method that is
  // truncated in this excerpt.
  def overlapJoin(
      sc: SparkContext,
      rdd1: RDD[(Interval[Int], InternalRow)],
      rdd2: RDD[(Interval[Int], InternalRow)]): RDD[(InternalRow, Iterable[InternalRow])] = {

    val nclist = NCListBuild.time { sc.broadcast(new NCListTree[Int](localIntervals)) }

    val kvrdd2: RDD[(Int, Iterable[InternalRow])] = rdd2
      .instrument()
      // join entry with the intervals returned from the interval tree
      .map(x => (NCListLookup.time { nclist.value.getAllOverlappings(x._1) }, x._2))
      .filter(x => x._1 != Nil) // filter out entries that do not join anywhere
      .flatMap(t => t._1.map(s => (s._2, t._2))) // create pairs of (index1, rdd2Elem)
      .groupByKey

    indexedRdd1 // this is RDD[(Int, (Interval[Int], Row))]
      .map(x => (x._1.toInt, x._2._2)) // convert it to (Int, Row)
      .join(kvrdd2) // join produces RDD[(Int, (Row, Iterable[Row]))]
      .map(_._2) // end up with RDD[(Row, Iterable[Row])]
  }
}
Example 198
Source File: NCListsJoinChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.NCList

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{SparkPlan, _}
import org.biodatageeks.sequila.rangejoins.NCList.{Interval, NCListsJoinImpl}

@DeveloperApi
case class NCListsJoinChromosome(left: SparkPlan,
                                 right: SparkPlan,
                                 condition: Seq[Expression],
                                 context: SparkSession) extends BinaryExecNode {

  def output = left.output ++ right.output

  lazy val (buildPlan, streamedPlan) = (left, right)

  lazy val (buildKeys, streamedKeys) = (List(condition(0), condition(1), condition(4)),
    List(condition(2), condition(3), condition(5)))

  @transient lazy val buildKeyGenerator = new InterpretedProjection(buildKeys, buildPlan.output)
  @transient lazy val streamKeyGenerator = new InterpretedProjection(streamedKeys, streamedPlan.output)

  protected override def doExecute(): RDD[InternalRow] = {
    val v1 = left.execute()
    val v2 = right.execute()

    val v1kv = v1.map(x => {
      val v1Key = buildKeyGenerator(x)
      ((v1Key.getString(2), new Interval[Int](v1Key.getInt(0), v1Key.getInt(1))), x.copy())
    })
    val v2kv = v2.map(x => {
      val v2Key = streamKeyGenerator(x)
      ((v2Key.getString(2), new Interval[Int](v2Key.getInt(0), v2Key.getInt(1))), x.copy())
    })

    if (v1.count <= v2.count) {
      val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v1kv, v2kv)
      v3.mapPartitions(p => {
        // create the generated joiner once per partition and reuse it for every row pair
        val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
        p.map(r => joiner.join(r._1.asInstanceOf[UnsafeRow], r._2.asInstanceOf[UnsafeRow]))
      })
    } else {
      val v3 = NCListsJoinChromosomeImpl.overlapJoin(context.sparkContext, v2kv, v1kv)
      v3.mapPartitions(p => {
        val joiner = GenerateUnsafeRowJoiner.create(right.schema, left.schema)
        p.map(r => joiner.join(r._2.asInstanceOf[UnsafeRow], r._1.asInstanceOf[UnsafeRow]))
      })
    }
  }
}
Example 199
Source File: NCListsJoinChromosomeImpl.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.NCList

import org.apache.spark.SparkContext
import org.apache.spark.rdd.MetricsContext._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.biodatageeks.sequila.rangejoins.NCList.{Interval, NCListTree}
import org.biodatageeks.sequila.rangejoins.common.performance.timers.IntervalTreeTimer.IntervalTreeHTSLookup
import org.biodatageeks.sequila.rangejoins.common.performance.timers.NCListTimer._
import org.biodatageeks.sequila.rangejoins.methods.IntervalTree.IntervalTreeJoinOptimChromosomeImpl.calcOverlap

import scala.collection.immutable.Stream.Empty

object NCListsJoinChromosomeImpl extends Serializable {

  // The overlapJoin signature below is inferred from the call site in NCListsJoinChromosome
  // (Example 198); localIntervals and indexedRdd1 are built from rdd1 in a part of the method
  // that is truncated in this excerpt.
  def overlapJoin(
      sc: SparkContext,
      rdd1: RDD[((String, Interval[Int]), InternalRow)],
      rdd2: RDD[((String, Interval[Int]), InternalRow)]): RDD[(InternalRow, InternalRow)] = {

    val nclist = NCListBuild.time { sc.broadcast(new NCListTreeChromosome[Long](localIntervals)) }

    val joinedRDD = rdd2
      .instrument()
      .mapPartitions(p => {
        p.map(r => {
          IntervalTreeHTSLookup.time {
            val ncl = nclist.value.getAllOverlappings(r._1)
            if (ncl != Nil) {
              ncl
                .map(k => (k, r._2))
                .toIterator
            } else Iterator.empty
          }
        })
      })
      .flatMap(r => r)

    indexedRdd1
      .map(r => (r._1, r._2._2))
      .join(joinedRDD.map(r => (r._1._2, r._2)))
      .map(r => r._2)
  }
}
Example 200
Source File: IntervalTreeHTSChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.methods.IntervalTree

import org.apache.spark.sql.catalyst.InternalRow
import org.biodatageeks.sequila.rangejoins.IntervalTree.{Interval, IntervalWithRow}

class IntervalTreeHTSChromosome[T](allRegions: Array[(String, Interval[Int], T)]) extends Serializable {

  val intervalTreeHashMap: Map[String, IntervalTreeHTS[T]] = {
    allRegions
      .groupBy(_._1)
      .map(x => {
        val it = new IntervalTreeHTS[T]()
        x._2.foreach { y =>
          it.put(y._2.start, y._2.end, y._3)
        }
        (x._1, it)
      })
  }

  def getIntervalTreeByChromosome(chr: String): Option[IntervalTreeHTS[T]] = intervalTreeHashMap.get(chr)
}
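A minimal usage sketch for the per-chromosome lookup above. The region payloads and interval bounds are made up for illustration, and the query on the returned tree is left as a comment because IntervalTreeHTS's own overlap API is not shown in this excerpt:

// Sketch only: build one interval tree per chromosome and fetch the tree for "chr1".
import org.biodatageeks.sequila.rangejoins.IntervalTree.Interval

val regions: Array[(String, Interval[Int], String)] = Array(
  ("chr1", new Interval[Int](100, 200), "geneA"),
  ("chr1", new Interval[Int](150, 300), "geneB"),
  ("chr2", new Interval[Int](50, 80), "geneC"))

val byChromosome = new IntervalTreeHTSChromosome[String](regions)

byChromosome.getIntervalTreeByChromosome("chr1") match {
  case Some(tree) => // query `tree` using IntervalTreeHTS's overlap API
  case None       => // no regions registered for this chromosome
}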