org.apache.spark.sql.types.LongType Scala Examples

The following examples show how to use org.apache.spark.sql.types.LongType. They are drawn from several open-source projects; the originating project and source file are listed in the heading above each example.
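Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the two ways LongType shows up most often in them: as a field type in an explicit StructType schema and as the target of a cast.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LongTypeQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("LongTypeQuickStart").getOrCreate()
    import spark.implicits._

    // LongType as a field type in an explicit schema
    val schema = StructType(Seq(
      StructField("id", LongType, nullable = false),
      StructField("name", StringType, nullable = true)))
    val df = spark.createDataFrame(
      spark.sparkContext.parallelize(Seq(Row(1L, "a"), Row(2L, "b"))), schema)

    // LongType as the target of a cast
    val casted = df.withColumn("id", $"id".cast(LongType))
    assert(casted.schema("id").dataType == LongType)

    spark.stop()
  }
}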
Example 1
Source File: GroupedIteratorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 2
Source File: SQLTransformerSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.apache.spark.storage.StorageLevel

class SQLTransformerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    val resultSchema = sqlTrans.transformSchema(original.schema)
    testTransformerByGlobalCheckFunc[(Int, Double, Double)](
      original,
      sqlTrans,
      "id",
      "v1",
      "v2",
      "v3",
      "v4") { rows =>
      assert(rows.head.schema.toString == resultSchema.toString)
      assert(resultSchema == expected.schema)
      assert(rows == expected.collect().toSeq)
      assert(original.sparkSession.catalog.listTables().count() == 0)
    }
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }

  test("SPARK-22538: SQLTransformer should not unpersist given dataset") {
    val df = spark.range(10).toDF()
    df.cache()
    df.count()
    assert(df.storageLevel != StorageLevel.NONE)
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => }
    assert(df.storageLevel != StorageLevel.NONE)
  }
} 
Example 3
Source File: Mean.scala    From deequ   with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{count, sum}
import org.apache.spark.sql.types.{DoubleType, StructType, LongType}
import Analyzers._

case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] {

  override def sum(other: MeanState): MeanState = {
    MeanState(sum + other.sum, count + other.count)
  }

  override def metricValue(): Double = {
    if (count == 0L) Double.NaN else sum / count
  }
}

case class Mean(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MeanState]("Mean", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) ::
      count(conditionalSelection(column, where)).cast(LongType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = {

    ifNoNullsIn(result, offset, howMany = 2) { _ =>
      MeanState(result.getDouble(offset), result.getLong(offset + 1))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
} 
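MeanState above is a typical two-part aggregation state: sum merges partial results and metricValue finishes the aggregation. A small REPL-style illustration of that contract (the numbers are made up):

import com.amazon.deequ.analyzers.MeanState

val partial1 = MeanState(sum = 10.0, count = 4L)
val partial2 = MeanState(sum = 6.0, count = 2L)
val merged = partial1.sum(partial2)   // MeanState(16.0, 6L)
merged.metricValue()                  // 16.0 / 6 ≈ 2.67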
Example 4
Source File: CustomSchemaTest.scala    From spark-sftp   with Apache License 2.0
package com.springml.spark.sftp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
import org.scalatest.{BeforeAndAfterEach, FunSuite}


class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
  var ss: SparkSession = _

  val csvTypesMap = Map("ProposalId" -> IntegerType,
    "OpportunityId" -> StringType,
    "Clicks" -> LongType,
    "Impressions" -> LongType
  )

  val jsonTypesMap = Map("name" -> StringType,
    "age" -> IntegerType
  )

  override def beforeEach() {
    ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
  }

  private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = {
    val expectedType = typeMap(field.name)
    assert(expectedType == field.dataType)
  }

  private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = {
    // Build one nullable StructField per (name, type) entry
    typeMap.map { case (name, dataType) => StructField(name, dataType, nullable = true) }.toArray
  }

  test ("Read CSV with custom schema") {
    val columnStruct = columnArray(csvTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/sample.csv").getPath
    val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap))
  }

  test ("Read Json with custom schema") {
    val columnStruct = columnArray(jsonTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/people.json").getPath
    val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap))
  }

} 
Example 5
Source File: FreqStatsTransformerSpec.scala    From pravda-ml   with Apache License 2.0
package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.FreqStatsTransformer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructType}
import org.scalatest.FlatSpec


class FreqStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {

  "FreqStatsTransformer" should "count freq" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")

    val schema = new StructType().add("data",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c"),Seq[String]("a","b","a")))
        .map(f => {Row(f)}), schema)

    val correctAns = Array[(String,Double)](("a",2D/5D),("b",2D/5D),("c",1D/5D))
    val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))})
    assertResult(correctAns)(realAns)

  }
  "FreqStatsTransformer" should "filter freq by uni and bi treshold" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")
      .setTresholdArr(Array[Double](1.5D/8D,1.1D/8D))

    val schema = new StructType().add("data",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c","c a", "c a"),Seq[String]("a","b","a", "c a", "a b")))
        .map(f => {Row(f)}), schema)

    val correctAns = Array[(String,Double)](("a",2D/8D),("b",2D/8D),("c a",2D/8D))
    val realAnsDF = fTransformer.transform(inDF).sort("Term")
    val realAns = realAnsDF.collect().map(f => (f.getAs[String]("Term"), f.getAs[Double]("Freq")))
    assertResult(correctAns)(realAns)

  }

  "FreqStatsTransformer" should "extract max timestamp by term" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")
        .setWithTimestamp(true)
        .setTimestampColumnName("timestamp")
      .setTresholdArr(Array[Double](1D/8D,1.1D/8D))

    val schema =
      new StructType().add("data",ArrayType(StringType,true)).add("timestamp",LongType)
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq(Seq[String]("a","c","c a", "c a"),100L),Seq(Seq[String]("c a", "a b"),150L),Seq(Seq[String]("b"),200L)))
        .map(f => {Row.fromSeq(f)}), schema)

    inDF.collect()
    val correctAns = Array[(String,Double,Long)](("a",1D/6D,100L),("a b",1D/6D, 150L),("b",1D/6D,200L),
      ("c",1D/6D, 100L),("c a",2D/6D, 150L))
    val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"),f.getAs[Long]("timestamp"))})
    assertResult(correctAns)(realAns)
    assertResult(correctAns(1))(realAns(1))

  }
} 
Example 6
Source File: HashBasedDeduplicatorSpec.scala    From pravda-ml   with Apache License 2.0
package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.ml.odkl.MatrixUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructType}
import org.scalatest.FlatSpec

class HashBasedDeduplicatorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {
  "cotrect HashBasedDeduplicator " should " remove similar vectors based on hash " in {

    val vectorsSize = 10000

    val vector1 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 1.0)), 1L, "vector1")
    val vector2 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 0.0)), 1L, "vector2")
    val vector3 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 0.0, 1.0)), 2L, "vector3") // quite similar, but in the 2nd bucket
    val vector4 = (Vectors.sparse(vectorsSize, Array(1, 2), Array(1.0, 1.0)), 1L, "vector4") // completely different, but in the 1st bucket

    val schema = new StructType()
      .add("vector", MatrixUtils.vectorUDT)
      .add("hash", LongType)
      .add("alias", StringType)

    val dataFrame = sqlc.createDataFrame(sc.parallelize(Seq(vector1, vector2, vector3, vector4).map(Row.fromTuple(_))), schema)
    val deduplicator = new HashBasedDeduplicator()
      .setInputColHash("hash")
      .setInputColVector("vector")
      .setSimilarityTreshold(0.80)

    val answer = deduplicator.transform(dataFrame)
      .collect().map(row => (row.getLong(1), row.getString(2)))

    assert(answer.exists(_._2 == "vector1")) //should stay
    assert(!answer.exists(_._2 == "vector2")) //should be removed
    assert(answer.exists(_._2 == "vector3")) //should stay cause in other bucket (FalseNegative)
    assert(answer.exists(_._2 == "vector4")) //should stay cause different (FalsePositive)
  }
} 
Example 7
Source File: RandomProjectionsHasher.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}


  def setDim(value: Long): this.type = set(dim, value)


  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimension = {
      if (!isSet(dim)) { // If dim is not set, read the vector size from the AttributeGroup metadata (as produced by OdklCountVectorizer)
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    // Broadcast the sparse matrix of random basis vectors used to construct the hash
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimension, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])

    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f =>  if (f>0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) {case  (acc,(v, i)) => acc | (v << i) }

    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }

} 
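The hash above is produced by packing the sign of each random projection into one bit of a Long. A REPL-style sketch of just that bit-packing step, using made-up projection values:

val projections = Array(0.7, -1.2, 0.0, 3.4)  // stands in for projectionMatrix.value.multiply(vector).values
val hash = projections
  .map(p => if (p > 0) 1L else 0L)            // one sign bit per basis vector
  .zipWithIndex
  .foldLeft(0L) { case (acc, (bit, i)) => acc | (bit << i) }
// bits 1,0,0,1 at positions 0..3 give binary 1001, i.e. hash == 9L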
Example 8
Source File: DecimalExpressionSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{LongType, DecimalType, Decimal}


class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {
  // unscaled value of a decimal
  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }
  // make a decimal from an unscaled long
  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }
  // promote precision
  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }
  // check overflow
  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
Example 9
Source File: MonotonicallyIncreasingID.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
import org.apache.spark.sql.types.{LongType, DataType}


  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initInternal(): Unit = {
    count = 0L
    partitionMask = TaskContext.getPartitionId().toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
      s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;")

    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.primitive} = $partitionMaskTerm + $countTerm;
      $countTerm++;
    """
  }
} 
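For reference, the expression above lays out each generated ID as the partition index shifted into the upper bits plus a per-partition counter in the lower 33 bits, which makes IDs unique across partitions and increasing within one. A quick arithmetic sketch with made-up values:

val partitionIndex = 2
val recordInPartition = 5L
val id = (partitionIndex.toLong << 33) + recordInPartition
// 2 * 2^33 + 5 == 17179869189L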
Example 10
Source File: ArrangePostprocessor.scala    From DataQuality   with GNU Lesser General Public License v3.0
package it.agilelab.bigdata.DataQuality.postprocessors

import com.typesafe.config.Config
import it.agilelab.bigdata.DataQuality.checks.CheckResult
import it.agilelab.bigdata.DataQuality.metrics.MetricResult
import it.agilelab.bigdata.DataQuality.sources.HdfsFile
import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig
import it.agilelab.bigdata.DataQuality.utils
import it.agilelab.bigdata.DataQuality.utils.DQSettings
import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType}
import org.apache.spark.sql.{Column, DataFrame, SQLContext}

import scala.collection.JavaConversions._

final class ArrangePostprocessor(config: Config, settings: DQSettings)
    extends BasicPostprocessor(config, settings) {

  private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) {
    def toColumn()(implicit df: DataFrame): Column = {

      val dataType: Option[NumericType with Product with Serializable] =
        tipo.getOrElse("").toUpperCase match {
          case "DOUBLE" => Some(DoubleType)
          case "INT"    => Some(IntegerType)
          case "LONG"   => Some(LongType)
          case _        => None
        }

      import org.apache.spark.sql.functions.format_number
      import org.apache.spark.sql.functions.format_string

      (dataType, precision, format) match {
        case (Some(dt), None, None) => df(name).cast(dt)
        case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name)
        case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name)
        case (None, Some(p), None) => format_number(df(name), p).alias(name)
        case (None, None, Some(f)) => format_string(f, df(name)).alias(name)
        case _ => df(name)
      }
    }
  }

  private val vs = config.getString("source")
  private val target: HdfsTargetConfig = {
    val conf = config.getConfig("saveTo")
    utils.parseTargetConfig(conf)(settings).get
  }

  private val columns: Seq[ColumnSelector] =
    config.getAnyRefList("columnOrder").map {
      case x: String => ColumnSelector(x)
      case x: java.util.HashMap[_, String] => {
        val (name, v) = x.head.asInstanceOf[String Tuple2 _]

        v match {
          case v: String =>
            ColumnSelector(name, Option(v))
          case v: java.util.HashMap[String, _] => {
            val k = v.head._1
            val f = v.head._2

            f match {
              case f: Integer =>
                ColumnSelector(name, Option(k), None, Option(f))
              case f: String =>
                ColumnSelector(name, Option(k), Option(f))
            }
          }
        }
      }
    }

  override def process(vsRef: Set[HdfsFile],
                       metRes: Seq[MetricResult],
                       chkRes: Seq[CheckResult])(
      implicit fs: FileSystem,
      sqlContext: SQLContext,
      settings: DQSettings): HdfsFile = {

    val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head
    implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head

    val arrangeDF = df.select(columns.map(_.toColumn): _*)

    HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)(
      fs,
      sqlContext.sparkContext)

    new HdfsFile(target)
  }
} 
Example 11
Source File: SchemaColumnRandom.scala    From data-faker   with MIT License
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{to_utc_timestamp, round, rand, from_unixtime, to_date}
import org.apache.spark.sql.types.{IntegerType, LongType}

trait SchemaColumnRandom[T] extends SchemaColumn

object SchemaColumnRandom {
  val FloatDP = 3
  val DoubleDP = 3

  def apply(name: String, min: Int, max: Int): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Long, max: Long): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Float, max: Float): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Double, max: Double): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Date, max: Date): SchemaColumn = SchemaColumnRandomDate(name, min, max)
  def apply(name: String, min: Timestamp, max: Timestamp): SchemaColumn = SchemaColumnRandomTimestamp(name, min, max)
  def apply(name: String): SchemaColumn = SchemaColumnRandomBoolean(name)
}

private case class SchemaColumnRandomNumeric[T: Numeric](override val name: String, min: T, max: T) extends SchemaColumnRandom[T] {
  override def column(rowID: Option[Column] = None): Column = {
    import Numeric.Implicits._

    (min, max) match {
      case (_: Int, _: Int) => round(rand() * (max - min) + min, 0).cast(IntegerType)
      case (_: Long, _: Long) => round(rand() * (max - min) + min, 0).cast(LongType)
      case (_: Float, _: Float) => round(rand() * (max - min) + min, SchemaColumnRandom.FloatDP)
      case (_: Double, _: Double) => round(rand() * (max - min) + min, SchemaColumnRandom.DoubleDP)
    }
  }
}

private case class SchemaColumnRandomTimestamp(override val name: String, min: Timestamp, max: Timestamp) extends SchemaColumnRandom[Timestamp] {
  override def column(rowID: Option[Column] = None): Column = {
    val minTime = min.getTime / 1000
    val maxTime = max.getTime / 1000
    to_utc_timestamp(from_unixtime(rand() * (maxTime - minTime) + minTime), "UTC")
  }
}

private case class SchemaColumnRandomDate(override val name: String, min: Date, max: Date) extends SchemaColumnRandom[Date] {
  val timestamp = SchemaColumnRandomTimestamp(name, new Timestamp(min.getTime), new Timestamp(max.getTime + 86400000))

  override def column(rowID: Option[Column] = None): Column = to_date(timestamp.column())
}

private case class SchemaColumnRandomBoolean(override val name: String) extends SchemaColumnRandom[Boolean] {
  override def column(rowID: Option[Column] = None): Column = rand() < 0.5f
}

object SchemaColumnRandomProtocol extends SchemaColumnRandomProtocol
trait SchemaColumnRandomProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnRandomFormat extends YamlFormat[SchemaColumnRandom[_]] {

    override def read(yaml: YamlValue): SchemaColumnRandom[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))

      if (dataType == SchemaColumnDataType.Boolean) {
        SchemaColumnRandomBoolean(name)
      }
      else {
        val min = fields.getOrElse(YamlString("min"), deserializationError(s"min not set for $name"))
        val max = fields.getOrElse(YamlString("max"), deserializationError(s"max not set for $name"))

        dataType match {
          case SchemaColumnDataType.Int => SchemaColumnRandomNumeric(name, min.convertTo[Int], max.convertTo[Int])
          case SchemaColumnDataType.Long => SchemaColumnRandomNumeric(name, min.convertTo[Long], max.convertTo[Long])
          case SchemaColumnDataType.Float => SchemaColumnRandomNumeric(name, min.convertTo[Float], max.convertTo[Float])
          case SchemaColumnDataType.Double => SchemaColumnRandomNumeric(name, min.convertTo[Double], max.convertTo[Double])
          case SchemaColumnDataType.Date => SchemaColumnRandomDate(name, min.convertTo[Date], max.convertTo[Date])
          case SchemaColumnDataType.Timestamp => SchemaColumnRandomTimestamp(name, min.convertTo[Timestamp], max.convertTo[Timestamp])
          case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Random}")
        }
      }

    }

    override def write(obj: SchemaColumnRandom[_]): YamlValue = ???

  }

} 
Example 12
Source File: MonotonicallyIncreasingID.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}


  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.addMutableState(ctx.JAVA_LONG, "count")
    val partitionMaskTerm = "partitionMask"
    ctx.addImmutableStateIfNotExists(ctx.JAVA_LONG, partitionMaskTerm)
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")

    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
} 
Example 13
Source File: MySQLDialect.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
} 
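Dialects like the one above are selected by JDBC URL prefix. If you need a similar LongType mapping for another database, a custom dialect can be registered through the public JdbcDialects API; the sketch below is hypothetical (the URL prefix and column type name are invented):

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

// Hypothetical dialect: maps a made-up "BIGCOUNTER" column type to Catalyst's LongType.
object MyLongMappingDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
    if (typeName.equalsIgnoreCase("BIGCOUNTER")) Some(LongType) else None
}

object DialectSetup {
  // Call once at application startup, before reading through the JDBC data source.
  def init(): Unit = JdbcDialects.registerDialect(MyLongMappingDialect)
}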
Example 14
Source File: ResolveInlineTablesSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
Example 15
Source File: RandomSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
} 
Example 16
Source File: DecimalExpressionSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
Example 17
Source File: MonotonicallyIncreasingID.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}


  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")

    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
} 
Example 18
Source File: SQLTransformerSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
} 
Example 19
Source File: LoadInteractionsInHive.scala    From morpheus   with Apache License 2.0
package org.opencypher.morpheus.util

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.opencypher.morpheus.api.MorpheusSession

object LoadInteractionsInHive {

  val databaseName = "customers"
  val baseTableName = s"$databaseName.csv_input"

  def load(show: Boolean = false)(implicit session: MorpheusSession): DataFrame = {

    val datafile = getClass.getResource("/customer-interactions/csv/customer-interactions.csv").toURI.getPath
    val structType = StructType(Seq(
      StructField("interactionId", LongType, nullable = false),
      StructField("date", StringType, nullable = false),
      StructField("customerIdx", LongType, nullable = false),
      StructField("empNo", LongType, nullable = false),
      StructField("empName", StringType, nullable = false),
      StructField("type", StringType, nullable = false),
      StructField("outcomeScore", StringType, nullable = false),
      StructField("accountHolderId", StringType, nullable = false),
      StructField("policyAccountNumber", StringType, nullable = false),
      StructField("customerId", StringType, nullable = false),
      StructField("customerName", StringType, nullable = false)
    ))

    val baseTable: DataFrame = session.sparkSession.read
      .format("csv")
      .option("header", "true")
      .schema(structType)
      .load(datafile)

    if (show) baseTable.show()

    session.sql(s"DROP DATABASE IF EXISTS $databaseName CASCADE")
    session.sql(s"CREATE DATABASE $databaseName")

    baseTable.write.saveAsTable(s"$baseTableName")

    // Create views for nodes
    createView(baseTableName, "interactions", true, "interactionId", "date", "type", "outcomeScore")
    createView(baseTableName, "customers", true, "customerIdx", "customerId", "customerName")
    createView(baseTableName, "account_holders", true, "accountHolderId")
    createView(baseTableName, "policies", true, "policyAccountNumber")
    createView(baseTableName, "customer_reps", true, "empNo", "empName")

    // Create views for relationships
    createView(baseTableName, "has_customer_reps", false, "interactionId", "empNo")
    createView(baseTableName, "has_customers", false, "interactionId", "customerIdx")
    createView(baseTableName, "has_policies", false, "interactionId", "policyAccountNumber")
    createView(baseTableName, "has_account_holders", false, "interactionId", "accountHolderId")

    baseTable
  }

  def createView(fromTable: String, viewName: String, distinct: Boolean, columns: String*)
    (implicit session: MorpheusSession): Unit = {
    val distinctString = if (distinct) "DISTINCT" else ""

    session.sql(
      s"""
         |CREATE VIEW $databaseName.${viewName}_SEED AS
         | SELECT $distinctString ${columns.mkString(", ")}
         | FROM $fromTable
         | WHERE date < '2017-01-01'
      """.stripMargin)

    session.sql(
      s"""
         |CREATE VIEW $databaseName.${viewName}_DELTA AS
         | SELECT $distinctString ${columns.mkString(", ")}
         | FROM $fromTable
         | WHERE date >= '2017-01-01'
      """.stripMargin)
  }

} 
Example 20
Source File: YelpHelpers.scala    From morpheus   with Apache License 2.0
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
} 
Example 21
Source File: EncodeLong.scala    From morpheus   with Apache License 2.0
package org.opencypher.morpheus.impl.expressions

import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression}
import org.apache.spark.sql.types.{BinaryType, DataType, LongType}
import org.opencypher.morpheus.api.value.MorpheusElement._


case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes {

  override val dataType: DataType = BinaryType

  override val inputTypes: Seq[LongType] = Seq(LongType)

  override protected def nullSafeEval(input: Any): Any =
    EncodeLong.encodeLong(input.asInstanceOf[Long])

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode =
    defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))")
}

object EncodeLong {

  private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2)
  private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2)
  private final val otherBitsMask = ~varLength7BitMask
  private final val maxBytesForLongVarEncoding = 10

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def encodeLong(l: Long): Array[Byte] = {
    val tempResult = new Array[Byte](maxBytesForLongVarEncoding)

    var remainder = l
    var index = 0

    while ((remainder & otherBitsMask) != 0) {
      tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte
      remainder >>>= 7
      index += 1
    }
    tempResult(index) = remainder.toByte

    val result = new Array[Byte](index + 1)
    System.arraycopy(tempResult, 0, result, 0, index + 1)
    result
  }

  // Same encoding as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding
  @inline
  final def decodeLong(input: Array[Byte]): Long = {
    assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input")
    var index = 0
    var currentByte = input(index)
    var decoded = currentByte & varLength7BitMask
    var nextLeftShift = 7

    while ((currentByte & moreBytesBitMask) != 0) {
      index += 1
      currentByte = input(index)
      decoded |= (currentByte & varLength7BitMask) << nextLeftShift
      nextLeftShift += 7
    }
    assert(index == input.length - 1,
      s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.")
    decoded
  }

  implicit class ColumnLongOps(val c: Column) extends AnyVal {

    def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name)

    def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr))

  }

} 
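A small round-trip check of the varint encoding defined above (the value 300 is arbitrary; it encodes to the two bytes 0xAC 0x02, as in the linked protobuf documentation):

import org.opencypher.morpheus.impl.expressions.EncodeLong.{decodeLong, encodeLong}

object EncodeLongRoundTrip extends App {
  val original = 300L
  val bytes = encodeLong(original)      // Array(0xAC.toByte, 0x02.toByte)
  assert(decodeLong(bytes) == original) // decodes back to 300
}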
Example 22
Source File: TemporalUdafs.scala    From morpheus   with Apache License 2.0
package org.opencypher.morpheus.impl.temporal

import org.apache.logging.log4j.scala.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{CalendarIntervalType, DataType, LongType, StructField, StructType}
import org.apache.spark.unsafe.types.CalendarInterval
import org.opencypher.okapi.impl.temporal.TemporalConstants
import org.opencypher.morpheus.impl.temporal.TemporalConversions._

object TemporalUdafs extends Logging {

  abstract class SimpleDurationAggregation(aggrName: String) extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField(aggrName, CalendarIntervalType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
    }
    override def evaluate(buffer: Row): Any = buffer.getAs[CalendarInterval](0)
  }

  class DurationSum extends SimpleDurationAggregation("sum") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
    }
  }

  class DurationMax extends SimpleDurationAggregation("max") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMaxInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (currMaxInterval.toDuration.compare(inputInterval.toDuration) >= 0) currMaxInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval1.toDuration.compare(interval2.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationMin extends SimpleDurationAggregation("min") {
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(Integer.MAX_VALUE, Long.MaxValue)
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMinInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (inputInterval.toDuration.compare(currMinInterval.toDuration) >= 0) currMinInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval2.toDuration.compare(interval1.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationAvg extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField("sum", CalendarIntervalType), StructField("cnt", LongType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
      buffer(1) = 0L
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
      buffer(1) = buffer.getLong(1) + 1
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
      buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
    }
    override def evaluate(buffer: Row): Any = {
      val sumInterval = buffer.getAs[CalendarInterval](0)
      val cnt = buffer.getLong(1)
      new CalendarInterval((sumInterval.months / cnt).toInt, sumInterval.microseconds / cnt)
    }
  }

  val durationSum = new DurationSum()
  val durationAvg = new DurationAvg()
  val durationMin = new DurationMin()
  val durationMax = new DurationMax()
} 
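The classes above implement the standard UserDefinedAggregateFunction contract, so they can be registered and invoked like any other SQL aggregate. A hedged usage sketch (the SQL function names and the events table are made up):

import org.apache.spark.sql.SparkSession
import org.opencypher.morpheus.impl.temporal.TemporalUdafs

object DurationAggExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DurationAggExample").getOrCreate()

    // Register the UDAFs under SQL names (names chosen here for illustration).
    spark.udf.register("duration_sum", TemporalUdafs.durationSum)
    spark.udf.register("duration_avg", TemporalUdafs.durationAvg)

    // Assuming a table `events` with a CalendarInterval column `duration`:
    // spark.sql("SELECT duration_sum(duration), duration_avg(duration) FROM events").show()

    spark.stop()
  }
}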
Example 23
Source File: MonotonicallyIncreasingID.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext}
import org.apache.spark.sql.types.{LongType, DataType}


  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initInternal(): Unit = {
    count = 0L
    partitionMask = TaskContext.getPartitionId().toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
      s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;")

    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;
    """
  }
} 
Example 24
Source File: SparkSequenceGeneratorIngress.scala    From pipelines-examples   with Apache License 2.0
package pipelines.example

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.streaming.OutputMode

import pipelines.streamlets._
import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import pipelines.spark.sql.SQLImplicits._

class SparkSequenceGeneratorIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.key.toString)
  val shape = StreamletShape(out)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)

    override def buildStreamingQueries = {
      writeStream(process, out, OutputMode.Append).toQueryExecution
    }

    private def process: Dataset[Data] = {
      session.readStream
        .format("rate")
        .option("rowsPerSecond", recordsPerSecond)
        .load()
        .withColumn("key", ($"value" / SequenceSettings.GroupSize).cast(LongType))
        .as[Data]
    }
  }
} 
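
Outside the Pipelines runtime, the same pattern is simply the built-in rate source plus a cast back to LongType. The sketch below is a standalone approximation; the group size of 10 and the local Data case class are illustrative assumptions.

// Hedged standalone sketch of the rate-source keying pattern above.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.LongType

case class Data(key: Long, value: Long)  // stand-in for the project's Data record

val spark = SparkSession.builder().master("local[*]").appName("rate-keys").getOrCreate()
import spark.implicits._

val keyed = spark.readStream
  .format("rate")                                      // emits (timestamp, value: Long)
  .option("rowsPerSecond", 50)
  .load()
  .withColumn("key", ($"value" / 10).cast(LongType))   // division yields a Double; the cast restores LongType
  .select($"key", $"value")
  .as[Data]

// keyed can then be consumed, e.g. keyed.writeStream.format("console").start()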
Example 25
Source File: CallRecordGeneratorIngress.scala    From pipelines-examples   with Apache License 2.0 5 votes vote down vote up
package pipelines.examples.carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import pipelines.streamlets._
import pipelines.streamlets.avro._
import pipelines.spark.sql.SQLImplicits._
import pipelines.examples.carly.data.CallRecord
import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter(
    "records-per-second",
    "Records per second to process.",
    Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key)
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?

    val MaxTime = 2.hours.toMillis
    val MaxUsers = 100000
    val TS0 = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ {
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    })

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset.where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp" as "timestamp")
      .as[CallRecord]
    sampledData
  }
} 
Example 26
Source File: SchemaWithInfo.scala    From flamy   with Apache License 2.0 5 votes vote down vote up
package com.flaminem.flamy.model.metadata

import com.flaminem.flamy.conf.FlamyContext
import com.flaminem.flamy.model.IOFormat
import com.flaminem.flamy.model.metadata.TableWithInfo.getSparkSchema
import com.flaminem.flamy.model.names.{SchemaName, TableName}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}


class SchemaWithInfo(
  override val creationTime: Option[Long],
  override val location: String,
  val name: SchemaName,
  val numTables: Option[Int],
  fileSize: Option[Long],
  fileCount: Option[Long],
  modificationTime: Option[Long]
) extends ItemWithInfo {

  def formattedNumTables: String = {
    numTables.map{_.toString}.getOrElse("")
  }

  override def getFormattedInfo(context: FlamyContext, humanReadable: Boolean): Seq[String] = {
    Seq(
      name.toString,
      formattedNumTables,
      formattedFileSize(context, humanReadable),
      formattedFileCount(context),
      formattedModificationTime(context)
    )
  }

  override def getFileSize: Option[Long] = {
    fileSize
  }

  override def getFileCount: Option[Long] = {
    fileCount
  }

  override def getModificationTime(context: FlamyContext, refresh: Boolean = false): Option[Long] = {
    modificationTime
  }

  override def toString: String = {
    name.toString
  }

}

object SchemaWithInfo {

  val getSparkSchema: StructType = {
    StructType(Seq(
      StructField("schema", StringType),
      StructField("num_tables", LongType),
      StructField("size", LongType),
      StructField("num_files", LongType),
      StructField("modification_time", LongType)
    ))
  }

  def getInfoHeader: Seq[String] = {
    getSparkSchema.fields.map{_.name}
  }

} 
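
A short sketch of how this schema might be used to build a DataFrame of schema-level statistics; the SparkSession and the sample values are illustrative assumptions, not part of flamy.

// Hedged sketch: materialising rows that conform to SchemaWithInfo.getSparkSchema,
// whose size, file-count and modification-time fields are all LongType.
import org.apache.spark.sql.{Row, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("schema-info").getOrCreate()

val rows = Seq(Row("analytics", 12L, 1048576L, 42L, 1546300800000L))  // sample values only
val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), SchemaWithInfo.getSparkSchema)

df.printSchema()  // num_tables, size, num_files and modification_time are LongType
df.show(truncate = false)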
Example 27
Source File: SQLTransformerSuite.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.ml.feature

import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.apache.spark.storage.StorageLevel

class SQLTransformerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
     val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    val resultSchema = sqlTrans.transformSchema(original.schema)
    testTransformerByGlobalCheckFunc[(Int, Double, Double)](
      original,
      sqlTrans,
      "id",
      "v1",
      "v2",
      "v3",
      "v4") { rows =>
      assert(rows.head.schema.toString == resultSchema.toString)
      assert(resultSchema == expected.schema)
      assert(rows == expected.collect().toSeq)
      assert(original.sparkSession.catalog.listTables().count() == 0)
    }
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }

  ignore("SPARK-22538: SQLTransformer should not unpersist given dataset") {
    val df = spark.range(10).toDF()
    df.cache()
    df.count()
    assert(df.storageLevel != StorageLevel.NONE)
    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
    testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => }
    assert(df.storageLevel != StorageLevel.NONE)
  }
} 
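
The transformSchema test above relies on spark.range producing a LongType id column. Below is a hedged sketch of applying the same transformer to data rather than only to the schema; it assumes the sona SQLTransformer behaves like its Spark ML counterpart.

// Hedged sketch: the SELECT id + 1 statement keeps the LongType of spark.range's "id" column.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("sql-transformer").getOrCreate()

val df = spark.range(10).toDF()
val sqlTrans = new SQLTransformer().setStatement("SELECT id + 1 AS id1 FROM __THIS__")

val out = sqlTrans.transform(df)
out.printSchema()  // id1: long (nullable = false)
out.show()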
Example 28
Source File: KCore.scala    From sona   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.sona.graph.kcore
import com.tencent.angel.sona.context.PSContext
import org.apache.spark.SparkContext
import com.tencent.angel.sona.graph.params._
import com.tencent.angel.sona.ml.Transformer
import com.tencent.angel.sona.ml.param.ParamMap
import com.tencent.angel.sona.ml.util.Identifiable
import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.storage.StorageLevel

class KCore(override val uid: String) extends Transformer
  with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol
  with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition {

  def this() = this(Identifiable.randomUID("KCore"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd
      .map(row => (row.getLong(0), row.getLong(1)))
      .filter(e => e._1 != e._2)

    edges.persist(StorageLevel.DISK_ONLY)

    val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1
    val minId = edges.map(e => math.min(e._1, e._2)).min()
    val nodes = edges.flatMap(e => Iterator(e._1, e._2))
    val numEdges = edges.count()

    println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}")

    // Start PS and init the model
    println("start to run ps")
    PSContext.getOrCreate(SparkContext.getOrCreate())

    val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition))
    var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1)))
      .groupByKey($(partitionNum))
      .mapPartitionsWithIndex((index, edgeIter) =>
        Iterator(KCoreGraphPartition.apply(index, edgeIter)))

    graph.persist($(storageLevel))
    graph.foreachPartition(_ => Unit)
    graph.foreach(_.initMsgs(model))

    var curIteration = 0
    var numMsgs = model.numMsgs()
    var prev = graph
    println(s"numMsgs=$numMsgs")

    do {
      curIteration += 1
      graph = prev.map(_.process(model, numMsgs, curIteration == 1))
      graph.persist($(storageLevel))
      graph.count()
      prev.unpersist(true)
      prev = graph
      model.resetMsgs()
      numMsgs = model.numMsgs()
      println(s"curIteration=$curIteration numMsgs=$numMsgs")
    } while (numMsgs > 0)

    val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)}
      .map(r => Row.fromSeq(Seq[Any](r._1, r._2)))

    dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema))
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(Seq(
      StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false),
      StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false)
    ))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

} 
Example 29
Source File: CallRecordGeneratorIngress.scala    From cloudflow   with Apache License 2.0 5 votes vote down vote up
package carly.aggregator

import java.sql.Timestamp

import scala.util.Random
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, SparkSession }
import org.apache.spark.sql.streaming.OutputMode

import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.LongType

import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import cloudflow.spark.sql.SQLImplicits._
import carly.data.CallRecord
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.log4j.{ Level, Logger }

case class Rate(timestamp: Timestamp, value: Long)

class CallRecordGeneratorIngress extends SparkStreamlet {

  val rootLogger = Logger.getRootLogger()
  rootLogger.setLevel(Level.ERROR)

  val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50))

  override def configParameters = Vector(RecordsPerSecond)

  val out   = AvroOutlet[CallRecord]("out", _.user)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    val recordsPerSecond = RecordsPerSecond.value
    override def buildStreamingQueries = {
      val outStream = DataGenerator.mkData(super.session, recordsPerSecond)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }
  }
}

object DataGenerator {
  def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = {
    // do we need to expose this through configuration?

    val MaxTime           = 2.hours.toMillis
    val MaxUsers          = 100000
    val TS0               = new java.sql.Timestamp(0)
    val ZeroTimestampProb = 0.05 // error rate

    // Random Data Generator
    val usersUdf     = udf(() ⇒ "user-" + Random.nextInt(MaxUsers))
    val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing")

    // Time-biased randomized filter - 1/2 hour cycles
    val sinTime: Long ⇒ Double                   = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI)
    val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob
    val timeFilterUdf                            = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng))
    val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒
      if (rng < ZeroTimestampProb) {
        TS0
      } else {
        ts
      }
    }

    val rateStream = session.readStream
      .format("rate")
      .option("rowsPerSecond", recordsPerSecond)
      .load()
      .as[Rate]

    val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand())
    val sampledData = randomDataset
      .where(timeFilterUdf($"timestamp", $"rng"))
      .withColumn("user", usersUdf())
      .withColumn("other", usersUdf())
      .withColumn("direction", directionUdf())
      .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType))
      .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng"))
      .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp"))
      .as[CallRecord]
    sampledData
  }
} 
Example 30
Source File: CassandraSink.scala    From Spark-Structured-Streaming-Examples   with Apache License 2.0 5 votes vote down vote up
package cassandra.StreamSinkProvider

import cassandra.{CassandraDriver, CassandraKafkaMetadata}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.functions.max
import spark.SparkHelper
import cassandra.CassandraDriver
import com.datastax.spark.connector._
import kafka.KafkaMetadata
import log.LazyLogger
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.types.LongType
import radio.SimpleSongAggregation


  private def saveKafkaMetaData(df: DataFrame) = {
    val kafkaMetadata = df
      .groupBy($"partition")
      .agg(max($"offset").cast(LongType).as("offset"))
      .as[KafkaMetadata]

    log.warn("Saving Kafka Metadata (partition and offset per topic (only one in our example)")
    kafkaMetadata.show()

    kafkaMetadata.rdd.saveToCassandra(CassandraDriver.namespace,
      CassandraDriver.kafkaMetadata,
      SomeColumns("partition", "offset")
    )

    //Another way to save the offset inside Cassandra:
    //kafkaMetadata.collect().foreach(CassandraKafkaMetadata.save)
  }
} 
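
The LongType usage here is the max(offset) cast before the write to Cassandra. A hedged batch-mode sketch of the same aggregation follows; the in-memory sample data is illustrative.

// Hedged sketch: keeping the highest offset per Kafka partition, cast to LongType.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.types.LongType

val spark = SparkSession.builder().master("local[*]").appName("kafka-offsets").getOrCreate()
import spark.implicits._

val offsets = Seq((0, 10L), (0, 42L), (1, 7L)).toDF("partition", "offset")

val latest = offsets
  .groupBy($"partition")
  .agg(max($"offset").cast(LongType).as("offset"))

latest.show()  // one row per partition holding its maximum offset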
Example 31
Source File: GroupedIteratorSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{LongType, StringType, IntegerType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema)
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema)

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema)
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 32
Source File: MySQLDialect.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, LongType, DataType, MetadataBuilder}


private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }
} 
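
The BIT-to-LongType mapping can be exercised directly. The sketch below is illustrative only: MySQLDialect is private to the org.apache.spark.sql.jdbc package, so it assumes code compiled in that package (or an equivalent custom dialect of your own).

// Hedged sketch: multi-bit MySQL BIT columns are reported to Spark as LongType.
import java.sql.Types
import org.apache.spark.sql.types.{LongType, MetadataBuilder}

val md = new MetadataBuilder()
val mapped = MySQLDialect.getCatalystType(Types.VARBINARY, "BIT", size = 8, md)

assert(mapped == Some(LongType))                  // BIT(n) with n > 1 surfaces as LongType
assert(md.build().getLong("binarylong") == 1L)    // flag recorded for the JDBC reader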
Example 33
Source File: DecimalExpressionSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{LongType, DecimalType, Decimal}


class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
Example 34
Source File: EdgeListDataSource.scala    From morpheus   with Apache License 2.0 5 votes vote down vote up
package org.opencypher.morpheus.api.io.edgelist

import org.apache.spark.sql.functions
import org.apache.spark.sql.types.{LongType, StructField, StructType}
import org.opencypher.morpheus.api.MorpheusSession
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.api.io.edgelist.EdgeListDataSource._
import org.opencypher.morpheus.api.io.{MorpheusNodeTable, MorpheusRelationshipTable}
import org.opencypher.morpheus.schema.MorpheusSchema
import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph}
import org.opencypher.okapi.api.io.PropertyGraphDataSource
import org.opencypher.okapi.api.schema.{PropertyGraphSchema, PropertyKeys}
import org.opencypher.okapi.impl.exception.UnsupportedOperationException

object EdgeListDataSource {

  val NODE_LABEL = "V"

  val REL_TYPE = "E"

  val GRAPH_NAME = GraphName("graph")

  val SCHEMA: PropertyGraphSchema = MorpheusSchema.empty
    .withNodePropertyKeys(Set(NODE_LABEL), PropertyKeys.empty)
    .withRelationshipPropertyKeys(REL_TYPE, PropertyKeys.empty)
}


case class EdgeListDataSource(path: String, options: Map[String, String] = Map.empty)(implicit morpheus: MorpheusSession)
  extends PropertyGraphDataSource {

  override def hasGraph(name: GraphName): Boolean = name == GRAPH_NAME

  override def graph(name: GraphName): PropertyGraph = {
    val reader = options.foldLeft(morpheus.sparkSession.read) {
      case (current, (key, value)) => current.option(key, value)
    }

    val rawRels = reader
      .schema(StructType(Seq(
        StructField(sourceStartNodeKey, LongType),
        StructField(sourceEndNodeKey, LongType))))
      .csv(path)
      .withColumn(sourceIdKey, functions.monotonically_increasing_id())
      .select(sourceIdKey, sourceStartNodeKey, sourceEndNodeKey)

    val rawNodes = rawRels
      .select(rawRels.col(sourceStartNodeKey).as(sourceIdKey))
      .union(rawRels.select(rawRels.col(sourceEndNodeKey).as(sourceIdKey)))
      .distinct()

    morpheus.graphs.create(MorpheusNodeTable(Set(NODE_LABEL), rawNodes), MorpheusRelationshipTable(REL_TYPE, rawRels))
  }

  override def schema(name: GraphName): Option[PropertyGraphSchema] = Some(SCHEMA)

  override def store(name: GraphName, graph: PropertyGraph): Unit =
    throw UnsupportedOperationException("Storing an edge list is not supported")

  override def delete(name: GraphName): Unit =
    throw UnsupportedOperationException("Deleting an edge list is not supported")

  override val graphNames: Set[GraphName] = Set(GRAPH_NAME)
} 
Example 35
Source File: HashSetManager.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package edu.ucla.cs.wis.bigdatalog.spark.storage

import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo
import edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset._
import org.apache.spark.TaskContext
import org.apache.spark.sql.types.{IntegerType, LongType}

object HashSetManager {
  def determineKeyType(schemaInfo: SchemaInfo): Int = {
    schemaInfo.arity match {
      case 1 => {
        schemaInfo.schema(0).dataType match {
          case IntegerType => 1
          case LongType => 2
          case other => 3
        }
      }
      case 2 => {
        val bytesPerKey = schemaInfo.schema.map(_.dataType.defaultSize).sum
        if (bytesPerKey == 8) 2 else 3
      }
      case other => 3
    }
  }

  def create(schemaInfo: SchemaInfo): HashSet = {
    determineKeyType(schemaInfo) match {
      case 1 => new IntKeysHashSet()
      case 2 => new LongKeysHashSet(schemaInfo)
      
      case _ => new ObjectHashSet()
    }
  }
} 
Example 36
Source File: GroupBitwiseOr.scala    From mimir   with Apache License 2.0 5 votes vote down vote up
package mimir.exec.spark.udf

import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseOr }

case class GroupBitwiseOr(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate {
  override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil
  override def nullable: Boolean = false
  // Return data type.
  override def dataType: DataType = LongType
  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_or")
  private lazy val group_bitwise_or = AttributeReference("group_bitwise_or", LongType)()
  override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_or :: Nil
  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(0, LongType)
  )
  override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq(
    BitwiseOr(group_bitwise_or, child)
  )
  override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = {
    Seq(
      BitwiseOr(group_bitwise_or.left, group_bitwise_or.right)
    )
  }
  override lazy val evaluateExpression: AttributeReference = group_bitwise_or
} 
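
Because GroupBitwiseOr is a Catalyst DeclarativeAggregate rather than a registered SQL function, using it from the DataFrame API means wrapping it in a Column by hand. The sketch below is an assumption about how such an aggregate can be invoked through internal Catalyst APIs; it is not code from mimir.

// Hedged sketch: wrapping the Catalyst aggregate in a Column and applying it to a LongType column.
import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col

val spark = SparkSession.builder().master("local[*]").appName("group-bitwise-or").getOrCreate()
import spark.implicits._

val flags = Seq(1L, 2L, 4L).toDF("flag")

val groupOr = new Column(GroupBitwiseOr(col("flag").expr).toAggregateExpression())
flags.agg(groupOr.as("flags_or")).show()  // expected single value: 7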
Example 37
Source File: GroupBitwiseAnd.scala    From mimir   with Apache License 2.0 5 votes vote down vote up
package mimir.exec.spark.udf

import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseAnd }

case class GroupBitwiseAnd(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate {
  override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil
  override def nullable: Boolean = false
  // Return data type.
  override def dataType: DataType = LongType
  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_and")
  private lazy val group_bitwise_and = AttributeReference("group_bitwise_and", LongType)()
  override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_and :: Nil
  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create(0xffffffffffffffffL, LongType)
  )
  override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq(
    BitwiseAnd(group_bitwise_and, child)
  )
  override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = {
    Seq(
      BitwiseAnd(group_bitwise_and.left, group_bitwise_and.right)
    )
  }
  override lazy val evaluateExpression: AttributeReference = group_bitwise_and
} 
Example 38
Source File: MimirSparkRuntimeUtils.scala    From mimir   with Apache License 2.0 5 votes vote down vote up
package mimir.exec.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{
  spark_partition_id,
  monotonically_increasing_id,
  count,
  sum,
  first,
  lit,
  col
}

object MimirSparkRuntimeUtils
{
  def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType:DataType = LongType): DataFrame = {
    val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id())

    val partitionOffsets = dfWithPartitionId
        .groupBy("partition_id")
        .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id")
        .orderBy("partition_id")
        .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt" )
        .collect()
        .map(row => (row.getInt(0), row.getLong(1)))
        .toMap

     val theUdf = org.apache.spark.sql.functions.udf(
       (partitionId: Int) => partitionOffsets(partitionId), 
       LongType
     )
     
     dfWithPartitionId
        .withColumn("partition_offset", theUdf(col("partition_id")))
        .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType))
        .drop("partition_id", "partition_offset", "inc_id")
  }

  def writeDataSink(dataframe:DataFrame, format:String, options:Map[String, String], save:Option[String]) = {
    val dsFormat = dataframe.write.format(format) 
    val dsOptions = options.toSeq.foldLeft(dsFormat)( (ds, opt) => opt._1 match { 
      case "mode" => ds.mode(opt._2) 
      case _ => ds.option(opt._1, opt._2)
      })
    save match {
      case None => dsOptions.save
      case Some(outputFile) => {
        if(format.equals("com.github.potix2.spark.google.spreadsheets")){
          val gsldfparts = outputFile.split("\\/") 
          val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}"
          dsOptions.save(gsldf)
        }
        else{
          dsOptions.save(outputFile)
        }
      }
    }
  }
} 
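
A short usage sketch of zipWithIndex follows; it assumes MimirSparkRuntimeUtils is on the classpath and uses toy data. With the defaults above, the new column is named ROWIDX, is LongType, and starts at 1.

// Hedged sketch: assigning a stable LongType row index to an arbitrary DataFrame.
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("zip-with-index").getOrCreate()
import spark.implicits._

val letters = Seq("a", "b", "c").toDF("letter")
val indexed = MimirSparkRuntimeUtils.zipWithIndex(letters)  // adds ROWIDX: LongType, starting at 1

indexed.printSchema()
indexed.show()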
Example 39
Source File: GroupedDatasetSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier
import org.apache.spark.sql.execution.python.PythonUDF
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class GroupedDatasetSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  private val scalaUDF = udf((x: Long) => { x + 1 })
  private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s"))

  private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = {
    assert(atLevel >= 0)
    var children = Seq(ds.queryExecution.logical)
    (1 to atLevel).foreach { _ =>
      children = children.flatMap(_.children)
    }
    val barriers = children.collect {
      case ab: AnalysisBarrier => ab
    }
    assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" +
      ds.queryExecution.logical)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") {
    val groupByDataset = datasetWithUDF.groupBy()
    val rollupDataset = datasetWithUDF.rollup("s")
    val cubeDataset = datasetWithUDF.cube("s")
    val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2))
    datasetWithUDF.cache()
    Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS =>
      val df = rgDS.count()
      assertContainsAnalysisBarrier(df)
      assertCached(df)
    }

    val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR(
      Array.emptyByteArray,
      Array.emptyByteArray,
      Array.empty,
      StructType(Seq(StructField("s", LongType))))
    val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF(
      "pyUDF",
      null,
      StructType(Seq(StructField("s", LongType))),
      Seq.empty,
      PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
      true))
    Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df =>
      assertContainsAnalysisBarrier(df, 2)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") {
    val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0))
    datasetWithUDF.cache()
    val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _)
    val keysKVDataset = kvDasaset.keys
    val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k))
    val aggKVDataset = kvDasaset.count()
    val otherKVDataset = spark.range(1).groupByKey(_ + 1)
    val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k))
    Seq((mapValuesKVDataset, 1),
        (keysKVDataset, 2),
        (flatMapGroupsKVDataset, 2),
        (aggKVDataset, 1),
        (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) =>
      assertContainsAnalysisBarrier(df, analysisBarrierDepth)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }
} 
Example 40
Source File: GroupedIteratorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 41
Source File: MySQLDialect.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
} 
Example 42
Source File: ResolveInlineTablesSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}


class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
Example 43
Source File: RandomSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
} 
Example 44
Source File: DecimalExpressionSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
Example 45
Source File: TestMetadataConstructor.scala    From spark-salesforce   with Apache License 2.0 5 votes vote down vote up
package com.springml.spark.salesforce.metadata

import org.apache.spark.sql.types.{StructType, StringType, IntegerType, LongType,
  FloatType, DateType, TimestampType, BooleanType, StructField}
import org.scalatest.FunSuite
import com.springml.spark.salesforce.Utils


class TestMetadataConstructor extends FunSuite {

  test("Test Metadata generation") {
    val columnNames = List("c1", "c2", "c3", "c4")
    val columnStruct = columnNames.map(colName => StructField(colName, StringType, true))
    val schema = StructType(columnStruct)

    val schemaString = MetadataConstructor.generateMetaString(schema,"sampleDataSet", Utils.metadataConfig(null))
    assert(schemaString.length > 0)
    assert(schemaString.contains("sampleDataSet"))
  }

  test("Test Metadata generation With Custom MetadataConfig") {
    val columnNames = List("c1", "c2", "c3", "c4")
    val intField = StructField("intCol", IntegerType, true)
    val longField = StructField("longCol", LongType, true)
    val floatField = StructField("floatCol", FloatType, true)
    val dateField = StructField("dateCol", DateType, true)
    val timestampField = StructField("timestampCol", TimestampType, true)
    val stringField = StructField("stringCol", StringType, true)
    val someTypeField = StructField("someTypeCol", BooleanType, true)

    val columnStruct = Array[StructField] (intField, longField, floatField, dateField, timestampField, stringField, someTypeField)

    val schema = StructType(columnStruct)

    var metadataConfig = Map("string" -> Map("wave_type" -> "Text"))
    metadataConfig += ("integer" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "0", "defaultValue" -> "100"))
    metadataConfig += ("float" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "2"))
    metadataConfig += ("long" -> Map("wave_type" -> "Numeric", "precision" -> "18", "scale" -> "0"))
    metadataConfig += ("date" -> Map("wave_type" -> "Date", "format" -> "yyyy/MM/dd"))
    metadataConfig += ("timestamp" -> Map("wave_type" -> "Date", "format" -> "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))


    val schemaString = MetadataConstructor.generateMetaString(schema, "sampleDataSet", metadataConfig)
    assert(schemaString.length > 0)
    assert(schemaString.contains("sampleDataSet"))
    assert(schemaString.contains("Numeric"))
    assert(schemaString.contains("precision"))
    assert(schemaString.contains("scale"))
    assert(schemaString.contains("18"))
    assert(schemaString.contains("Text"))
    assert(schemaString.contains("Date"))
    assert(schemaString.contains("format"))
    assert(schemaString.contains("defaultValue"))
    assert(schemaString.contains("100"))
    assert(schemaString.contains("yyyy/MM/dd"))
    assert(schemaString.contains("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
  }
} 
Example 46
Source File: MonotonicallyIncreasingID.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType}


// Stateful expression producing monotonically increasing 64-bit IDs: the partition index
// occupies the bits above bit 33 and a per-partition counter fills the lower bits.
case class MonotonicallyIncreasingID() extends LeafExpression with Stateful {
  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.addMutableState(CodeGenerator.JAVA_LONG, "count")
    val partitionMaskTerm = "partitionMask"
    ctx.addImmutableStateIfNotExists(CodeGenerator.JAVA_LONG, partitionMaskTerm)
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")

    ev.copy(code = code"""
      final ${CodeGenerator.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = FalseLiteral)
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"

  override def freshCopy(): MonotonicallyIncreasingID = MonotonicallyIncreasingID()
} 
Example 47
Source File: inputFileBlock.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.rdd.InputFileBlockHolder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String


@ExpressionDescription(
  usage = "_FUNC_() - Returns the name of the file being read, or empty string if not available.")
case class InputFileName() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = false

  override def dataType: DataType = StringType

  override def prettyName: String = "input_file_name"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): UTF8String = {
    InputFileBlockHolder.getInputFilePath
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getInputFilePath();",
      isNull = FalseLiteral)
  }
}


@ExpressionDescription(
  usage = "_FUNC_() - Returns the start offset of the block being read, or -1 if not available.")
case class InputFileBlockStart() extends LeafExpression with Nondeterministic {
  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_start"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getStartOffset
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getStartOffset();", isNull = FalseLiteral)
  }
}


@ExpressionDescription(
  usage = "_FUNC_() - Returns the length of the block being read, or -1 if not available.")
case class InputFileBlockLength() extends LeafExpression with Nondeterministic {
  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_length"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getLength
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getLength();", isNull = FalseLiteral)
  }
} 
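
These expressions are exposed in SQL as input_file_name(), input_file_block_start() and input_file_block_length(); the block offset and length are LongType. A hedged sketch follows; the file path is illustrative.

// Hedged sketch: reading a text file and projecting the file/block metadata columns.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{expr, input_file_name}

val spark = SparkSession.builder().master("local[*]").appName("input-file-block").getOrCreate()

val lines = spark.read.text("/tmp/example.txt")  // illustrative path

lines.select(
  input_file_name().as("file"),
  expr("input_file_block_start()").as("block_start"),    // LongType, -1 if unavailable
  expr("input_file_block_length()").as("block_length")   // LongType, -1 if unavailable
).show(truncate = false)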
Example 48
Source File: TiSparkTypeSuite.scala    From tispark   with Apache License 2.0 5 votes vote down vote up
package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

class TiSparkTypeSuite extends BaseDataSourceTest("type_test") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2L, "TiDB")
  private val row3 = Row(3L, "Spark")
  private val row5 = Row(Long.MaxValue, "Duplicate")

  private val schema = StructType(List(StructField("i", LongType), StructField("s", StringType)))
  test("bigint test") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i bigint, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")

    tidbWrite(List(row3, row5), schema)
    testTiDBSelect(List(row1, row2, row3, row5))
  }
} 
Example 49
Source File: SummarizeIntervalsSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.types.{ DoubleType, LongType, IntegerType }

class SummarizeIntervalsSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite {

  override val defaultResourceDir: String = "/timeseries/summarizeintervals"

  "SummarizeInterval" should "pass `SummarizeSingleColumn` test." in {
    withAllTimeType {
      val volumeTSRdd = fromCSV(
        "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType)
      )

      volumeTSRdd.toDF.show()

      val clockTSRdd = fromCSV("Clock.csv", Schema())
      val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType))

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume"))
        summarizedVolumeTSRdd.toDF.show()
        assert(summarizedVolumeTSRdd.collect().deep == resultTSRdd.collect().deep)
      }

      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in {
    withAllTimeType {
      val volumeTSRdd = fromCSV(
        "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType)
      )

      val clockTSRdd = fromCSV("Clock.csv", Schema())
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerKey.results",
        Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
      )

      val result2TSRdd = fromCSV(
        "SummarizeV2PerKey.results",
        Schema("id" -> IntegerType, "v2_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume"), Seq("id"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
        val summarizedV2TSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("v2"), Seq("id"))
        assertEquals(summarizedV2TSRdd, result2TSRdd)
      }

      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in {
    withAllTimeType {
      val volumeTSRdd = fromCSV(
        "VolumeWithIndustryGroup.csv",
        Schema("id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType)
      )

      val clockTSRdd = fromCSV("Clock.csv", Schema())
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerSeqOfKeys.results",
        Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeIntervals(
          clockTSRdd,
          Summarizers.sum("volume"),
          Seq("id", "group")
        )
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }
} 
Example 50
Source File: QuantileSummarizerSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable

import com.twosigma.flint.rdd.function.summarize.summarizer.subtractable.SequentialArrayQueue
import com.twosigma.flint.timeseries.summarize.SummarizerSuite
import com.twosigma.flint.timeseries.{ Clocks, Summarizers, TimeSeriesRDD }
import org.apache.commons.math3.stat.descriptive.rank.Percentile
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.LongType

class QuantileSummarizerSpec extends SummarizerSuite {
  var clockTSRdd: TimeSeriesRDD = _
  private lazy val init = {
    clockTSRdd = Clocks.uniform(
      sc,
      frequency = "1d", offset = "0d", beginDateTime = "1970-01-01", endDateTime = "1980-01-01"
    )
  }

  "SequentialArrayQueue" should "resize up correctly" in {
    val queue = new SequentialArrayQueue[Double]()
    (1 to 32).map{
      i => queue.add(i.toDouble)
    }
    assert(queue.view()._3.length == 32)
    queue.add(0.0)
    assert(queue.view()._3.length == 64)
  }

  it should "shift down correctly" in {
    val queue = new SequentialArrayQueue[Double]()
    (1 to 64).map{
      i => queue.add(i.toDouble)
    }
    assert(queue.view()._3.length == 64)
    (1 to 32).map{
      _ => queue.remove()
    }
    assert(queue.view()._1 == 0)
  }

  it should "addAll and preserve order" in {
    val queue1 = new SequentialArrayQueue[Double]()
    val queue2 = new SequentialArrayQueue[Double]()

    // Move the begin index
    (1 to 5).map{
      i =>
        queue1.add(i.toDouble)
        queue1.remove()
    }
    (1 to 3).map{
      i => queue1.add(i.toDouble)
    }

    (4 to 10).map{
      i => queue2.add(i.toDouble)
    }
    queue1.addAll(queue2)
    var index = queue1.view()._1
    for (i <- 1 to 10) {
      assert(queue1.view()._3(index) == i)
      index += 1
    }
  }

  "QuantileSummarizer" should "compute `quantile` correctly" in {
    init
    val p = (1 to 100).map(_ / 100.0)
    val results = clockTSRdd.summarize(Summarizers.quantile("time", p)).first()

    val percentileEstimator = new Percentile().withEstimationType(Percentile.EstimationType.R_7)
    percentileEstimator.setData(clockTSRdd.collect().map(_.getAs[Long]("time").toDouble))
    val expectedResults = p.map { i => percentileEstimator.evaluate(i * 100.0) }
    (1 to 100).foreach { i => assert(results.getAs[Double](s"time_${i / 100.0}quantile") === expectedResults(i - 1)) }
  }

  it should "ignore null values" in {
    init
    val input = clockTSRdd.addColumns("v" -> LongType -> { row: Row => row.getAs[Long]("time") })
    assertEquals(
      input.summarize(Summarizers.quantile("v", Seq(0.25, 0.5, 0.75, 0.9, 0.95))),
      insertNullRows(input, "v").summarize(Summarizers.quantile("v", Seq(0.25, 0.5, 0.75, 0.9, 0.95)))
    )
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.quantile("x1", Seq(0.25, 0.5, 0.75, 0.9, 0.95)))
  }
} 
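The expected values in the `quantile` test above come from commons-math3's R-7 percentile estimator. A minimal sketch of that cross-check on a plain array, with illustrative values that are not taken from the spec:

import org.apache.commons.math3.stat.descriptive.rank.Percentile

object R7Check {
  def main(args: Array[String]): Unit = {
    // Same estimator configuration the spec uses to build its expected results.
    val est = new Percentile().withEstimationType(Percentile.EstimationType.R_7)
    est.setData(Array(1.0, 2.0, 3.0, 4.0, 5.0))
    // evaluate() takes a percentile in (0, 100]; 50.0 corresponds to the 0.5 quantile.
    println(est.evaluate(50.0)) // 3.0 for this input
  }
}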
Example 51
Source File: ExtremeSummarizerSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random

import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue
    )

    val data = priceTSRdd.collect().map{ row => row.getAs[T](inputColumn) }

    val trueExtreme = data.reduceLeft[T]{ case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")
  }

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")
  }

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.max("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.min("x2"))
  }

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")

    assertEquals(
      input.summarize(Summarizers.min("price")),
      inputWithNull.summarize(Summarizers.min("price"))
    )
  }
} 
Example 52
Source File: SummarizeCyclesSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.types.{ DoubleType, IntegerType, LongType }

class SummarizeCyclesSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite {

  override val defaultResourceDir: String = "/timeseries/summarizecycles"
  private val volumeSchema = Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType)
  private val volume2Schema = Schema("id" -> IntegerType, "volume" -> LongType)
  private val volumeWithGroupSchema = Schema(
    "id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType
  )

  "SummarizeCycles" should "pass `SummarizeSingleColumn` test." in {
    withAllTimeType {
      val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType))

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("Volume.csv", volumeSchema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in {
    withAllTimeType {
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerKey.results",
        Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("Volume2.csv", volume2Schema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in {
    withAllTimeType {
      val resultTSRdd = fromCSV(
        "SummarizeSingleColumnPerSeqOfKeys.results",
        Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType)
      )

      def test(rdd: TimeSeriesRDD): Unit = {
        val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id", "group"))
        assertEquals(summarizedVolumeTSRdd, resultTSRdd)
      }

      val volumeTSRdd = fromCSV("VolumeWithIndustryGroup.csv", volumeWithGroupSchema)
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }

  it should "pass generated cycle data test" in {
    // TODO: The way cycleData works now doesn't support changing time type.
    val testData = cycleData1

    def sum(rdd: TimeSeriesRDD): TimeSeriesRDD = {
      rdd.summarizeCycles(Summarizers.compose(Summarizers.count(), Summarizers.sum("v1")))
    }

    withPartitionStrategyCompare(testData)(DEFAULT)(sum)
  }
} 
Example 53
Source File: SummarizeSpec.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package com.twosigma.flint.timeseries

import com.twosigma.flint.timeseries.row.Schema
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ LongType, IntegerType, DoubleType }

class SummarizeSpec extends MultiPartitionSuite {

  override val defaultResourceDir: String = "/timeseries/summarize"

  it should "`summarize` correctly" in {
    val expectedSchema = Schema("volume_sum" -> DoubleType)
    val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 7800.0), expectedSchema))

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"))
      assert(results.schema == expectedSchema)
      assert(results.collect().deep == expectedResults.deep)
    }

    {
      val volumeRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeRdd)(DEFAULT)(test)
    }

  }

  it should "`summarize` per key correctly" in {
    val expectedSchema = Schema("id" -> IntegerType, "volume_sum" -> DoubleType)
    val expectedResults = Array[Row](
      new GenericRowWithSchema(Array(0L, 7, 4100.0), expectedSchema),
      new GenericRowWithSchema(Array(0L, 3, 3700.0), expectedSchema)
    )

    def test(rdd: TimeSeriesRDD): Unit = {
      val results = rdd.summarize(Summarizers.sum("volume"), Seq("id"))
      assert(results.schema == expectedSchema)
      assert(results.collect().sortBy(_.getAs[Int]("id")).deep == expectedResults.sortBy(_.getAs[Int]("id")).deep)
    }

    {
      val volumeTSRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType))
      withPartitionStrategy(volumeTSRdd)(DEFAULT)(test)
    }
  }
} 
Example 54
Source File: TimestampCast.scala    From flint   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block }
import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression }
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{ DataType, LongType, TimestampType }

case class TimestampToNanos(child: Expression) extends TimestampCast {
  val dataType: DataType = LongType
  protected def cast(childPrim: String): String =
    s"$childPrim * 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] * 1000L
}

case class NanosToTimestamp(child: Expression) extends TimestampCast {
  val dataType: DataType = TimestampType
  protected def cast(childPrim: String): String =
    s"$childPrim / 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] / 1000L
}

trait TimestampCast extends UnaryExpression with NullIntolerant {

  // Implemented by the two case classes above: LongType nanoseconds on one side of the
  // cast, TimestampType on the other.
  val dataType: DataType

  protected def cast(childPrim: String): String

  private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String,
    resultPrim: String, resultNull: String, resultType: DataType): Block = {
    code"""
      boolean $resultNull = $childNull;
      ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)};
      if (!${childNull}) {
        $resultPrim = (long) ${cast(childPrim)};
      }
    """
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code +
      castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType))
  }
} 
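A usage sketch that is not part of the original file: assuming the two case classes above are on the classpath, they can be surfaced in the DataFrame API by wrapping them in Columns. The column name ts_nanos is made up for illustration.

import org.apache.spark.sql.{Column, DataFrame, NanosToTimestamp, TimestampToNanos}

object TimestampCastUsage {
  // LongType nanoseconds -> TimestampType
  def nanosToTimestamp(c: Column): Column = new Column(NanosToTimestamp(c.expr))

  // TimestampType -> LongType nanoseconds
  def timestampToNanos(c: Column): Column = new Column(TimestampToNanos(c.expr))

  def withTimestamp(df: DataFrame): DataFrame =
    df.withColumn("ts", nanosToTimestamp(df("ts_nanos")))
}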
Example 55
Source File: Schema.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.s2jobs

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object Schema {

  // CommonFields (defined earlier in the original file, elided from this excerpt) supplies
  // the shared leading fields of every graph element row; it is where the imported LongType
  // is used.
  val GraphElementSchema = StructType(CommonFields ++ Seq(
    StructField("id", StringType, nullable = true),
    StructField("service", StringType, nullable = true),
    StructField("column", StringType, nullable = true),
    StructField("from", StringType, nullable = true),
    StructField("to", StringType, nullable = true),
    StructField("label", StringType, nullable = true),
    StructField("props", StringType, nullable = true)
  ))
} 
Example 56
Source File: ResolveTableValuedFunctions.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}


      tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
          "numPartitions" -> IntegerType) {
        case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
          Range(start, end, step, Some(numPartitions))
      })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      // The whole resolution is somewhat difficult to understand here due to too many abstractions.
      // We should probably rewrite the following at some point. Reynold was just here to improve
      // error messages and didn't have time to do a proper rewrite.
      val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match {
        case Some(tvf) =>

          def failAnalysis(): Nothing = {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                 |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                 |cannot be applied to: ($argTypes)""".stripMargin)
          }

          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                try {
                  Some(resolver(casted.map(_.eval())))
                } catch {
                  case e: AnalysisException =>
                    failAnalysis()
                }
              case _ =>
                None
            }
          }
          resolved.headOption.getOrElse {
            failAnalysis()
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }

      // If alias names assigned, add `Project` with the aliases
      if (u.outputNames.nonEmpty) {
        val outputAttrs = resolvedFunc.output
        // Checks if the number of the aliases is equal to expected one
        if (u.outputNames.size != outputAttrs.size) {
          u.failAnalysis(s"Number of given aliases does not match number of output columns. " +
            s"Function name: ${u.functionName}; number of aliases: " +
            s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.")
        }
        val aliases = outputAttrs.zip(u.outputNames).map {
          case (attr, name) => Alias(attr, name)()
        }
        Project(aliases, resolvedFunc)
      } else {
        resolvedFunc
      }
  }
} 
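A short sketch of what this rule resolves in practice, assuming a running SparkSession named spark: the built-in range table-valued function, whose output column id is LongType.

import org.apache.spark.sql.types.LongType

val r = spark.sql("SELECT * FROM range(0, 10, 2, 2)") // start, end, step, numPartitions
r.printSchema()                                       // id: long (nullable = false)
assert(r.schema("id").dataType == LongType)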
Example 57
Source File: GroupedIteratorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 58
Source File: MySQLDialect.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
} 
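The same extension point can be used for other sources; a sketch with made-up names (there is no real "exampledb" driver) that maps unsigned 32-bit integer columns to LongType so values above Int.MaxValue fit:

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

case object ExampleDbDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:exampledb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.INTEGER && typeName.toUpperCase.contains("UNSIGNED")) Some(LongType)
    else None
  }
}

// Register once before reading from the source:
// JdbcDialects.registerDialect(ExampleDbDialect)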
Example 59
Source File: ResolveInlineTablesSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
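The convert and nullability tests correspond to ordinary inline-table SQL. A sketch assuming a running SparkSession named spark: mixing an Int and a Long literal widens the column to LongType, and with no NULL literal present the column comes out non-nullable, which is what the tests assert at the logical-plan level.

val inline = spark.sql("SELECT * FROM VALUES (1), (2L) AS t(c1)")
inline.printSchema() // c1: long (nullable = false)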
Example 60
Source File: DecimalExpressionSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
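The first two tests are a round trip between DecimalType and a LongType unscaled value; the same relationship can be seen directly on the Decimal class, outside the expression framework:

import org.apache.spark.sql.types.Decimal

val d = Decimal("10.1")               // precision 3, scale 1
val unscaled: Long = d.toUnscaledLong // 101L, the value UnscaledValue exposes as LongType
val rebuilt = Decimal(unscaled, 3, 1) // what MakeDecimal(Literal(101L), 3, 1) reconstructs
assert(rebuilt == d)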
Example 61
Source File: ResolveTableValuedFunctions.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}


      tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
          "numPartitions" -> IntegerType) {
        case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
          Range(start, end, step, Some(numPartitions))
      })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      builtinFunctions.get(u.functionName) match {
        case Some(tvf) =>
          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                Some(resolver(casted.map(_.eval())))
              case _ =>
                None
            }
          }
          resolved.headOption.getOrElse {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                |cannot be applied to: (${argTypes})""".stripMargin)
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }
  }
} 
Example 62
Source File: MonotonicallyIncreasingID.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}

// Generates monotonically increasing 64-bit integers (a LongType column): the partition id
// is placed in the upper bits and a per-partition counter in the lower 33 bits.
case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterministic {
  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initInternal(): Unit = {
    count = 0L
    partitionMask = TaskContext.getPartitionId().toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
      s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;")

    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
} 
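In user code this expression is normally reached through the public function, and the generated column is LongType. A sketch assuming a SparkSession named spark:

import org.apache.spark.sql.functions.monotonically_increasing_id
import org.apache.spark.sql.types.LongType

val withId = spark.range(5).toDF("n").withColumn("row_id", monotonically_increasing_id())
assert(withId.schema("row_id").dataType == LongType)
// The ids are unique and increasing within each partition but not consecutive across
// partitions: the partition id occupies the upper bits, the per-row count the lower 33.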
Example 63
Source File: SQLTransformerSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
} 
Example 64
Source File: TestTableStatsSinglePathMain.scala    From Spark.TableStatsExample   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.examples.tablestats


import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll}


class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{
  test("run table stats on sample data") {

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    var sc = new SparkContext("local", "test", sparkConfig)
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      val schema =
        StructType(
          Array(
            StructField("id", LongType, true),
            StructField("name", StringType, true),
            StructField("age", LongType, true),
            StructField("gender", StringType, true),
            StructField("height", LongType, true),
            StructField("job_title", StringType, true)
          )
        )

      val rowRDD = sc.parallelize(Array(
        Row(1L, "Name.1", 20L, "M", 6L, "dad"),
        Row(2L, "Name.2", 20L, "F", 5L, "mom"),
        Row(3L, "Name.3", 20L, "F", 5L, "mom"),
        Row(4L, "Name.4", 20L, "M", 5L, "mom"),
        Row(5L, "Name.5", 10L, "M", 4L, "kid"),
        Row(6L, "Name.6", 8L, "M", 3L, "kid")))

      val df = sqlContext.createDataFrame(rowRDD, schema)

      val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df)

      assertResult(6L)(firstPassStats.columnStatsMap(0).maxLong)
      assertResult(1L)(firstPassStats.columnStatsMap(0).minLong)
      assertResult(21L)(firstPassStats.columnStatsMap(0).sumLong)
      assertResult(3L)(firstPassStats.columnStatsMap(0).avgLong)

      assertResult(2)(firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.length)

      firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.foreach { r =>
        if (r._1.equals("M")) {
          assertResult(4L)(r._2)
        } else if (r._1.equals("F")) {
          assertResult(2L)(r._2)
        } else {
          throw new RuntimeException("Unknown gender: " + r._1)
        }
      }
    } finally {
      sc.stop()
    }
  }
} 
Example 65
Source File: ConfigurableDataGeneratorMain.scala    From Spark.TableStatsExample   with Apache License 2.0 5 votes vote down vote up
package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random



object ConfigurableDataGeneratorMain {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
      return
    }

    val outputPath = args(0)
    val numberOfColumns = args(1).toInt
    val numberOfRecords = args(2).toInt
    val numberOfPartitions = args(3).toInt
    val runLocal = (args.length == 5 && args(4).equals("L"))

    var sc: SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      sc = new SparkContext(sparkConfig)
    }

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //Part A
    val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions)

    //Part B
    val megaDataRDD = rowRDD.flatMap( r => {
      val random = new Random()

      val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator
      dataRange.map[Row]( x => {
        val values = new mutable.ArrayBuffer[Any]
        for (i <- 0 until numberOfColumns) {
          if (i % 2 == 0) {
            values.+=(random.nextInt(100).toLong)
          } else {
            values.+=(random.nextInt(100).toString)
          }
        }
        new GenericRow(values.toArray)
      })
    })

    //Part C
    val schema =
      StructType(
        (0 until numberOfColumns).map( i => {
          if (i % 2 == 0) {
            StructField("longColumn_" + i, LongType, true) }
          else {
            StructField("stringColumn_" + i, StringType, true)
          }
        })
      )
    val df = sqlContext.createDataFrame(megaDataRDD, schema)
    df.saveAsParquetFile(outputPath)

    //Part D
    sc.stop()
  }
} 
Example 66
Source File: MyUDF.scala    From spark-tools   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        // already a TimestampType expression, so pass it through unchanged
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction("toTs", myTimestampCast)

} 
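A usage sketch with made-up data and column name (event_ms): once register has been called, toTs can be used from SQL to turn LongType epoch milliseconds into a timestamp, since the expression divides by 1000 and casts to TimestampType.

import org.apache.spark.sql.{MyUDF, SparkSession}

object ToTsDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("toTs-demo").getOrCreate()
    import spark.implicits._

    MyUDF.register(spark)
    Seq(1500000000000L, 1500000060000L).toDF("event_ms").createOrReplaceTempView("events")
    spark.sql("SELECT toTs(event_ms) AS event_ts FROM events").show(false)

    spark.stop()
  }
}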
Example 67
Source File: GraphDataGen.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.datageneration

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{any2Long, getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.graphx.util.GraphGenerators

object GraphDataGen extends WorkloadDefaults {

  val name = "graph-data-generator"
  val defaultMu = 4.0
  val defaultSigma = 1.3
  val defaultSeed = -1L
  val defaultNumOfPartitions = 0

  override def apply(m: Map[String, Any]): GraphDataGen = {
      val numVertices = getOrThrow(m, "vertices").asInstanceOf[Int]
      val mu = getOrDefault[Double](m, "mu", defaultMu)
      val sigma = getOrDefault[Double](m, "sigma", defaultSigma)
      val numPartitions = getOrDefault[Int](m, "partitions", defaultNumOfPartitions)
      val seed = getOrDefault[Long](m, "seed", defaultSeed, any2Long)
      val output = {
        val str = getOrThrow(m, "output").asInstanceOf[String]
        val s = verifySuitabilityOfOutputFileFormat(str)
        Some(s)
      }
    val saveMode = getOrDefault[String](m, "save-mode", SaveModes.error)

    new GraphDataGen(
      numVertices = numVertices,
      input = None,
      output = output,
      saveMode = saveMode,
      mu = mu,
      sigma = sigma,
      seed = seed,
      numPartitions = numPartitions
    )
  }

  
  private[datageneration] def verifySuitabilityOfOutputFileFormat(str: String): String = {
    val strArr: Array[String] = str.split('.')

    (strArr.length, strArr.last) match {
      case (1, _) => throw SparkBenchException("Output file for GraphDataGen must have \".txt\" as the file extension. " +
        "Please modify your config file.")
      case (2, "txt") => str
      case (_, _) => throw SparkBenchException("Due to limitations of the GraphX GraphLoader, " +
        "the graph data generators may only save files as \".txt\". " +
        "Please modify your config file.")
    }
  }

}

case class GraphDataGen (
                          numVertices: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          mu: Double = 4.0,
                          sigma: Double = 1.3,
                          seed: Long = 1,
                          numPartitions: Int = 0
                        ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()
    val (generateTime, graph) = time(GraphGenerators.logNormalGraph(spark.sparkContext, numVertices, numPartitions, mu, sigma))
    val (convertTime, out) = time(graph.edges.map(e => s"${e.srcId.toString} ${e.dstId}"))
    val (saveTime, _) = time(out.saveAsTextFile(output.get))

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )
    val total = generateTime + convertTime + saveTime
    val timeList = spark.sparkContext.parallelize(Seq(Row(GraphDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
} 
Example 68
Source File: ExtAggregatesSpec.scala    From spark-ext   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import com.collective.TestSparkContext
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.scalatest.FlatSpec
import org.apache.spark.sql.functions._
import org.apache.spark.sql.ext.functions._

import scala.collection.mutable

class ExtAggregatesSpec extends FlatSpec with TestSparkContext {

  val schema = StructType(Seq(
    StructField("cookie_id", StringType),
    StructField("site", StringType),
    StructField("impressions", LongType)
  ))

  val cookie1 = "cookie1"
  val cookie2 = "cookie2"
  val cookie3 = "cookie3"

  val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq(
    Row(cookie1, "google.com", 10L),
    Row(cookie1, "cnn.com", 14L),
    Row(cookie1, "google.com", 2L),
    Row(cookie2, "bbc.com", 20L),
    Row(cookie2, "auto.com", null),
    Row(cookie2, "auto.com", 1L),
    Row(cookie3, "sport.com", 100L)
  )), schema)

  "Ext Aggregates" should "collect column values as array" in {
    val cookies = impressionLog
      .select(collectArray(col("cookie_id")))
      .first().getAs[mutable.WrappedArray[String]](0)
    assert(cookies.length == 7)
    assert(cookies.toSet.size == 3)
  }

  it should "collect distinct values as array" in {
    val distinctCookies = impressionLog.select(col("cookie_id"))
      .distinct()
      .select(collectArray(col("cookie_id")))
      .first().getAs[mutable.WrappedArray[String]](0)
    assert(distinctCookies.length == 3)
  }

  it should "collect values after group by" in {
    val result = impressionLog
      .groupBy(col("cookie_id"))
      .agg(collectArray(col("site")))

    val cookieSites = result.collect().map { case Row(cookie: String, sites: mutable.WrappedArray[_]) =>
      cookie -> sites.toSeq
    }.toMap

    assert(cookieSites(cookie1).length == 3)
    assert(cookieSites(cookie2).length == 3)
    assert(cookieSites(cookie3).length == 1)

  }

} 
Example 69
Source File: SparkEsBulkWriterSpec.scala    From Spark2Elasticsearch   with Apache License 2.0 5 votes vote down vote up
package com.github.jparkie.spark.elasticsearch

import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.sql.{ SparkEsDataFrameMapper, SparkEsDataFrameSerializer }
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }
import org.scalatest.{ MustMatchers, WordSpec }

class SparkEsBulkWriterSpec extends WordSpec with MustMatchers with SharedSparkContext {
  val esServer = new ElasticSearchServer()

  override def beforeAll(): Unit = {
    super.beforeAll()

    esServer.start()
  }

  override def afterAll(): Unit = {
    esServer.stop()

    super.afterAll()
  }

  "SparkEsBulkWriter" must {
    "execute write() successfully" in {
      esServer.createAndWaitForIndex("test_index")

      val sqlContext = new SQLContext(sc)

      val inputSparkEsWriteConf = SparkEsWriteConf(
        bulkActions = 10,
        bulkSizeInMB = 1,
        concurrentRequests = 0,
        flushTimeoutInSeconds = 1
      )
      val inputMapperConf = SparkEsMapperConf(
        esMappingId = Some("id"),
        esMappingParent = None,
        esMappingVersion = None,
        esMappingVersionType = None,
        esMappingRouting = None,
        esMappingTTLInMillis = None,
        esMappingTimestamp = None
      )
      val inputSchema = StructType(
        Array(
          StructField("id", StringType, true),
          StructField("parent", StringType, true),
          StructField("version", LongType, true),
          StructField("routing", StringType, true),
          StructField("ttl", LongType, true),
          StructField("timestamp", StringType, true),
          StructField("value", LongType, true)
        )
      )
      val inputData = sc.parallelize {
        Array(
          Row("TEST_ID_1", "TEST_PARENT_1", 1L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 1L),
          Row("TEST_ID_1", "TEST_PARENT_2", 2L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 2L),
          Row("TEST_ID_1", "TEST_PARENT_3", 3L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 3L),
          Row("TEST_ID_1", "TEST_PARENT_4", 4L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 4L),
          Row("TEST_ID_1", "TEST_PARENT_5", 5L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 5L),
          Row("TEST_ID_5", "TEST_PARENT_6", 6L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 6L),
          Row("TEST_ID_6", "TEST_PARENT_7", 7L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 7L),
          Row("TEST_ID_7", "TEST_PARENT_8", 8L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 8L),
          Row("TEST_ID_8", "TEST_PARENT_9", 9L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 9L),
          Row("TEST_ID_9", "TEST_PARENT_10", 10L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 10L),
          Row("TEST_ID_10", "TEST_PARENT_11", 11L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 11L)
        )
      }
      val inputDataFrame = sqlContext.createDataFrame(inputData, inputSchema)
      val inputDataIterator = inputDataFrame.rdd.toLocalIterator
      val inputSparkEsBulkWriter = new SparkEsBulkWriter[Row](
        esIndex = "test_index",
        esType = "test_type",
        esClient = () => esServer.client,
        sparkEsSerializer = new SparkEsDataFrameSerializer(inputSchema),
        sparkEsMapper = new SparkEsDataFrameMapper(inputMapperConf),
        sparkEsWriteConf = inputSparkEsWriteConf
      )

      inputSparkEsBulkWriter.write(null, inputDataIterator)

      val outputGetResponse = esServer.client.prepareGet("test_index", "test_type", "TEST_ID_1").get()

      outputGetResponse.isExists mustEqual true
      outputGetResponse.getSource.get("parent").asInstanceOf[String] mustEqual "TEST_PARENT_5"
      outputGetResponse.getSource.get("version").asInstanceOf[Integer] mustEqual 5
      outputGetResponse.getSource.get("routing").asInstanceOf[String] mustEqual "TEST_ROUTING_1"
      outputGetResponse.getSource.get("ttl").asInstanceOf[Integer] mustEqual 86400000
      outputGetResponse.getSource.get("timestamp").asInstanceOf[String] mustEqual "TEST_TIMESTAMP_1"
      outputGetResponse.getSource.get("value").asInstanceOf[Integer] mustEqual 5
    }
  }
} 
Example 70
Source File: GroupedIteratorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 71
Source File: MySQLDialect.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
} 
Example 72
Source File: ResolveInlineTablesSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
Example 73
Source File: RandomSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
} 
Example 74
Source File: DecimalExpressionSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

} 
Example 75
Source File: MonotonicallyIncreasingID.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}

// Same expression as in Example 62, for a newer Spark where partition-local state is
// initialized through initializeInternal(partitionIndex).
case class MonotonicallyIncreasingID() extends LeafExpression with Nondeterministic {
  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.freshName("count")
    val partitionMaskTerm = ctx.freshName("partitionMask")
    ctx.addMutableState(ctx.JAVA_LONG, countTerm, "")
    ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "")
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")

    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = "false")
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"
} 
Example 76
Source File: SQLTransformerSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class SQLTransformerSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new SQLTransformer())
  }

  test("transform numeric data") {
    val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")
    val sqlTrans = new SQLTransformer().setStatement(
      "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
    val result = sqlTrans.transform(original)
    val resultSchema = sqlTrans.transformSchema(original.schema)
    val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0))
      .toDF("id", "v1", "v2", "v3", "v4")
    assert(result.schema.toString == resultSchema.toString)
    assert(resultSchema == expected.schema)
    assert(result.collect().toSeq == expected.collect().toSeq)
    assert(original.sparkSession.catalog.listTables().count() == 0)
  }

  test("read/write") {
    val t = new SQLTransformer()
      .setStatement("select * from __THIS__")
    testDefaultReadWrite(t)
  }

  test("transformSchema") {
    val df = spark.range(10)
    val outputSchema = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")
      .transformSchema(df.schema)
    val expected = StructType(Seq(StructField("id1", LongType, nullable = false)))
    assert(outputSchema === expected)
  }
} 
Example 77
Source File: LinearRegressionDataGen.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters: 1 million points correspond to roughly 200M of data
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

case class LinearRegressionDataGen (
                                      numRows: Int,
                                      numCols: Int,
                                      input: Option[String] = None,
                                      output: Option[String],
                                      saveMode: String,
                                      eps: Double,
                                      intercepts: Double,
                                      numPartitions: Int
                                   ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      // LabeledPoints cannot be written out as CSV; Parquet (or another structured format) works.
      val outputStr = output.get
      if (outputStr.endsWith(".csv")) {
        throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
      }
      writeToDisk(outputStr, saveMode, dataDF, spark)
    }

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    val timeList = spark.sparkContext.parallelize(
      Seq(Row(LinearRegressionDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)

  }
} 
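
The timing result above follows a common spark-bench pattern: a one-row DataFrame whose timestamp and duration columns are LongType. A stripped-down sketch of just that pattern, with made-up values and an assumed SparkSession named spark:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

val resultSchema = StructType(Seq(
  StructField("name", StringType, nullable = false),
  StructField("timestamp", LongType, nullable = false),
  StructField("total_runtime", LongType, nullable = false)
))

// One result row; the runtime value here is arbitrary.
val rows = spark.sparkContext.parallelize(
  Seq(Row("data-generation-lr", System.currentTimeMillis(), 1234L)))
spark.createDataFrame(rows, resultSchema).show(false)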
Example 78
Source File: MyUDF.scala    From spark-tools   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        // already a timestamp: pass the expression through unchanged
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction(FunctionIdentifier("toTs", None), myTimestampCast)

} 
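
Once registered, the function is callable from SQL. A hedged sketch of how it might be exercised (the events view and millis column are illustrative, spark is an assumed session, and the MyUDF object above is assumed to be on the classpath):

import org.apache.spark.sql.MyUDF

MyUDF.register(spark)

// A tiny table with epoch milliseconds stored as LongType.
import spark.implicits._
Seq(1500000000000L, 1600000000000L).toDF("millis").createOrReplaceTempView("events")

// toTs divides the long value by 1000 (milliseconds to seconds) and casts it to TimestampType.
spark.sql("SELECT toTs(millis) AS ts FROM events").show(false)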
Example 79
Source File: PartitionAndSleepWorkload.scala    From spark-bench   with Apache License 2.0 5 votes vote down vote up
package com.ibm.sparktc.sparkbench.workload.exercise

import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object PartitionAndSleepWorkload extends WorkloadDefaults {
  val name = "timedsleep"
  val partitions: Int = 48
  val sleepms: Long = 12000L

  def apply(m: Map[String, Any]) = new PartitionAndSleepWorkload(
    input = None,
    output = None,
    partitions = getOrDefault[Int](m, "partitions", partitions),
    sleepMS = getOrDefault[Long](m, "sleepms", sleepms, any2Long))
}

case class PartitionAndSleepWorkload(input: Option[String] = None,
                                     output: Option[String] = None,
                                     saveMode: String = SaveModes.error,
                                     partitions: Int,
                                     sleepMS: Long) extends Workload {

  def doStuff(spark: SparkSession): (Long, Unit) = time {

    val ms = sleepMS
    val stuff: RDD[Int] = spark.sparkContext.parallelize(0 until partitions * 100, partitions)

    val cool: RDD[(Int, Int)] = stuff.map { i =>
      Thread.sleep(ms)
      (i % 10, i + 42)
    }

    val yeah = cool.reduceByKey(_ + _)
    yeah.collect()
  }

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val (t, _) = doStuff(spark)

    val schema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("runtime", LongType, nullable = false)
      )
    )

    val timeList = spark.sparkContext.parallelize(Seq(Row("timedsleep", System.currentTimeMillis(), t)))

    spark.createDataFrame(timeList, schema)
  }
} 
Example 80
Source File: CubeMakerTest.scala    From sparta   with Apache License 2.0 5 votes vote down vote up
package com.stratio.sparta.driver.test.cube

import java.sql.Timestamp

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger}
import com.stratio.sparta.driver.writer.WriterOptions
import com.stratio.sparta.plugin.default.DefaultField
import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField
import com.stratio.sparta.plugin.cube.operator.count.CountOperator
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.TestSuiteBase
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CubeMakerTest extends TestSuiteBase {

  val PreserverOrder = false

  
  // Builds the expected (DimensionValuesTime, InputFields) tuples for three events that share
  // the same minute-level time dimension.
  def getEventOutput(timestamp: Timestamp, millis: Long):
  Seq[Seq[(DimensionValuesTime, InputFields)]] = {
    val dimensionString = Dimension("dim1", "eventKey", "identity", new DefaultField)
    val dimensionTime = Dimension("minute", "minute", "minute", new DateTimeField)
    val dimensionValueString1 = DimensionValue(dimensionString, "value1")
    val dimensionValueString2 = dimensionValueString1.copy(value = "value2")
    val dimensionValueString3 = dimensionValueString1.copy(value = "value3")
    val dimensionValueTs = DimensionValue(dimensionTime, timestamp)
    val tsMap = Row(timestamp)
    val valuesMap1 = InputFields(Row("value1", timestamp), 1)
    val valuesMap2 = InputFields(Row("value2", timestamp), 1)
    val valuesMap3 = InputFields(Row("value3", timestamp), 1)

    Seq(Seq(
      (DimensionValuesTime("cubeName", Seq(dimensionValueString1, dimensionValueTs)), valuesMap1),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString2, dimensionValueTs)), valuesMap2),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString3, dimensionValueTs)), valuesMap3)
    ))
  }
} 
Example 81
Source File: HttpStreamServerClientTest.scala    From spark-http-stream   with BSD 2-Clause "Simplified" License 5 votes vote down vote up
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException


class HttpStreamServerClientTest {
	val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
		Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
		Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

	val ROWS2 = Array(Row("hello"),
		Row("world"),
		Row("bye"),
		Row("world"));

	@Test
	def testHttpStreamIO() {
		//starts a http server
		val kryoSerializer = new KryoSerializer(new SparkConf());
		val server = HttpStreamServer.start("/xxxx", 8080);

		val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
			.getOrCreate();
		spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

		val sqlContext = spark.sqlContext;
		import spark.implicits._
		//add a local message buffer to server, with 2 topics registered
		server.withBuffer()
			.addListener(new StreamPrinter())
			.createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
			.createTopic[String]("topic-2");

		val client = HttpStreamClient.connect("http://localhost:8080/xxxx");
		//tests schema of topics
		val schema1 = client.fetchSchema("topic-1");
		Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
			schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		val schema2 = client.fetchSchema("topic-2");
		Assert.assertArrayEquals(Array[Object](StringType),
			schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		//prepare to consume messages
		val sid1 = client.subscribe("topic-1")._1;
		val sid2 = client.subscribe("topic-2")._1;

		//produces some data
		client.sendRows("topic-1", 1, ROWS1);

		val sid4 = client.subscribe("topic-1")._1;
		val sid5 = client.subscribe("topic-2")._1;

		client.sendRows("topic-2", 1, ROWS2);

		//consumes data
		val fetched = client.fetchStream(sid1).map(_.originalRow);
		Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
		//it is empty now
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

		client.unsubscribe(sid4);
		try {
			client.fetchStream(sid4);
			//exception should be thrown, because subscriber id is invalidated
			Assert.assertTrue(false);
		}
		catch {
			case e: Throwable ⇒
				e.printStackTrace();
				Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
		}

		server.stop();
	}
} 
Example 82
Source File: hierarchyGen.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.test

import org.apache.spark.sql.types.{LongType, Node}
import org.scalacheck.{Arbitrary, Gen}

import scala.util.Random
import scalaz._
import Scalaz._
import scalaz.scalacheck.ScalazArbitrary._

// scalastyle:off file.size.limit

object HierarchyGen {

  val MIN_SIZE_TREE = 6
  val MAX_SIZE_TREE = 100

  
  // Sequence state backing next(); the starting value is an assumption (0L), and the counter
  // wraps around to Long.MinValue once it reaches Long.MaxValue.
  private var currentSeq: Long = 0L

  def next(): Long = {
    synchronized {
      if (currentSeq == Long.MaxValue) {
        currentSeq = Long.MinValue
      }
      val result = currentSeq
      currentSeq += 1
      result
    }
  }

  def arb: Arbitrary[Long] = Arbitrary {
    gen
  }

  def gen: Gen[Long] = Gen.resultOf[Int, Long] { _ => next() }
} 
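
The generator can be exercised directly from ScalaCheck. A brief sketch, assuming the HierarchyGen object above is on the classpath:

import org.scalacheck.Gen

// Each sampled value advances the shared sequence returned by next().
val one: Option[Long] = HierarchyGen.gen.sample
val three: Option[List[Long]] = Gen.listOfN(3, HierarchyGen.gen).sample
println(one)
println(three)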
Example 83
Source File: GroupedIteratorSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
} 
Example 84
Source File: StreamingGlobalLimitExec.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator


// Physical operator implementing a global LIMIT over an append-mode streaming query: the
// cumulative row count is kept in a single LongType state-store entry and rows are dropped
// once the limit has been reached.
case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append,
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
        getStateInfo,
        keySchema,
        valueSchema,
        indexOrdinal = None,
        sqlContext.sessionState,
        Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val updatesStartTimeNs = System.nanoTime

      val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
      var cumulativeRowCount = preBatchRowCount

      val result = iter.filter { r =>
        val x = cumulativeRowCount < streamLimit
        if (x) {
          cumulativeRowCount += 1
        }
        x
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  private def getValueRow(value: Long): UnsafeRow = {
    UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value)))
  }
} 
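
This operator is what a limit(n) on an append-mode streaming Dataset is planned into (Spark 2.4 and later). A rough sketch of a query that would exercise it, assuming an existing SparkSession named spark; the query name and rate settings are illustrative:

// A rate-source stream limited to its first 5 rows overall.
val limited = spark.readStream
  .format("rate")
  .option("rowsPerSecond", "10")
  .load()          // columns: timestamp, value (value is LongType)
  .limit(5)

val query = limited.writeStream
  .format("memory")
  .queryName("limited_rows")
  .outputMode("append")
  .start()

Thread.sleep(2000)          // let the rate source emit a few rows
query.processAllAvailable()
spark.table("limited_rows").show(false)
query.stop()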
Example 85
Source File: MySQLDialect.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
} 
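
Custom dialects following the same pattern can be registered at runtime through JdbcDialects.registerDialect. A hedged sketch of a dialect that, like the MySQL one above, surfaces a vendor-specific integer column as LongType; the jdbc:somedb URL prefix and the BIGUINT type name are made up for illustration:

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

object SomeDbDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:somedb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    // Map the (hypothetical) unsigned 64-bit column type to Catalyst's LongType.
    if (sqlType == Types.BIGINT && typeName.equalsIgnoreCase("BIGUINT")) Some(LongType) else None
  }
}

JdbcDialects.registerDialect(SomeDbDialect)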
Example 86
Source File: ResolveInlineTablesSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}


class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
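
The same widening can be observed from SQL: an inline VALUES list that mixes an INT and a BIGINT literal resolves to a single LongType column. A quick sketch, assuming an existing SparkSession named spark:

import org.apache.spark.sql.types.LongType

val df = spark.sql("SELECT * FROM VALUES (1), (2L) AS t(c1)")
assert(df.schema("c1").dataType == LongType)
df.show()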
Example 87
Source File: RandomSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
} 
Example 88
Source File: DecimalExpressionSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }

}