org.apache.spark.sql.types.StringType Scala Example

Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0

12 votes

package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}

Source File: BooleanStatement.scala From spark-snowflake with Apache License 2.0

7 votes

package net.snowflake.spark.snowflake.pushdowns.querygeneration

import net.snowflake.spark.snowflake.{ConstantString, SnowflakeSQLStatement}
import org.apache.spark.sql.catalyst.expressions.{
  Attribute,
  Contains,
  EndsWith,
  EqualTo,
  Expression,
  GreaterThan,
  GreaterThanOrEqual,
  In,
  IsNotNull,
  IsNull,
  LessThan,
  LessThanOrEqual,
  Literal,
  Not,
  StartsWith
}
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String


private[querygeneration] object BooleanStatement {
  def unapply(
    expAttr: (Expression, Seq[Attribute])
  ): Option[SnowflakeSQLStatement] = {
    val expr = expAttr._1
    val fields = expAttr._2

    Option(expr match {
      case In(child, list) if list.forall(_.isInstanceOf[Literal]) =>
        convertStatement(child, fields) + "IN" +
          blockStatement(convertStatements(fields, list: _*))
      case IsNull(child) =>
        blockStatement(convertStatement(child, fields) + "IS NULL")
      case IsNotNull(child) =>
        blockStatement(convertStatement(child, fields) + "IS NOT NULL")
      case Not(child) => {
        child match {
          case EqualTo(left, right) =>
            blockStatement(
              convertStatement(left, fields) + "!=" +
                convertStatement(right, fields)
            )
          case GreaterThanOrEqual(left, right) =>
            convertStatement(LessThan(left, right), fields)
          case LessThanOrEqual(left, right) =>
            convertStatement(GreaterThan(left, right), fields)
          case GreaterThan(left, right) =>
            convertStatement(LessThanOrEqual(left, right), fields)
          case LessThan(left, right) =>
            convertStatement(GreaterThanOrEqual(left, right), fields)
          case _ =>
            ConstantString("NOT") +
              blockStatement(convertStatement(child, fields))
        }
      }
      case Contains(child, Literal(pattern: UTF8String, StringType)) =>
        convertStatement(child, fields) + "LIKE" + s"'%${pattern.toString}%'"
      case EndsWith(child, Literal(pattern: UTF8String, StringType)) =>
        convertStatement(child, fields) + "LIKE" + s"'%${pattern.toString}'"
      case StartsWith(child, Literal(pattern: UTF8String, StringType)) =>
        convertStatement(child, fields) + "LIKE" + s"'${pattern.toString}%'"

      case _ => null
    })
  }
}

Source File: OnErrorSuite.scala From spark-snowflake with Apache License 2.0

6 votes

package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {

    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }

}

Source File: SparkLensTest.scala From spark-tools with Apache License 2.0

5 votes

package io.univalence

import org.apache.spark.SparkConf
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StringType
import io.univalence.SparkLens._
import org.scalatest.FunSuite

case class Toto(name: String, age: Int)

case class Tata(toto: Toto)

class SparkLensTest extends FunSuite {

  val conf: SparkConf = new SparkConf()
  conf.setAppName("yo")
  conf.setMaster("local[*]")

  implicit val ss: SparkSession = SparkSession.builder.config(conf).getOrCreate

  import ss.implicits._

  test("testLensRegExp change string") {
    assert(lensRegExp(ss.createDataFrame(Seq(Toto("a", 1))))({
      case ("name", StringType) => true
      case _                    => false
    }, { case (a: String, d)    => a.toUpperCase }).as[Toto].first() == Toto("A", 1))
  }

  test("change Int") {
    assert(lensRegExp(ss.createDataFrame(Seq(Tata(Toto("a", 1)))))({
      case ("toto/age", _) => true
      case _               => false
    }, { case (a: Int, d)  => a + 1 }).as[Tata].first() == Tata(Toto("a", 2)))
  }

  ignore("null to nil") {

    val df: DataFrame = ss.read.parquet("/home/phong/daily_gpp_20180705")

    val yoho: DataFrame = lensRegExp(df)({
      case (_, ArrayType(_, _)) => true
      case _                    => false
    }, (a, b) => if (a == null) Nil else a)

  }

}

Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0

5 votes

package com.cloudera.sa.examples.tablestats

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.apache.spark.{SparkContext, SparkConf}

import scala.collection.mutable
import scala.util.Random



object ConfigurableDataGeneratorMain {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>")
      return
    }

    val outputPath = args(0)
    val numberOfColumns = args(1).toInt
    val numberOfRecords = args(2).toInt
    val numberOfPartitions = args(3).toInt
    val runLocal = (args.length == 5 && args(4).equals("L"))

    var sc: SparkContext = null
    if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      sc = new SparkContext("local", "test", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain")
      sc = new SparkContext(sparkConfig)
    }

    val sqlContext = new org.apache.spark.sql.SQLContext(sc)

    //Part A
    val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions)

    //Part B
    val megaDataRDD = rowRDD.flatMap( r => {
      val random = new Random()

      val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator
      dataRange.map[Row]( x => {
        val values = new mutable.ArrayBuffer[Any]
        for (i <- 0 until numberOfColumns) {
          if (i % 2 == 0) {
            values.+=(random.nextInt(100).toLong)
          } else {
            values.+=(random.nextInt(100).toString)
          }
        }
        new GenericRow(values.toArray)
      })
    })

    //Part C
    val schema =
      StructType(
        (0 until numberOfColumns).map( i => {
          if (i % 2 == 0) {
            StructField("longColumn_" + i, LongType, true) }
          else {
            StructField("stringColumn_" + i, StringType, true)
          }
        })
      )
    val df = sqlContext.createDataFrame(megaDataRDD, schema)
    df.saveAsParquetFile(outputPath)

    //Part D
    sc.stop()
  }
}

Source File: TestTableStatsSinglePathMain.scala From Spark.TableStatsExample with Apache License 2.0

5 votes

package com.cloudera.sa.examples.tablestats


import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType}
import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll}


class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{
  test("run table stats on sample data") {

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    var sc = new SparkContext("local", "test", sparkConfig)
    try {
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)

      val schema =
        StructType(
          Array(
            StructField("id", LongType, true),
            StructField("name", StringType, true),
            StructField("age", LongType, true),
            StructField("gender", StringType, true),
            StructField("height", LongType, true),
            StructField("job_title", StringType, true)
          )
        )

      val rowRDD = sc.parallelize(Array(
        Row(1l, "Name.1", 20l, "M", 6l, "dad"),
        Row(2l, "Name.2", 20l, "F", 5l, "mom"),
        Row(3l, "Name.3", 20l, "F", 5l, "mom"),
        Row(4l, "Name.4", 20l, "M", 5l, "mom"),
        Row(5l, "Name.5", 10l, "M", 4l, "kid"),
        Row(6l, "Name.6", 8l, "M", 3l, "kid")))

      val df = sqlContext.createDataFrame(rowRDD, schema)

      val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df)

      assertResult(6l)(firstPassStats.columnStatsMap(0).maxLong)
      assertResult(1l)(firstPassStats.columnStatsMap(0).minLong)
      assertResult(21l)(firstPassStats.columnStatsMap(0).sumLong)
      assertResult(3l)(firstPassStats.columnStatsMap(0).avgLong)

      assertResult(2)(firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.length)

      firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.foreach { r =>
        if (r._1.equals("M")) {
          assertResult(4l)(r._2)
        } else if (r._1.equals("F")) {
          assertResult(2l)(r._2)
        } else {
          throw new RuntimeException("Unknown gender: " + r._1)
        }
      }
    } finally {
      sc.stop()
    }
  }
}

Source File: Tokenizer.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  @Since("1.6.0")
  def getToLowercase: Boolean = $(toLowercase)

  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true)

  override protected def createTransformFunc: String => Seq[String] = { originStr =>
    val re = $(pattern).r
    val str = if ($(toLowercase)) originStr.toLowerCase() else originStr
    val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
    val minLength = $(minTokenLength)
    tokens.filter(_.length >= minLength)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, true)

  @Since("1.4.1")
  override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}

@Since("1.6.0")
object RegexTokenizer extends DefaultParamsReadable[RegexTokenizer] {

  @Since("1.6.0")
  override def load(path: String): RegexTokenizer = super.load(path)
}

Source File: NGram.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  @Since("1.5.0")
  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}

Source File: InputFileName.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.rdd.InputFileNameHolder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, StringType}
import org.apache.spark.unsafe.types.UTF8String


@ExpressionDescription(
  usage = "_FUNC_() - Returns the name of the current file being read if available",
  extended = "> SELECT _FUNC_();\n ''")
case class InputFileName() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = true

  override def dataType: DataType = StringType

  override def prettyName: String = "input_file_name"

  override protected def initInternal(): Unit = {}

  override protected def evalInternal(input: InternalRow): UTF8String = {
    InputFileNameHolder.getInputFileName()
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = " +
      "org.apache.spark.rdd.InputFileNameHolder.getInputFileName();", isNull = "false")
  }
}

Source File: MapDataSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class MapDataSuite extends SparkFunSuite {

  test("inequality tests") {
    def u(str: String): UTF8String = UTF8String.fromString(str)

    // test data
    val testMap1 = Map(u("key1") -> 1)
    val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
    val testMap3 = Map(u("key1") -> 1)
    val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }
}

Source File: ScalaUDFSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.sql.types.{IntegerType, StringType}

class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("basic") {
    val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil)
    checkEvaluation(intUdf, 2)

    val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil)
    checkEvaluation(stringUdf, "ax")
  }

  test("better error message for NPE") {
    val udf = ScalaUDF(
      (s: String) => s.toLowerCase,
      StringType,
      Literal.create(null, StringType) :: Nil)

    val e1 = intercept[SparkException](udf.eval())
    assert(e1.getMessage.contains("Failed to execute user defined function"))

    val e2 = intercept[SparkException] {
      checkEvalutionWithUnsafeProjection(udf, null)
    }
    assert(e2.getMessage.contains("Failed to execute user defined function"))
  }

}

Source File: CallMethodViaReflectionSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.types.{IntegerType, StringType}


class CallMethodViaReflectionSuite extends SparkFunSuite with ExpressionEvalHelper {

  import CallMethodViaReflection._

  // Get rid of the $ so we are getting the companion object's name.
  private val staticClassName = ReflectStaticClass.getClass.getName.stripSuffix("$")
  private val dynamicClassName = classOf[ReflectDynamicClass].getName

  test("findMethod via reflection for static methods") {
    assert(findMethod(staticClassName, "method1", Seq.empty).exists(_.getName == "method1"))
    assert(findMethod(staticClassName, "method2", Seq(IntegerType)).isDefined)
    assert(findMethod(staticClassName, "method3", Seq(IntegerType)).isDefined)
    assert(findMethod(staticClassName, "method4", Seq(IntegerType, StringType)).isDefined)
  }

  test("findMethod for a JDK library") {
    assert(findMethod(classOf[java.util.UUID].getName, "randomUUID", Seq.empty).isDefined)
  }

  test("class not found") {
    val ret = createExpr("some-random-class", "method").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("not found") && errorMsg.contains("class"))
  }

  test("method not found because name does not match") {
    val ret = createExpr(staticClassName, "notfoundmethod").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("cannot find a static method"))
  }

  test("method not found because there is no static method") {
    val ret = createExpr(dynamicClassName, "method1").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("cannot find a static method"))
  }

  test("input type checking") {
    assert(CallMethodViaReflection(Seq.empty).checkInputDataTypes().isFailure)
    assert(CallMethodViaReflection(Seq(Literal(staticClassName))).checkInputDataTypes().isFailure)
    assert(CallMethodViaReflection(
      Seq(Literal(staticClassName), Literal(1))).checkInputDataTypes().isFailure)
    assert(createExpr(staticClassName, "method1").checkInputDataTypes().isSuccess)
  }

  test("invoking methods using acceptable types") {
    checkEvaluation(createExpr(staticClassName, "method1"), "m1")
    checkEvaluation(createExpr(staticClassName, "method2", 2), "m2")
    checkEvaluation(createExpr(staticClassName, "method3", 3), "m3")
    checkEvaluation(createExpr(staticClassName, "method4", 4, "four"), "m4four")
  }

  private def createExpr(className: String, methodName: String, args: Any*) = {
    CallMethodViaReflection(
      Literal.create(className, StringType) +:
      Literal.create(methodName, StringType) +:
      args.map(Literal.apply)
    )
  }
}

Source File: RewriteDistinctAggregatesSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{If, Literal}
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectSet, Count}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
import org.apache.spark.sql.types.{IntegerType, StringType}

class RewriteDistinctAggregatesSuite extends PlanTest {
  val conf = SimpleCatalystConf(caseSensitiveAnalysis = false, groupByOrdinal = false)
  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)

  val nullInt = Literal(null, IntegerType)
  val nullString = Literal(null, StringType)
  val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int)

  private def checkRewrite(rewrite: LogicalPlan): Unit = rewrite match {
    case Aggregate(_, _, Aggregate(_, _, _: Expand)) =>
    case _ => fail(s"Plan is not rewritten:\n$rewrite")
  }

  test("single distinct group") {
    val input = testRelation
      .groupBy('a)(countDistinct('e))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        max('b).as('agg2))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with non-partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        CollectSet('b).toAggregateExpression().as('agg2))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with partial aggregates") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d), sum('e))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with non-partial aggregates") {
    val input = testRelation
      .groupBy('a)(
        countDistinct('b, 'c),
        countDistinct('d),
        CollectSet('b).toAggregateExpression())
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }
}

Source File: resources.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: WholeStageCodegenSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions.{avg, broadcast, col, max}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {

  test("range/filter should be combined") {
    val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)
    assert(df.collect() === Array(Row(2)))
  }

  test("Aggregate should be included in WholeStageCodegen") {
    val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(9, 4.5)))
  }

  test("Aggregate with grouping keys should be included in WholeStageCodegen") {
    val df = spark.range(3).groupBy("id").count().orderBy("id")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1)))
  }

  test("BroadcastHashJoin should be included in WholeStageCodegen") {
    val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2")))
    val schema = new StructType().add("k", IntegerType).add("v", StringType)
    val smallDF = spark.createDataFrame(rdd, schema)
    val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id"))
    assert(df.queryExecution.executedPlan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined)
    assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2")))
  }

  test("Sort should be included in WholeStageCodegen") {
    val df = spark.range(3, 0, -1).toDF().sort(col("id"))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined)
    assert(df.collect() === Array(Row(1), Row(2), Row(3)))
  }

  test("MapElements should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = spark.range(10).map(_.toString)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined)
    assert(ds.collect() === 0.until(10).map(_.toString).toArray)
  }

  test("typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 2, 4, 6, 8))
  }

  test("back-to-back typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 6))
  }

  test("simple typed UDAF should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS()
      .groupByKey(_._1).agg(typed.sum(_._2))

    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0)))
  }
}

Source File: GroupedIteratorSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
}

Source File: DDLSourceLoadSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: CarbonDataFrameExample.scala From CarbonDataLearning with GNU General Public License v3.0

5 votes

package org.github.xubo245.carbonDataLearning.example

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.spark.sql.{SaveMode, SparkSession}

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1")  // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    df.write
      .format("csv")
      .option("tableName", "csv_df_table")
      .option("partitionColumns", "c1") // a list of column names
      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .mode(SaveMode.Overwrite)
      .csv("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv/1.csv")

    // Reads carbondata to dataframe
    val carbondf2 = spark.read
      .format("csv")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "csv_df_table")

      //      .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .load("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv")

    carbondf2.show()


    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
}

Source File: TestSFObjectWriter.scala From spark-salesforce with Apache License 2.0

5 votes

package com.springml.spark.salesforce

import org.mockito.Mockito._
import org.mockito.Matchers._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{ FunSuite, BeforeAndAfterEach}
import com.springml.salesforce.wave.api.BulkAPI
import org.apache.spark.{ SparkConf, SparkContext}
import com.springml.salesforce.wave.model.{ JobInfo, BatchInfo}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{ Row, DataFrame, SQLContext}
import org.apache.spark.sql.types.{ StructType, StringType, StructField}

class TestSFObjectWriter extends FunSuite with MockitoSugar with BeforeAndAfterEach {
  val contact = "Contact";
  val jobId = "750B0000000WlhtIAC";
  val batchId = "751B0000000scSHIAY";
  val data = "Id,Description\n003B00000067Rnx,123456\n003B00000067Rnw,7890";

  val bulkAPI = mock[BulkAPI](withSettings().serializable())
  val writer = mock[SFObjectWriter]

  var sparkConf: SparkConf = _
  var sc: SparkContext = _

  override def beforeEach() {
    val jobInfo = new JobInfo
    jobInfo.setId(jobId)
    when(bulkAPI.createJob(contact)).thenReturn(jobInfo)

    val batchInfo = new BatchInfo
    batchInfo.setId(batchId)
    batchInfo.setJobId(jobId)
    when(bulkAPI.addBatch(jobId, data)).thenReturn(batchInfo)

    when(bulkAPI.closeJob(jobId)).thenReturn(jobInfo)
    when(bulkAPI.isCompleted(jobId)).thenReturn(true)

    sparkConf = new SparkConf().setMaster("local").setAppName("Test SF Object Update")
    sc = new SparkContext(sparkConf)
  }

  private def sampleDF() : DataFrame = {
    val rowArray = new Array[Row](2)
    val fieldArray = new Array[String](2)

    fieldArray(0) = "003B00000067Rnx"
    fieldArray(1) = "Desc1"
    rowArray(0) = Row.fromSeq(fieldArray)

    val fieldArray1 = new Array[String](2)
    fieldArray1(0) = "001B00000067Rnx"
    fieldArray1(1) = "Desc2"
    rowArray(1) = Row.fromSeq(fieldArray1)

    val rdd = sc.parallelize(rowArray)
    val schema = StructType(
      StructField("id", StringType, true) ::
      StructField("desc", StringType, true) :: Nil)

    val sqlContext = new SQLContext(sc)
    sqlContext.createDataFrame(rdd, schema)
  }

  test ("Write Object to Salesforce") {
    val df = sampleDF();
    val csvHeader = Utils.csvHeadder(df.schema)
    writer.writeData(df.rdd)
    sc.stop()
  }
}

Source File: TestMetadataConstructor.scala From spark-salesforce with Apache License 2.0

5 votes

package com.springml.spark.salesforce.metadata

import org.apache.spark.sql.types.{StructType, StringType, IntegerType, LongType,
  FloatType, DateType, TimestampType, BooleanType, StructField}
import org.scalatest.FunSuite
import com.springml.spark.salesforce.Utils


class TestMetadataConstructor extends FunSuite {

  test("Test Metadata generation") {
    val columnNames = List("c1", "c2", "c3", "c4")
    val columnStruct = columnNames.map(colName => StructField(colName, StringType, true))
    val schema = StructType(columnStruct)

    val schemaString = MetadataConstructor.generateMetaString(schema,"sampleDataSet", Utils.metadataConfig(null))
    assert(schemaString.length > 0)
    assert(schemaString.contains("sampleDataSet"))
  }

  test("Test Metadata generation With Custom MetadataConfig") {
    val columnNames = List("c1", "c2", "c3", "c4")
    val intField = StructField("intCol", IntegerType, true)
    val longField = StructField("longCol", LongType, true)
    val floatField = StructField("floatCol", FloatType, true)
    val dateField = StructField("dateCol", DateType, true)
    val timestampField = StructField("timestampCol", TimestampType, true)
    val stringField = StructField("stringCol", StringType, true)
    val someTypeField = StructField("someTypeCol", BooleanType, true)

    val columnStruct = Array[StructField] (intField, longField, floatField, dateField, timestampField, stringField, someTypeField)

    val schema = StructType(columnStruct)

    var metadataConfig = Map("string" -> Map("wave_type" -> "Text"))
    metadataConfig += ("integer" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "0", "defaultValue" -> "100"))
    metadataConfig += ("float" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "2"))
    metadataConfig += ("long" -> Map("wave_type" -> "Numeric", "precision" -> "18", "scale" -> "0"))
    metadataConfig += ("date" -> Map("wave_type" -> "Date", "format" -> "yyyy/MM/dd"))
    metadataConfig += ("timestamp" -> Map("wave_type" -> "Date", "format" -> "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))


    val schemaString = MetadataConstructor.generateMetaString(schema, "sampleDataSet", metadataConfig)
    assert(schemaString.length > 0)
    assert(schemaString.contains("sampleDataSet"))
    assert(schemaString.contains("Numeric"))
    assert(schemaString.contains("precision"))
    assert(schemaString.contains("scale"))
    assert(schemaString.contains("18"))
    assert(schemaString.contains("Text"))
    assert(schemaString.contains("Date"))
    assert(schemaString.contains("format"))
    assert(schemaString.contains("defaultValue"))
    assert(schemaString.contains("100"))
    assert(schemaString.contains("yyyy/MM/dd"))
    assert(schemaString.contains("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'"))
  }
}

Source File: SparkScoreDoc.scala From spark-lucenerdd with Apache License 2.0

5 votes

package org.zouzias.spark.lucenerdd.models

import org.apache.lucene.document.Document
import org.apache.lucene.index.IndexableField
import org.apache.lucene.search.{IndexSearcher, ScoreDoc}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.inferNumericType
import org.zouzias.spark.lucenerdd.models.SparkScoreDoc.{DocIdField, ScoreField, ShardField}

import scala.collection.JavaConverters._

sealed trait FieldType extends Serializable
object TextType extends FieldType
object IntType extends FieldType
object DoubleType extends FieldType
object LongType extends FieldType
object FloatType extends FieldType



  private def inferNumericType(num: Number): FieldType = {
    num match {
      case _: java.lang.Double => DoubleType
      case _: java.lang.Long => LongType
      case _: java.lang.Integer => IntType
      case _: java.lang.Float => FloatType
      case _ => TextType
    }
  }
}

Source File: Schema.scala From incubator-s2graph with Apache License 2.0

5 votes

package org.apache.s2graph.s2jobs

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object Schema {
  
  val GraphElementSchema = StructType(CommonFields ++ Seq(
    StructField("id", StringType, nullable = true),
    StructField("service", StringType, nullable = true),
    StructField("column", StringType, nullable = true),
    StructField("from", StringType, nullable = true),
    StructField("to", StringType, nullable = true),
    StructField("label", StringType, nullable = true),
    StructField("props", StringType, nullable = true)
  ))
}

Source File: ExecutePython.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._


class ExecutePython extends ConfigurableStop{
  override val authorEmail: String = "xjzhu@cnic.cn"
  override val description: String = "Execute python script"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var script : String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)

    descriptor = script :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }
  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val jep = new Jep()
    val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
    FileUtil.writeFile(script,scriptPath)
    jep.runScript(scriptPath)
  }
}

Source File: ExecutePythonWithDataFrame.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import java.util
import java.util.UUID

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf.{ConfigurableStop, Port, StopGroup}
import cn.piflow.util.FileUtil
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import jep.Jep
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.collection.JavaConversions._


class ExecutePythonWithDataFrame extends ConfigurableStop{
  override val authorEmail: String = "xjzhu@cnic.cn"
  override val description: String = "Execute python script with dataframe"
  override val inportList: List[String] = List(Port.DefaultPort)
  override val outportList: List[String] = List(Port.DefaultPort)

  var script : String = _
  var execFunction : String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    script = MapUtil.get(map,"script").asInstanceOf[String]
    execFunction = MapUtil.get(map,"execFunction").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val script = new PropertyDescriptor()
      .name("script")
      .displayName("script")
      .description("The code of python")
      .defaultValue("")
      .required(true)
    val execFunction = new PropertyDescriptor()
      .name("execFunction")
      .displayName("execFunction")
      .description("The function of python script to be executed.")
      .defaultValue("")
      .required(true)
    descriptor = script :: descriptor
    descriptor = execFunction :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/python.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup)
  }
  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val spark = pec.get[SparkSession]()

    val df = in.read()

    val jep = new Jep()
    val scriptPath = "/tmp/pythonExcutor-"+ UUID.randomUUID() +".py"
    FileUtil.writeFile(script,scriptPath)
    jep.runScript(scriptPath)


    val listInfo = df.toJSON.collectAsList()
    jep.eval(s"result = $execFunction($listInfo)")
    val resultArrayList = jep.getValue("result",new util.ArrayList().getClass)
    println(resultArrayList)


    var resultList = List[Map[String, Any]]()
    val it = resultArrayList.iterator()
    while(it.hasNext){
      val i = it.next().asInstanceOf[java.util.HashMap[String, Any]]
      val item =  mapAsScalaMap(i).toMap[String, Any]
      resultList =  item +: resultList
    }


    val rows = resultList.map( m => Row(m.values.toSeq:_*))
    val header = resultList.head.keys.toList
    val schema = StructType(header.map(fieldName => new StructField(fieldName, StringType, true)))

    val rdd = spark.sparkContext.parallelize(rows)
    val resultDF = spark.createDataFrame(rdd, schema)

    out.write(resultDF)
  }
}

Source File: DataFrameRowParser.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.script

import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import cn.piflow.conf._
import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession}

import scala.beans.BeanProperty

class DataFrameRowParser extends ConfigurableStop{

  val authorEmail: String = "xjzhu@cnic.cn"
  val description: String = "Create dataframe by schema"
  val inportList: List[String] = List(Port.DefaultPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  var schema: String = _
  var separator: String = _

  override def setProperties(map: Map[String, Any]): Unit = {
    schema = MapUtil.get(map,"schema").asInstanceOf[String]
    separator = MapUtil.get(map,"separator").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val schema = new PropertyDescriptor().name("schema").displayName("schema").description("The schema of dataframe").defaultValue("").required(true)
    val separator = new PropertyDescriptor().name("separator").displayName("separator").description("The separator of schema").defaultValue("").required(true)
    descriptor = schema :: descriptor
    descriptor = separator :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/script/DataFrameRowParser.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.ScriptGroup.toString)
  }


  override def initialize(ctx: ProcessContext): Unit = {}

  override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val inDF = in.read()

    //parse RDD
    val rdd = inDF.rdd.map(row => {
      val fieldArray = row.get(0).asInstanceOf[String].split(",")
      Row.fromSeq(fieldArray.toSeq)
    })

    //parse schema
    val field = schema.split(separator)
    val structFieldArray : Array[StructField] = new Array[StructField](field.size)
    for(i <- 0 to field.size - 1){
      structFieldArray(i) = new StructField(field(i),StringType, nullable = true)
    }
    val schemaStructType = StructType(structFieldArray)

    //create DataFrame
    val df = spark.createDataFrame(rdd,schemaStructType)
    //df.show()
    out.write(df)
  }

}

Source File: WordSpliter.scala From piflow with BSD 2-Clause "Simplified" License

5 votes

package cn.piflow.bundle.nlp

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import com.huaban.analysis.jieba.JiebaSegmenter.SegMode
import com.huaban.analysis.jieba._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

import scala.collection.JavaConverters._
import scala.collection.mutable.ArrayBuffer

class WordSpliter extends ConfigurableStop {

  val authorEmail: String = "huchuan0901@163.com"
  val description: String = "Word segmentation"
  val inportList: List[String] = List(Port.AnyPort.toString)
  val outportList: List[String] = List(Port.DefaultPort.toString)

  var path:String = _


  val jiebaSegmenter = new JiebaSegmenter()
  var tokenARR:ArrayBuffer[String]=ArrayBuffer()

  def segmenter(str:String): Unit ={

    var strVar = str
    //delete symbol
    strVar = strVar.replaceAll( "[\\p{P}+~$`^=|<>～｀＄＾＋＝｜＜＞￥×+\\s]" , "");

    val tokens = jiebaSegmenter.process(strVar,SegMode.SEARCH).asScala

    for (token: SegToken <- tokens){

        tokenARR += token.word

    }
  }

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {

    val session: SparkSession = pec.get[SparkSession]()

    //read
    val strDF = session.read.text(path)

    //segmenter
    segmenter(strDF.head().getString(0))

    //write df
    val rows: List[Row] = tokenARR.map(each => {
      var arr:Array[String]=Array(each)
      val row: Row = Row.fromSeq(arr)
      row
    }).toList
    val rowRDD: RDD[Row] = session.sparkContext.makeRDD(rows)
    val schema: StructType = StructType(Array(
      StructField("words",StringType)
    ))
    val df: DataFrame = session.createDataFrame(rowRDD,schema)

    out.write(df)
  }

  def initialize(ctx: ProcessContext): Unit = {

  }

  def setProperties(map : Map[String, Any]) = {
    path = MapUtil.get(map,"path").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()
    val path = new PropertyDescriptor().name("path").displayName("path").description("The path of text file").defaultValue("").required(true)
    descriptor = path :: descriptor
    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/nlp/NLP.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.Alg_NLPGroup.toString)
  }

}

Source File: BasicDataSourceSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class BasicDataSourceSuite extends BaseDataSourceTest("test_datasource_basic") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2, "TiDB")
  private val row3 = Row(3, "Spark")
  private val row4 = Row(4, null)

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")
  }

  test("Test Select") {
    if (!supportBatchWrite) {
      cancel
    }

    testTiDBSelect(Seq(row1, row2))
  }

  test("Test Write Append") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    df.write
      .format("tidb")
      .options(tidbOptions)
      .option("database", database)
      .option("table", table)
      .mode("append")
      .save()

    testTiDBSelect(Seq(row1, row2, row3, row4))
  }

  test("Test Write Overwrite") {
    if (!supportBatchWrite) {
      cancel
    }

    val data: RDD[Row] = sc.makeRDD(List(row3, row4))
    val df = sqlContext.createDataFrame(data, schema)

    val caught = intercept[TiBatchWriteException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .mode("overwrite")
        .save()
    }

    assert(
      caught.getMessage
        .equals("SaveMode: Overwrite is not supported. TiSpark only support SaveMode.Append."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: CheckUnsupportedSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import com.pingcap.tikv.exception.TiBatchWriteException
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class CheckUnsupportedSuite extends BaseDataSourceTest("test_datasource_check_unsupported") {

  override def beforeAll(): Unit =
    super.beforeAll()

  test("Test write to partition table") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()

    tidbStmt.execute("set @@tidb_enable_table_partition = 1")

    jdbcUpdate(
      s"create table $dbtable(i int, s varchar(128)) partition by range(i) (partition p0 values less than maxvalue)")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello')")

    val row1 = Row(null, "Hello")
    val row2 = Row(2, "TiDB")
    val row3 = Row(3, "Spark")

    val schema = StructType(List(StructField("i", IntegerType), StructField("s", StringType)))

    {
      val caught = intercept[TiBatchWriteException] {
        tidbWrite(List(row2, row3), schema)
      }
      assert(
        caught.getMessage
          .equals("tispark currently does not support write data to partition table!"))
    }

    testTiDBSelect(Seq(row1))
  }

  test("Check Virtual Generated Column") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT,  c3 INT AS (c1 + c2))")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      List(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))

    val caught = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      caught.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))

  }

  test("Check Stored Generated Column") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i INT, c1 INT, c2 INT,  c3 INT AS (c1 + c2) STORED)")

    val row1 = Row(1, 2, 3)
    val schema = StructType(
      List(
        StructField("i", IntegerType),
        StructField("c1", IntegerType),
        StructField("c2", IntegerType)))
    val caught = intercept[TiBatchWriteException] {
      tidbWrite(List(row1), schema)
    }
    assert(
      caught.getMessage
        .equals("tispark currently does not support write data to table with generated column!"))

  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: TiSparkTypeSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

class TiSparkTypeSuite extends BaseDataSourceTest("type_test") {
  private val row1 = Row(null, "Hello")
  private val row2 = Row(2L, "TiDB")
  private val row3 = Row(3L, "Spark")
  private val row5 = Row(Long.MaxValue, "Duplicate")

  private val schema = StructType(List(StructField("i", LongType), StructField("s", StringType)))
  test("bigint test") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i bigint, s varchar(128))")
    jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')")

    tidbWrite(List(row3, row5), schema)
    testTiDBSelect(List(row1, row2, row3, row5))
  }
}

Source File: MissingParameterSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class MissingParameterSuite extends BaseDataSourceTest("test_datasource_missing_parameter") {
  private val row1 = Row(null, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  test("Missing parameter: database") {
    if (!supportBatchWrite) {
      cancel
    }

    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")

    val caught = intercept[IllegalArgumentException] {
      val rows = row1 :: Nil
      val data: RDD[Row] = sc.makeRDD(rows)
      val df = sqlContext.createDataFrame(data, schema)
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("table", table)
        .mode("append")
        .save()
    }
    assert(
      caught.getMessage
        .equals("requirement failed: Option 'database' is required."))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: BatchWriteIssueSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark

import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class BatchWriteIssueSuite extends BaseDataSourceTest("test_batchwrite_issue") {
  override def beforeAll(): Unit = {
    super.beforeAll()
  }

  test("Combine unique index with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), CONSTRAINT ab UNIQUE (a, b))")
  }

  test("Combine primary key with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a, b))")
  }

  test("PK is handler with null value test") {
    doTestNullValues(s"create table $dbtable(a int, b varchar(64), PRIMARY KEY (a))")
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }

  private def doTestNullValues(createTableSQL: String): Unit = {
    if (!supportBatchWrite) {
      cancel
    }
    val schema = StructType(
      List(
        StructField("a", IntegerType),
        StructField("b", StringType),
        StructField("c", StringType)))

    val options = Some(Map("replace" -> "true"))

    dropTable()
    jdbcUpdate(createTableSQL)
    jdbcUpdate(s"alter table $dbtable add column to_delete int")
    jdbcUpdate(s"alter table $dbtable add column c varchar(64) default 'c33'")
    jdbcUpdate(s"alter table $dbtable drop column to_delete")
    jdbcUpdate(s"""
                  |insert into $dbtable values(11, 'c12', null);
                  |insert into $dbtable values(21, 'c22', null);
                  |insert into $dbtable (a, b) values(31, 'c32');
                  |insert into $dbtable values(41, 'c42', 'c43');
                  |
      """.stripMargin)

    assert(queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head == null)
    assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
    assert(
      queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head.toString.equals("c33"))
    assert(
      queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))

    {
      val row1 = Row(11, "c12", "c13")
      val row3 = Row(31, "c32", null)

      tidbWrite(List(row1, row3), schema, options)

      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head.toString.equals("c13"))
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head == null)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))
    }

    {
      val row1 = Row(11, "c12", "c213")
      val row3 = Row(31, "c32", "tt")
      tidbWrite(List(row1, row3), schema, options)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=11").head.head.toString.equals("c213"))
      assert(queryTiDBViaJDBC(s"select c from $dbtable where a=21").head.head == null)
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=31").head.head.toString.equals("tt"))
      assert(
        queryTiDBViaJDBC(s"select c from $dbtable where a=41").head.head.toString.equals("c43"))
    }
  }
}

Source File: LockTimeoutSuite.scala From tispark with Apache License 2.0

5 votes

package com.pingcap.tispark.ttl

import com.pingcap.tikv.TTLManager
import com.pingcap.tikv.exception.GrpcException
import com.pingcap.tispark.datasource.BaseDataSourceTest
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class LockTimeoutSuite extends BaseDataSourceTest("test_lock_timeout") {
  private val row1 = Row(1, "Hello")

  private val schema = StructType(
    List(StructField("i", IntegerType), StructField("s", StringType)))

  override def beforeAll(): Unit = {
    super.beforeAll()
    dropTable()
    jdbcUpdate(s"create table $dbtable(i int, s varchar(128))")
  }

  test("Test Lock TTL Timeout") {
    if (!supportTTLUpdate) {
      cancel
    }

    val seconds = 1000
    val sleep1 = TTLManager.MANAGED_LOCK_TTL + 10 * seconds
    val sleep2 = TTLManager.MANAGED_LOCK_TTL + 15 * seconds

    val data: RDD[Row] = sc.makeRDD(List(row1))
    val df = sqlContext.createDataFrame(data, schema)

    new Thread(new Runnable {
      override def run(): Unit = {
        Thread.sleep(sleep1)
        queryTiDBViaJDBC(s"select * from $dbtable")
      }
    }).start()

    val grpcException = intercept[GrpcException] {
      df.write
        .format("tidb")
        .options(tidbOptions)
        .option("database", database)
        .option("table", table)
        .option("sleepAfterPrewritePrimaryKey", sleep2)
        .mode("append")
        .save()
    }

    assert(grpcException.getMessage.equals("retry is exhausted."))
    assert(grpcException.getCause.getMessage.startsWith("Txn commit primary key failed"))
    assert(
      grpcException.getCause.getCause.getMessage.startsWith(
        "Key exception occurred and the reason is retryable: \"Txn(Mvcc(TxnLockNotFound"))
  }

  override def afterAll(): Unit =
    try {
      dropTable()
    } finally {
      super.afterAll()
    }
}

Source File: RedisStreamProvider.scala From spark-redis with BSD 3-Clause "New" or "Revised" License

5 votes

package org.apache.spark.sql.redis.stream

import com.redislabs.provider.redis.util.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}


class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging {

  override def shortName(): String = "redis"

  override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType],
                            providerName: String, parameters: Map[String, String]): (String, StructType) = {
    providerName -> schema.getOrElse {
      StructType(Seq(StructField("_id", StringType)))
    }
  }

  override def createSource(sqlContext: SQLContext, metadataPath: String,
                            schema: Option[StructType], providerName: String,
                            parameters: Map[String, String]): Source = {
    val (_, ss) = sourceSchema(sqlContext, schema, providerName, parameters)
    val source = new RedisSource(sqlContext, metadataPath, Some(ss), parameters)
    source.start()
    source
  }
}

Source File: XSDToSchemaSuite.scala From spark-xml with Apache License 2.0

5 votes

package com.databricks.spark.xml.util

import java.nio.file.Paths

import org.apache.spark.sql.types.{ArrayType, StructField, StructType, StringType}
import org.scalatest.funsuite.AnyFunSuite

class XSDToSchemaSuite extends AnyFunSuite {

  test("Basic parsing") {
    val parsedSchema = XSDToSchema.read(Paths.get("src/test/resources/basket.xsd"))
    val expectedSchema = StructType(Array(
      StructField("basket", StructType(Array(
        StructField("entry", ArrayType(
          StructType(Array(
            StructField("key", StringType),
            StructField("value", StringType)
          )))
        ))
      )))
    )
    assert(expectedSchema === parsedSchema)
  }

}

Source File: SparkExecuteStatementOperationSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, NullType, StringType, StructField, StructType}

class SparkExecuteStatementOperationSuite extends SparkFunSuite {
  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val field1 = StructField("NULL", NullType)
    val field2 = StructField("(IF(true, NULL, NULL))", NullType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }

  test("SPARK-20146 Comment should be preserved") {
    val field1 = StructField("column1", StringType).withComment("comment 1")
    val field2 = StructField("column2", IntegerType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.STRING_TYPE)
    assert(columns.get(0).getComment() == "comment 1")
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.INT_TYPE)
    assert(columns.get(1).getComment() == "")
  }
}

Source File: datasources.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLShowDatasourcesCommand(datasourcePattern: Option[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("dataSourceName", StringType, nullable = false)() :: Nil
  }

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val datasources =
      datasourcePattern
        .map { pattern =>
          catalog.listDatasources(pattern)
        }
        .getOrElse(catalog.listDatasources())
    datasources.map { d =>
      Row(d)
    }
  }
}

case class XSQLAddDatasourceCommand(dataSourceName: String, properties: Map[String, String])
  extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.addDataSource(dataSourceName, properties)
    Seq.empty[Row]
  }
}

case class XSQLRemoveDatasourceCommand(dataSourceName: String, ifExists: Boolean)
  extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.removeDataSource(dataSourceName, ifExists)
    Seq.empty[Row]
  }
}

case class XSQLRefreshDatasourceCommand(dataSourceName: String) extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    catalog.refreshDataSource(dataSourceName)
    Seq.empty[Row]
  }
}

Source File: databases.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.xsql.XSQLSessionCatalog


case class XSQLSetDatabaseCommand(dataSourceName: Option[String], databaseName: String)
  extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    if (dataSourceName.isEmpty) {
      catalog.setCurrentDatabase(databaseName)
    } else {
      catalog.setCurrentDatabase(dataSourceName.get, databaseName)
    }
    Seq.empty[Row]
  }
}

Source File: inputFileBlock.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.rdd.InputFileBlockHolder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String


@ExpressionDescription(
  usage = "_FUNC_() - Returns the name of the file being read, or empty string if not available.")
case class InputFileName() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = false

  override def dataType: DataType = StringType

  override def prettyName: String = "input_file_name"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): UTF8String = {
    InputFileBlockHolder.getInputFilePath
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getInputFilePath();",
      isNull = FalseLiteral)
  }
}


@ExpressionDescription(
  usage = "_FUNC_() - Returns the start offset of the block being read, or -1 if not available.")
case class InputFileBlockStart() extends LeafExpression with Nondeterministic {
  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_start"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getStartOffset
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getStartOffset();", isNull = FalseLiteral)
  }
}


@ExpressionDescription(
  usage = "_FUNC_() - Returns the length of the block being read, or -1 if not available.")
case class InputFileBlockLength() extends LeafExpression with Nondeterministic {
  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_length"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getLength
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getLength();", isNull = FalseLiteral)
  }
}

Source File: StatsEstimationTestBase.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.statsEstimation

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{IntegerType, StringType}


trait StatsEstimationTestBase extends SparkFunSuite {

  var originalValue: Boolean = false

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Enable stats estimation based on CBO.
    originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED)
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, true)
  }

  override def afterAll(): Unit = {
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue)
    super.afterAll()
  }

  def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match {
    // For UTF8String: base + offset + numBytes
    case StringType => colStat.avgLen.getOrElse(attribute.dataType.defaultSize.toLong) + 8 + 4
    case _ => colStat.avgLen.getOrElse(attribute.dataType.defaultSize)
  }

  def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)()

  
case class StatsTestPlan(
    outputList: Seq[Attribute],
    rowCount: BigInt,
    attributeStats: AttributeMap[ColumnStat],
    size: Option[BigInt] = None) extends LeafNode {
  override def output: Seq[Attribute] = outputList
  override def computeStats(): Statistics = Statistics(
    // If sizeInBytes is useless in testing, we just use a fake value
    sizeInBytes = size.getOrElse(Int.MaxValue),
    rowCount = Some(rowCount),
    attributeStats = attributeStats)
}

Source File: ScalaUDFSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import java.util.Locale

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.types.{IntegerType, StringType}

class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("basic") {
    val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil, true :: Nil)
    checkEvaluation(intUdf, 2)

    val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil)
    checkEvaluation(stringUdf, "ax")
  }

  test("better error message for NPE") {
    val udf = ScalaUDF(
      (s: String) => s.toLowerCase(Locale.ROOT),
      StringType,
      Literal.create(null, StringType) :: Nil,
      true :: Nil)

    val e1 = intercept[SparkException](udf.eval())
    assert(e1.getMessage.contains("Failed to execute user defined function"))

    val e2 = intercept[SparkException] {
      checkEvaluationWithUnsafeProjection(udf, null)
    }
    assert(e2.getMessage.contains("Failed to execute user defined function"))
  }

  test("SPARK-22695: ScalaUDF should not use global variables") {
    val ctx = new CodegenContext
    ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil).genCode(ctx)
    assert(ctx.inlinedMutableStates.isEmpty)
  }
}

Source File: CallMethodViaReflectionSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import java.sql.Timestamp

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.types.{IntegerType, StringType}


class CallMethodViaReflectionSuite extends SparkFunSuite with ExpressionEvalHelper {

  import CallMethodViaReflection._

  // Get rid of the $ so we are getting the companion object's name.
  private val staticClassName = ReflectStaticClass.getClass.getName.stripSuffix("$")
  private val dynamicClassName = classOf[ReflectDynamicClass].getName

  test("findMethod via reflection for static methods") {
    assert(findMethod(staticClassName, "method1", Seq.empty).exists(_.getName == "method1"))
    assert(findMethod(staticClassName, "method2", Seq(IntegerType)).isDefined)
    assert(findMethod(staticClassName, "method3", Seq(IntegerType)).isDefined)
    assert(findMethod(staticClassName, "method4", Seq(IntegerType, StringType)).isDefined)
  }

  test("findMethod for a JDK library") {
    assert(findMethod(classOf[java.util.UUID].getName, "randomUUID", Seq.empty).isDefined)
  }

  test("class not found") {
    val ret = createExpr("some-random-class", "method").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("not found") && errorMsg.contains("class"))
  }

  test("method not found because name does not match") {
    val ret = createExpr(staticClassName, "notfoundmethod").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("cannot find a static method"))
  }

  test("method not found because there is no static method") {
    val ret = createExpr(dynamicClassName, "method1").checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("cannot find a static method"))
  }

  test("input type checking") {
    assert(CallMethodViaReflection(Seq.empty).checkInputDataTypes().isFailure)
    assert(CallMethodViaReflection(Seq(Literal(staticClassName))).checkInputDataTypes().isFailure)
    assert(CallMethodViaReflection(
      Seq(Literal(staticClassName), Literal(1))).checkInputDataTypes().isFailure)
    assert(createExpr(staticClassName, "method1").checkInputDataTypes().isSuccess)
  }

  test("unsupported type checking") {
    val ret = createExpr(staticClassName, "method1", new Timestamp(1)).checkInputDataTypes()
    assert(ret.isFailure)
    val errorMsg = ret.asInstanceOf[TypeCheckFailure].message
    assert(errorMsg.contains("arguments from the third require boolean, byte, short"))
  }

  test("invoking methods using acceptable types") {
    checkEvaluation(createExpr(staticClassName, "method1"), "m1")
    checkEvaluation(createExpr(staticClassName, "method2", 2), "m2")
    checkEvaluation(createExpr(staticClassName, "method3", 3), "m3")
    checkEvaluation(createExpr(staticClassName, "method4", 4, "four"), "m4four")
  }

  private def createExpr(className: String, methodName: String, args: Any*) = {
    CallMethodViaReflection(
      Literal.create(className, StringType) +:
      Literal.create(methodName, StringType) +:
      args.map(Literal.apply)
    )
  }
}

Source File: LikeSimplificationSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{BooleanType, StringType}

class LikeSimplificationSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Like Simplification", Once,
        LikeSimplification) :: Nil
  }

  val testRelation = LocalRelation('a.string)

  test("simplify Like into StartsWith") {
    val originalQuery =
      testRelation
        .where(('a like "abc%") || ('a like "abc\\%"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(StartsWith('a, "abc") || ('a like "abc\\%"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into EndsWith") {
    val originalQuery =
      testRelation
        .where('a like "%xyz")

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(EndsWith('a, "xyz"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into startsWith and EndsWith") {
    val originalQuery =
      testRelation
        .where(('a like "abc\\%def") || ('a like "abc%def"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(('a like "abc\\%def") ||
        (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into Contains") {
    val originalQuery =
      testRelation
        .where(('a like "%mn%") || ('a like "%mn\\%"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(Contains('a, "mn") || ('a like "%mn\\%"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into EqualTo") {
    val originalQuery =
      testRelation
        .where(('a like "") || ('a like "abc"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(('a === "") || ('a === "abc"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("null pattern") {
    val originalQuery = testRelation.where('a like Literal(null, StringType)).analyze
    val optimized = Optimize.execute(originalQuery)
    comparePlans(optimized, testRelation.where(Literal(null, BooleanType)).analyze)
  }
}

Source File: RewriteDistinctAggregatesSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.{CASE_SENSITIVE, GROUP_BY_ORDINAL}
import org.apache.spark.sql.types.{IntegerType, StringType}

class RewriteDistinctAggregatesSuite extends PlanTest {
  override val conf = new SQLConf().copy(CASE_SENSITIVE -> false, GROUP_BY_ORDINAL -> false)
  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)

  val nullInt = Literal(null, IntegerType)
  val nullString = Literal(null, StringType)
  val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int)

  private def checkRewrite(rewrite: LogicalPlan): Unit = rewrite match {
    case Aggregate(_, _, Aggregate(_, _, _: Expand)) =>
    case _ => fail(s"Plan is not rewritten:\n$rewrite")
  }

  test("single distinct group") {
    val input = testRelation
      .groupBy('a)(countDistinct('e))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        max('b).as('agg2))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("multiple distinct groups") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with partial aggregates") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d), sum('e))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with non-partial aggregates") {
    val input = testRelation
      .groupBy('a)(
        countDistinct('b, 'c),
        countDistinct('d),
        CollectSet('b).toAggregateExpression())
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }
}

Source File: resources.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: GroupedIteratorSuite.scala From XSQL with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
}

Source File: ShowTablesUsingCommand.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.sources.DatasourceCatalog
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.types.{StringType, StructField, StructType}


private[sql]
case class ShowTablesUsingCommand(provider: String, options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("IS_TEMPORARY", StringType, nullable = false) ::
    StructField("KIND", StringType, nullable = false) ::
    Nil
  ).toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val dataSource: Any = DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider)

    dataSource match {
      case describableRelation: DatasourceCatalog =>
        describableRelation
          .getRelations(sqlContext, new CaseInsensitiveMap(options))
          .map(relationInfo => Row(
            relationInfo.name,
            relationInfo.isTemporary.toString.toUpperCase,
            relationInfo.kind.toUpperCase))
      case _ =>
        throw new RuntimeException(s"The provided data source $provider does not support " +
        "showing its relations.")
    }
  }
}

Source File: DescribeTableUsingCommand.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.TableIdentifierUtils._
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.{DatasourceCatalog, RelationInfo}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}


private[sql]
case class DescribeTableUsingCommand(
    name: TableIdentifier,
    provider: String,
    options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("DDL_STMT", StringType, nullable = false) ::
    Nil
  ).toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    // Convert the table name according to the case-sensitivity settings
    val tableId = name.toSeq
    val resolver = DatasourceResolver.resolverFor(sqlContext)
    val catalog = resolver.newInstanceOfTyped[DatasourceCatalog](provider)

    Seq(catalog
      .getRelation(sqlContext, tableId, new CaseInsensitiveMap(options)) match {
        case None => Row("", "")
        case Some(RelationInfo(relName, _, _, ddl, _)) => Row(
          relName, ddl.getOrElse(""))
    })
  }
}

Source File: DescCommand.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.sources.commands.hive

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Row, SQLContext}


case class DescCommand(ident: TableIdentifier) extends HiveRunnableCommand {

  override protected val commandName: String = s"DESC $ident"

  override def execute(sqlContext: SQLContext): Seq[Row] = {
    val plan = sqlContext.catalog.lookupRelation(ident)
    if (plan.resolved) {
      plan.schema.map { field =>
        Row(field.name, field.dataType.simpleString, None)
      }
    } else {
      Seq.empty
    }
  }

  override lazy val output: Seq[Attribute] =
    AttributeReference("col_name", StringType)() ::
    AttributeReference("data_type", StringType)() ::
    AttributeReference("comment", StringType)() :: Nil
}

Source File: dependenciesSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.TableDependencyCalculator
import org.apache.spark.sql.sources.{RelationKind, Table}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}

object DependenciesSystemTableProvider extends SystemTableProvider with LocalSpark {
  
  override def execute(): Seq[Row] = {
    val tables = getTables(sqlContext.catalog)
    val dependentsMap = buildDependentsMap(tables)

    def kindOf(tableIdentifier: TableIdentifier): String =
      tables
        .get(tableIdentifier)
        .map(plan => RelationKind.kindOf(plan).getOrElse(Table).name)
        .getOrElse(DependenciesSystemTable.UnknownType)
        .toUpperCase

    dependentsMap.flatMap {
      case (tableIdent, dependents) =>
        val curKind = kindOf(tableIdent)
        dependents.map { dependent =>
          val dependentKind = kindOf(dependent)
          Row(
            tableIdent.database.orNull,
            tableIdent.table,
            curKind,
            dependent.database.orNull,
            dependent.table,
            dependentKind,
            ReferenceDependency.id)
        }
    }.toSeq
  }

  override val schema: StructType = DependenciesSystemTable.schema
}

object DependenciesSystemTable extends SchemaEnumeration {
  val baseSchemaName = Field("BASE_SCHEMA_NAME", StringType, nullable = true)
  val baseObjectName = Field("BASE_OBJECT_NAME", StringType, nullable = false)
  val baseObjectType = Field("BASE_OBJECT_TYPE", StringType, nullable = false)
  val dependentSchemaName = Field("DEPENDENT_SCHEMA_NAME", StringType, nullable = true)
  val dependentObjectName = Field("DEPENDENT_OBJECT_NAME", StringType, nullable = false)
  val dependentObjectType = Field("DEPENDENT_OBJECT_TYPE", StringType, nullable = false)
  val dependencyType = Field("DEPENDENCY_TYPE", IntegerType, nullable = false)

  private[DependenciesSystemTable] val UnknownType = "UNKNOWN"
}

Source File: partitionFunctionSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables

import org.apache.spark.sql.execution.tablefunctions.OutputFormatter
import org.apache.spark.sql.sources._
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.apache.spark.sql.util.GenericUtil._


  private def typeNameOf(f: PartitionFunction): String = f match {
    case _: RangePartitionFunction => "RANGE"
    case _: BlockPartitionFunction => "BLOCK"
    case _: HashPartitionFunction => "HASH"
  }
}

object PartitionFunctionSystemTable extends SchemaEnumeration {
  val id = Field("ID", StringType, nullable = false)
  val functionType = Field("TYPE", StringType, nullable = false)
  val columnName = Field("COLUMN_NAME", StringType, nullable = false)
  val columnType = Field("COLUMN_TYPE", StringType, nullable = false)
  val boundaries = Field("BOUNDARIES", StringType, nullable = true)
  val block = Field("BLOCK_SIZE", IntegerType, nullable = true)
  val partitions = Field("PARTITIONS", IntegerType, nullable = true)
  val minP = Field("MIN_PARTITIONS", IntegerType, nullable = true)
  val maxP = Field("MAX_PARTITIONS", IntegerType, nullable = true)
}

Source File: sessionSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{Row, SQLConf, SQLContext}


  private def allSettingsOf(conf: SQLConf): Map[String, String] = {
    val setConfs = conf.getAllConfs
    val defaultConfs = conf.getAllDefinedConfs.collect {
      case (key, default, _) if !setConfs.contains(key) => key -> default
    }
    setConfs ++ defaultConfs
  }

  override def schema: StructType = SessionSystemTable.schema
}

object SessionSystemTable extends SchemaEnumeration {
  val section = Field("SECTION", StringType, nullable = false)
  val key = Field("KEY", StringType, nullable = false)
  val value = Field("VALUE", StringType, nullable = true)
}

Source File: tablesSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.sources._
import org.apache.spark.sql.sources.commands.WithOrigin
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.util.CollectionUtils.CaseInsensitiveMap
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.CaseSensitivityUtils._

object TablesSystemTableProvider extends SystemTableProvider with LocalSpark with ProviderBound {
  
  override def buildScan(requiredColumns: Array[String],
                         filters: Array[Filter]): RDD[Row] =
    DatasourceResolver
      .resolverFor(sqlContext)
      .newInstanceOfTyped[DatasourceCatalog](provider) match {
      case catalog: DatasourceCatalog with DatasourceCatalogPushDown =>
        catalog.getRelations(sqlContext, options, requiredColumns, filters.toSeq.merge)
      case catalog: DatasourceCatalog =>
        val values =
          catalog
            .getRelations(sqlContext, new CaseInsensitiveMap(options))
            .map(relationInfo => Row(
              relationInfo.name,
              relationInfo.isTemporary.toString.toUpperCase,
              relationInfo.kind.toUpperCase,
              relationInfo.provider))
        val rows = schema.buildPrunedFilteredScan(requiredColumns, filters)(values)
        sparkContext.parallelize(rows)
    }
}

sealed trait TablesSystemTable extends SystemTable {
  override def schema: StructType = TablesSystemTable.schema
}

object TablesSystemTable extends SchemaEnumeration {
  val tableName = Field("TABLE_NAME", StringType, nullable = false)
  val isTemporary = Field("IS_TEMPORARY", StringType, nullable = false)
  val kind = Field("KIND", StringType, nullable = false)
  val provider = Field("PROVIDER", StringType, nullable = true)
}

Source File: metadataSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.tablefunctions.OutputFormatter
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.CaseSensitivityUtils._


  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] =
    DatasourceResolver
      .resolverFor(sqlContext)
      .newInstanceOfTyped[MetadataCatalog](provider) match {
      case catalog: MetadataCatalog with MetadataCatalogPushDown =>
        catalog.getTableMetadata(sqlContext, options, requiredColumns, filters.toSeq.merge)
      case catalog =>
        val rows = catalog.getTableMetadata(sqlContext, options).flatMap { tableMetadata =>
          val formatter = new OutputFormatter(tableMetadata.tableName, tableMetadata.metadata)
          formatter.format().map(Row.fromSeq)
        }
        sparkContext.parallelize(schema.buildPrunedFilteredScan(requiredColumns, filters)(rows))
    }

  override def schema: StructType = MetadataSystemTable.schema
}

object MetadataSystemTable extends SchemaEnumeration {
  val tableName = Field("TABLE_NAME", StringType, nullable = false)
  val metadataKey = Field("METADATA_KEY", StringType, nullable = true)
  val metadataValue = Field("METADATA_VALUE", StringType, nullable = true)
}

Source File: relationMappingSystemTable.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis.systables
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{Row, SQLContext}

object RelationMappingSystemTableProvider extends SystemTableProvider with LocalSpark {

  
  override def execute(): Seq[Row] = {
    sqlContext.tableNames().map { tableName =>
      val plan = sqlContext.catalog.lookupRelation(TableIdentifier(tableName))
      val sqlName = plan.collectFirst {
        case s: SqlLikeRelation =>
          s.relationName
        case LogicalRelation(s: SqlLikeRelation, _) =>
          s.relationName
      }
      Row(tableName, sqlName)
    }
  }
}

object RelationMappingSystemTable extends SchemaEnumeration {
  val sparkName = Field("RELATION_NAME", StringType, nullable = false)
  val providerName = Field("SQL_NAME", StringType, nullable = true)
}

Source File: NodeTests.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.hierarchy

import org.apache.spark.Logging
import org.apache.spark.sql.types.{Node, NodeHelpers, StringType}

import scala.collection.mutable.ArrayBuffer

// scalastyle:off magic.number
// scalastyle:off file.size.limit
class NodeTests extends NodeUnitTestSpec with Logging {
    var nodes = ArrayBuffer[Node]()
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 2L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 1L, 3L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 2L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 3L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 1L))
    nodes += Node(path = null, pathDataType = StringType, ordPath = List(1L, 4L, 2L))
  log.info("Running unit tests for sorting class Node\n")
  nodes.toArray should equal {
    // deterministic generator:
    val myRand = new scala.util.Random(42)

    // take copy of array-buffer, shuffle it
    val shuffled_nodes = myRand.shuffle(nodes.toSeq)

    // shuffled?:
    shuffled_nodes should not equal nodes.toArray

    shuffled_nodes.sorted(NodeHelpers.OrderedNode)
  }
  log.info("Testing function compareToRecursive\n")
  val x = Node(null, null)

  0 should equal {x.compareToRecursive(Seq(), Seq())}
  0 should be > {x.compareToRecursive(Seq(), Seq(1))}
  0 should be < {x.compareToRecursive(Seq(1), Seq())}
  0 should equal {x.compareToRecursive(Seq(1,2), Seq(1,2))}
  0 should be < {x.compareToRecursive(Seq(1,2), Seq(1))}
  0 should be > {x.compareToRecursive(Seq(1), Seq(1,2))}

}
// scalastyle:on magic.number
// scalastyle:on file.size.limit

Source File: CollapseExpandSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.analysis.CollapseExpandSuite.SqlLikeCatalystSourceRelation
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.util.PlanComparisonUtils._
import org.apache.spark.sql.{GlobalSapSQLContext, Row}
import org.mockito.Matchers._
import org.mockito.Mockito._
import org.scalatest.FunSuite
import org.scalatest.mock.MockitoSugar


class CollapseExpandSuite extends FunSuite with MockitoSugar with GlobalSapSQLContext {
  case object Leaf extends LeafNode {
    override def output: Seq[Attribute] = Seq.empty
  }

  test("Expansion with a single sequence of projections is correctly collapsed") {
    val expand =
      Expand(
        Seq(Seq('a.string, Literal(1))),
        Seq('a.string, 'gid.int),
        Leaf)

    val collapsed = CollapseExpand(expand)
    assertResult(normalizeExprIds(Project(Seq('a.string, Literal(1) as "gid"), Leaf)))(
      normalizeExprIds(collapsed))
  }

  test("Expansion with multiple projections is correctly collapsed") {
    val expand =
      Expand(
        Seq(
          Seq('a.string, Literal(1)),
          Seq('b.string, Literal(1))),
        Seq('a.string, 'gid1.int, 'b.string, 'gid2.int),
        Leaf)

    val collapsed = CollapseExpand(expand)
    assertResult(
      normalizeExprIds(
        Project(Seq(
            'a.string,
            Literal(1) as "gid1",
            'b.string,
            Literal(1) as "gid2"),
          Leaf)))(normalizeExprIds(collapsed))
  }

  test("Expand pushdown integration") {
    val relation = mock[SqlLikeCatalystSourceRelation]
    when(relation.supportsLogicalPlan(any[Expand]))
      .thenReturn(true)
    when(relation.isMultiplePartitionExecution(any[Seq[CatalystSource]]))
      .thenReturn(true)
    when(relation.schema)
      .thenReturn(StructType(StructField("foo", StringType) :: Nil))
    when(relation.relationName)
      .thenReturn("t")
    when(relation.logicalPlanToRDD(any[LogicalPlan]))
      .thenReturn(sc.parallelize(Seq(Row("a", 1), Row("b", 1), Row("a", 1))))

    sqlc.baseRelationToDataFrame(relation).registerTempTable("t")

    val dataFrame = sqlc.sql("SELECT COUNT(DISTINCT foo) FROM t")
    val Seq(Row(ct)) = dataFrame.collect().toSeq

    assertResult(2)(ct)
  }
}

object CollapseExpandSuite {
  abstract class SqlLikeCatalystSourceRelation
    extends BaseRelation
    with Table
    with SqlLikeRelation
    with CatalystSource
}

Source File: ResolveCountDistinctStarSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.Aggregate
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalatest.FunSuite
import org.scalatest.Inside._
import org.scalatest.mock.MockitoSugar
import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count}

import scala.collection.mutable.ArrayBuffer

class ResolveCountDistinctStarSuite extends FunSuite with MockitoSugar {
  val persons = new LogicalRelation(new BaseRelation {
    override def sqlContext: SQLContext = mock[SQLContext]
    override def schema: StructType = StructType(Seq(
      StructField("age", IntegerType),
      StructField("name", StringType)
    ))
  })

  test("Count distinct star is resolved correctly") {
    val projection = persons.select(UnresolvedAlias(
      AggregateExpression(Count(UnresolvedStar(None) :: Nil), Complete, true)))
    val stillNotCompletelyResolvedAggregate = SimpleAnalyzer.execute(projection)
    val resolvedAggregate = ResolveCountDistinctStar(SimpleAnalyzer)
                              .apply(stillNotCompletelyResolvedAggregate)
    inside(resolvedAggregate) {
      case Aggregate(Nil,
      ArrayBuffer(Alias(AggregateExpression(Count(expressions), Complete, true), _)), _) =>
        assert(expressions.collect {
          case a:AttributeReference => a.name
        }.toSet == Set("name", "age"))
    }
    assert(resolvedAggregate.resolved)
  }
}

Source File: HiveSuite.scala From HANAVora-Extensions with Apache License 2.0

5 votes

package org.apache.spark

import com.sap.spark.{GlobalSparkContext, WithSapHiveContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.SapHiveContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.scalatest.FunSuite


class HiveSuite
  extends FunSuite
  with GlobalSparkContext
  with WithSapHiveContext {

  val schema = StructType(
    StructField("foo", StringType) ::
    StructField("bar", StringType) :: Nil)

  test("NewSession returns a new SapHiveContext") {
    val hiveContext = sqlc.asInstanceOf[SapHiveContext]
    val newHiveContext = hiveContext.newSession()

    assert(newHiveContext.isInstanceOf[SapHiveContext])
    assert(newHiveContext != hiveContext)
  }

  test("NewSession returns a hive context whose catalog is separated to the current one") {
    val newContext = sqlc.newSession()
    val emptyRdd = newContext.createDataFrame(sc.emptyRDD[Row], schema)
    emptyRdd.registerTempTable("foo")

    assert(!sqlc.tableNames().contains("foo"))
    assert(newContext.tableNames().contains("foo"))
  }
}

Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0

5 votes

package org.zuinnote.spark.office.excel

import scala.collection.JavaConversions._

import org.apache.spark.sql.sources.{ BaseRelation, TableScan }
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SQLContext

import org.apache.spark.sql._
import org.apache.spark.rdd.RDD

import org.apache.hadoop.conf._
import org.apache.hadoop.mapreduce._

import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log

import org.zuinnote.hadoop.office.format.common.dao._
import org.zuinnote.hadoop.office.format.mapreduce._

import org.zuinnote.spark.office.excel.util.ExcelFile


  override def buildScan: RDD[Row] = {
    // read ExcelRows
    val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams)
    // map to schema
    val schemaFields = schema.fields
    excelRowsRDD.flatMap(excelKeyValueTuple => {
      // map the Excel row data structure to a Spark SQL schema
      val rowArray = new Array[Any](excelKeyValueTuple._2.get.length)
      var i = 0;
      for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO
        val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length)
        val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]]
        spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue
        spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment
        spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula
        spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress
        spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName
        // add row representing one Excel row
        rowArray(i) = spreadSheetCellDAOStructArray
        i += 1
      }
      Some(Row.fromSeq(rowArray))
    })

  }

}

Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException


class HttpStreamServerClientTest {
	val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
		Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
		Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

	val ROWS2 = Array(Row("hello"),
		Row("world"),
		Row("bye"),
		Row("world"));

	@Test
	def testHttpStreamIO() {
		//starts a http server
		val kryoSerializer = new KryoSerializer(new SparkConf());
		val server = HttpStreamServer.start("/xxxx", 8080);

		val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
			.getOrCreate();
		spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

		val sqlContext = spark.sqlContext;
		import spark.implicits._
		//add a local message buffer to server, with 2 topics registered
		server.withBuffer()
			.addListener(new StreamPrinter())
			.createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
			.createTopic[String]("topic-2");

		val client = HttpStreamClient.connect("http://localhost:8080/xxxx");
		//tests schema of topics
		val schema1 = client.fetchSchema("topic-1");
		Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
			schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		val schema2 = client.fetchSchema("topic-2");
		Assert.assertArrayEquals(Array[Object](StringType),
			schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		//prepare to consume messages
		val sid1 = client.subscribe("topic-1")._1;
		val sid2 = client.subscribe("topic-2")._1;

		//produces some data
		client.sendRows("topic-1", 1, ROWS1);

		val sid4 = client.subscribe("topic-1")._1;
		val sid5 = client.subscribe("topic-2")._1;

		client.sendRows("topic-2", 1, ROWS2);

		//consumes data
		val fetched = client.fetchStream(sid1).map(_.originalRow);
		Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
		//it is empty now
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

		client.unsubscribe(sid4);
		try {
			client.fetchStream(sid4);
			//exception should be thrown, because subscriber id is invalidated
			Assert.assertTrue(false);
		}
		catch {
			case e: Throwable ⇒
				e.printStackTrace();
				Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
		}

		server.stop();
	}
}

Source File: StringMapParitySpec.scala From mleap with Apache License 2.0

5 votes

package org.apache.spark.ml.mleap.parity.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.spark.ml.mleap.feature.StringMap
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructType}

class StringMapParitySpec extends SparkParityBase {
  val names = Seq("alice", "andy", "kevin")
  val rows = spark.sparkContext.parallelize(Seq.tabulate(3) { i => Row(names(i)) })
  val schema = new StructType().add("name", StringType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringMap(uid = "string_map", model = new StringMapModel(
      Map("alice" -> 0, "andy" -> 1, "kevin" -> 2)
    )).setInputCol("name").setOutputCol("index"),
    new StringMap(uid = "string_map2", model = new StringMapModel(
      // This map is missing the label "kevin". Exception is thrown unless HandleInvalid.Keep is set.
      Map("alice" -> 0, "andy" -> 1),
      handleInvalid = HandleInvalid.Keep, defaultValue = 1.0
    )).setInputCol("name").setOutputCol("index2")

  )).fit(dataset)
}

Source File: WrappersSpec.scala From sparksql-scalapb with Apache License 2.0

5 votes

package scalapb.spark

import com.example.protos.wrappers._
import org.apache.spark.sql.SparkSession
import org.apache.hadoop.io.ArrayPrimitiveWritable
import scalapb.GeneratedMessageCompanion
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.Row

import org.scalatest.BeforeAndAfterAll
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.must.Matchers

class WrappersSpec extends AnyFlatSpec with Matchers with BeforeAndAfterAll {
  val spark: SparkSession = SparkSession
    .builder()
    .appName("ScalaPB Demo")
    .master("local[2]")
    .getOrCreate()

  import spark.implicits.StringToColumn

  val data = Seq(
    PrimitiveWrappers(
      intValue = Option(45),
      stringValue = Option("boo"),
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    ),
    PrimitiveWrappers(
      intValue = None,
      stringValue = None,
      ints = Seq(17, 19, 25),
      strings = Seq("foo", "bar")
    )
  )

  "converting df with primitive wrappers" should "work with primitive implicits" in {
    import ProtoSQL.withPrimitiveWrappers.implicits._
    val df = ProtoSQL.withPrimitiveWrappers.createDataFrame(spark, data)
    df.schema.fields.map(_.dataType).toSeq must be(
      Seq(
        IntegerType,
        StringType,
        ArrayType(IntegerType, false),
        ArrayType(StringType, false)
      )
    )
    df.collect must contain theSameElementsAs (
      Seq(
        Row(45, "boo", Seq(17, 19, 25), Seq("foo", "bar")),
        Row(null, null, Seq(17, 19, 25), Seq("foo", "bar"))
      )
    )
  }

  "converting df with primitive wrappers" should "work with default implicits" in {
    import ProtoSQL.implicits._
    val df = ProtoSQL.createDataFrame(spark, data)
    df.schema.fields.map(_.dataType).toSeq must be(
      Seq(
        StructType(Seq(StructField("value", IntegerType, true))),
        StructType(Seq(StructField("value", StringType, true))),
        ArrayType(
          StructType(Seq(StructField("value", IntegerType, true))),
          false
        ),
        ArrayType(
          StructType(Seq(StructField("value", StringType, true))),
          false
        )
      )
    )
    df.collect must contain theSameElementsAs (
      Seq(
        Row(
          Row(45),
          Row("boo"),
          Seq(Row(17), Row(19), Row(25)),
          Seq(Row("foo"), Row("bar"))
        ),
        Row(
          null,
          null,
          Seq(Row(17), Row(19), Row(25)),
          Seq(Row("foo"), Row("bar"))
        )
      )
    )
  }
}

Source File: MultiStreamHandler.scala From structured-streaming-application with Apache License 2.0

5 votes

package knolx.spark

import knolx.Config._
import knolx.KnolXLogger
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{Encoders, SparkSession}


case class CurrentPowerConsumption(kwh: Double)

case class PowerConsumptionStatus(numOfReadings: Long, total: Double, avg: Double, status: String) {
  def compute(newReadings: List[Double]) = {
    val newTotal = newReadings.sum + total
    val newNumOfReadings = numOfReadings + newReadings.size
    val newAvg = newTotal / newNumOfReadings.toDouble

    PowerConsumptionStatus(newNumOfReadings, newTotal, newAvg, "ON")
  }
}

object MultiStreamHandler extends App with KnolXLogger {
  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  val updateStateFunc =
    (deviceId: String, newReadings: Iterator[(String, CurrentPowerConsumption)], state: GroupState[PowerConsumptionStatus]) => {
      val data = newReadings.toList.map { case(_, reading) => reading }.map(_.kwh)

      lazy val initialPowerConsumptionStatus = PowerConsumptionStatus(0L, 0D, 0D, "OFF")
      val currentState = state.getOption.fold(initialPowerConsumptionStatus.compute(data))(_.compute(data))

      val currentStatus =
        if(state.hasTimedOut) {
          // If we do not receive any reading, for a device, we will assume that it is OFF.
          currentState.copy(status = "OFF")
        } else {
          state.setTimeoutDuration("10 seconds")
          currentState
        }

      state.update(currentStatus)
      (deviceId, currentStatus)
    }

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .option("failOnDataLoss", false)
      .option("includeTimestamp", true)
      .load()

  info("Writing data to Console...")
  import spark.implicits._

  implicit val currentPowerConsumptionEncoder = Encoders.kryo[CurrentPowerConsumption]
  implicit val powerConsumptionStatusEncoder = Encoders.kryo[PowerConsumptionStatus]

  val query =
    dataStream
      .select(col("key").cast(StringType).as("key"), col("value").cast(StringType).as("value"))
      .as[(String, String)]
      .map { case(deviceId, unit) =>
        (deviceId, CurrentPowerConsumption(Option(unit).fold(0D)(_.toDouble)))
      }
      .groupByKey { case(deviceId, _) => deviceId }
      .mapGroupsWithState[PowerConsumptionStatus, (String, PowerConsumptionStatus)](GroupStateTimeout.ProcessingTimeTimeout())(updateStateFunc)
      .toDF("deviceId", "current_status")
      .writeStream
      .format("console")
      .option("truncate", false)
      .outputMode(OutputMode.Update())
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}

Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0

5 votes

package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType


object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating Keypsace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")

  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .load()

  info("Writing data to Cassandra...")
  val query =
    dataStream
      .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
      .groupBy(col("word"))
      .agg(sum("count").as("count"))
      .writeStream
      .outputMode(OutputMode.Update())
      .foreach(writeToCassandra)
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}

Source File: EventHubsWriter.scala From azure-event-hubs-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.eventhubs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{ AnalysisException, SparkSession }
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types.{ BinaryType, StringType }
import org.apache.spark.util.Utils


private[eventhubs] object EventHubsWriter extends Logging {

  val BodyAttributeName = "body"
  val PartitionKeyAttributeName = "partitionKey"
  val PartitionIdAttributeName = "partition"
  val PropertiesAttributeName = "properties"

  override def toString: String = "EventHubsWriter"

  private def validateQuery(schema: Seq[Attribute], parameters: Map[String, String]): Unit = {
    schema
      .find(_.name == BodyAttributeName)
      .getOrElse(
        throw new AnalysisException(s"Required attribute '$BodyAttributeName' not found.")
      )
      .dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(
          s"$BodyAttributeName attribute type " +
            s"must be a String or BinaryType.")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      parameters: Map[String, String]
  ): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, parameters)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new EventHubsWriteTask(parameters, schema)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close()
      )
    }
  }
}

Source File: KustoSourceTests.scala From azure-kusto-spark with Apache License 2.0

5 votes

package com.microsoft.kusto.spark

import com.microsoft.kusto.spark.datasource.KustoSourceOptions
import com.microsoft.kusto.spark.utils.{KustoDataSourceUtils => KDSU}
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.junit.runner.RunWith
import org.scalamock.scalatest.MockFactory
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class KustoSourceTests extends FlatSpec with MockFactory with Matchers with BeforeAndAfterAll {
  private val loggingLevel: Option[String] = Option(System.getProperty("logLevel"))
  if (loggingLevel.isDefined) KDSU.setLoggingLevel(loggingLevel.get)

  private val nofExecutors = 4
  private val spark: SparkSession = SparkSession.builder()
    .appName("KustoSource")
    .master(f"local[$nofExecutors]")
    .getOrCreate()

  private var sc: SparkContext = _
  private var sqlContext: SQLContext = _
  private val cluster: String = "KustoCluster"
  private val database: String = "KustoDatabase"
  private val query: String = "KustoTable"
  private val appId: String = "KustoSinkTestApplication"
  private val appKey: String = "KustoSinkTestKey"
  private val appAuthorityId: String = "KustoSinkAuthorityId"

  override def beforeAll(): Unit = {
    super.beforeAll()

    sc = spark.sparkContext
    sqlContext = spark.sqlContext
  }

  override def afterAll(): Unit = {
    super.afterAll()

    sc.stop()
  }

  "KustoDataSource" should "recognize Kusto and get the correct schema" in {
    val spark: SparkSession = SparkSession.builder()
      .appName("KustoSource")
      .master(f"local[$nofExecutors]")
      .getOrCreate()

    val customSchema = "colA STRING, colB INT"

    val df = spark.sqlContext
      .read
      .format("com.microsoft.kusto.spark.datasource")
      .option(KustoSourceOptions.KUSTO_CLUSTER, cluster)
      .option(KustoSourceOptions.KUSTO_DATABASE, database)
      .option(KustoSourceOptions.KUSTO_QUERY, query)
      .option(KustoSourceOptions.KUSTO_AAD_APP_ID, appId)
      .option(KustoSourceOptions.KUSTO_AAD_APP_SECRET, appKey)
      .option(KustoSourceOptions.KUSTO_AAD_AUTHORITY_ID, appAuthorityId)
      .option(KustoSourceOptions.KUSTO_CUSTOM_DATAFRAME_COLUMN_TYPES, customSchema)
      .load("src/test/resources/")

    val expected = StructType(Array(StructField("colA", StringType, nullable = true),StructField("colB", IntegerType, nullable = true)))
    assert(df.schema.equals(expected))
  }
}

Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream


object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)

          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
}

Source File: CubeMakerTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.driver.test.cube

import java.sql.Timestamp

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger}
import com.stratio.sparta.driver.writer.WriterOptions
import com.stratio.sparta.plugin.default.DefaultField
import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField
import com.stratio.sparta.plugin.cube.operator.count.CountOperator
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.TestSuiteBase
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CubeMakerTest extends TestSuiteBase {

  val PreserverOrder = false

  
  def getEventOutput(timestamp: Timestamp, millis: Long):
  Seq[Seq[(DimensionValuesTime, InputFields)]] = {
    val dimensionString = Dimension("dim1", "eventKey", "identity", new DefaultField)
    val dimensionTime = Dimension("minute", "minute", "minute", new DateTimeField)
    val dimensionValueString1 = DimensionValue(dimensionString, "value1")
    val dimensionValueString2 = dimensionValueString1.copy(value = "value2")
    val dimensionValueString3 = dimensionValueString1.copy(value = "value3")
    val dimensionValueTs = DimensionValue(dimensionTime, timestamp)
    val tsMap = Row(timestamp)
    val valuesMap1 = InputFields(Row("value1", timestamp), 1)
    val valuesMap2 = InputFields(Row("value2", timestamp), 1)
    val valuesMap3 = InputFields(Row("value3", timestamp), 1)

    Seq(Seq(
      (DimensionValuesTime("cubeName", Seq(dimensionValueString1, dimensionValueTs)), valuesMap1),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString2, dimensionValueTs)), valuesMap2),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString3, dimensionValueTs)), valuesMap3)
    ))
  }
}

Source File: ParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.sdk.pipeline.transformation

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class ParserTest extends WordSpec with Matchers {

  "Parser" should {

    val parserTest = new ParserMock(
      1,
      Some("input"),
      Seq("output"),
      StructType(Seq(StructField("some", StringType))),
      Map()
    )

    "Order must be " in {
      val expected = 1
      val result = parserTest.getOrder
      result should be(expected)
    }

    "Parse must be " in {
      val event = Row("value")
      val expected = Seq(event)
      val result = parserTest.parse(event)
      result should be(expected)
    }

    "checked fields not be contained in outputs must be " in {
      val keyMap = Map("field" -> "value")
      val expected = Map()
      val result = parserTest.checkFields(keyMap)
      result should be(expected)
    }

    "checked fields are contained in outputs must be " in {
      val keyMap = Map("output" -> "value")
      val expected = keyMap
      val result = parserTest.checkFields(keyMap)
      result should be(expected)
    }

    "classSuffix must be " in {
      val expected = "Parser"
      val result = Parser.ClassSuffix
      result should be(expected)
    }
  }
}

Source File: CassandraOutputTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.output.cassandra

import java.io.{Serializable => JSerializable}

import com.datastax.spark.connector.cql.CassandraConnector
import com.stratio.sparta.sdk._
import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.mock.MockitoSugar
import org.scalatest.{FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class CassandraOutputTest extends FlatSpec with Matchers with MockitoSugar with AnswerSugar {

  val s = "sum"
  val properties = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))

  "getSparkConfiguration" should "return a Seq with the configuration" in {
    val configuration = Map(("connectionHost", "127.0.0.1"), ("connectionPort", "9042"))
    val cass = CassandraOutput.getSparkConfiguration(configuration)

    cass should be(List(("spark.cassandra.connection.host", "127.0.0.1"), ("spark.cassandra.connection.port", "9042")))
  }

  "getSparkConfiguration" should "return all cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("sparkProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(true)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(true)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }

  "getSparkConfiguration" should "not return cassandra-spark config" in {
    val config: Map[String, JSerializable] = Map(
      ("hadoopProperties" -> JsoneyString(
        "[{\"sparkPropertyKey\":\"spark.cassandra.input.fetch.size_in_rows\",\"sparkPropertyValue\":\"2000\"}," +
          "{\"sparkPropertyKey\":\"spark.cassandra.input.split.size_in_mb\",\"sparkPropertyValue\":\"64\"}]")),
      ("anotherProperty" -> "true")
    )

    val sparkConfig = CassandraOutput.getSparkConfiguration(config)

    sparkConfig.exists(_ == ("spark.cassandra.input.fetch.size_in_rows" -> "2000")) should be(false)
    sparkConfig.exists(_ == ("spark.cassandra.input.split.size_in_mb" -> "64")) should be(false)
    sparkConfig.exists(_ == ("anotherProperty" -> "true")) should be(false)
  }
}

Source File: OperatorEntityCountTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.cube.operator.entityCount

import java.io.{Serializable => JSerializable}

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpec}

@RunWith(classOf[JUnitRunner])
class OperatorEntityCountTest extends WordSpec with Matchers {

  "EntityCount" should {
    val props = Map(
      "inputField" -> "inputField".asInstanceOf[JSerializable],
      "split" -> ",".asInstanceOf[JSerializable])
    val schema = StructType(Seq(StructField("inputField", StringType)))
    val entityCount = new OperatorEntityCountMock("op1", schema, props)
    val inputFields = Row("hello,bye")

    "Return the associated precision name" in {
      val expected = Option(Seq("hello", "bye"))
      val result = entityCount.processMap(inputFields)
      result should be(expected)
    }

    "Return empty list" in {
      val expected = None
      val result = entityCount.processMap(Row())
      result should be(expected)
    }
  }
}

Source File: MorphlinesParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.morphline

import java.io.Serializable

import com.stratio.sparta.sdk.pipeline.input.Input
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, Matchers, WordSpecLike}


@RunWith(classOf[JUnitRunner])
class MorphlinesParserTest extends WordSpecLike with Matchers with BeforeAndAfter with BeforeAndAfterAll {

  val morphlineConfig = """
          id : test1
          importCommands : ["org.kitesdk.**"]
          commands: [
          {
              readJson {},
          }
          {
              extractJsonPaths {
                  paths : {
                      col1 : /col1
                      col2 : /col2
                  }
              }
          }
          {
            java {
              code : "return child.process(record);"
            }
          }
          {
              removeFields {
                  blacklist:["literal:_attachment_body"]
              }
          }
          ]
                        """
  val inputField = Some(Input.RawDataKey)
  val outputsFields = Seq("col1", "col2")
  val props: Map[String, Serializable] = Map("morphline" -> morphlineConfig)

  val schema = StructType(Seq(StructField("col1", StringType), StructField("col2", StringType)))

  val parser = new MorphlinesParser(1, inputField, outputsFields, schema, props)

  "A MorphlinesParser" should {

    "parse a simple json" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }

    "parse a simple json removing raw" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row("hello", "world"))

      result should be eq(expected)
    }

    "exclude not configured fields" in {
      val simpleJson =
        """{
            "col1":"hello",
            "col2":"word",
            "col3":"!"
            }
        """
      val input = Row(simpleJson)
      val result = parser.parse(input)

      val expected = Seq(Row(simpleJson, "hello", "world"))

      result should be eq(expected)
    }
  }
}

Source File: DateTimeParserTest.scala From sparta with Apache License 2.0

5 votes

package com.stratio.sparta.plugin.transformation.datetime

import com.stratio.sparta.sdk.properties.JsoneyString
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Matchers, WordSpecLike}

@RunWith(classOf[JUnitRunner])
class DateTimeParserTest extends WordSpecLike with Matchers {

  val inputField = Some("ts")
  val outputsFields = Seq("ts")

  //scalastyle:off
  "A DateTimeParser" should {
    "parse unixMillis to string" in {
      val input = Row(1416330788000L)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unixMillis"))
          .parse(input)

      val expected = Seq(Row(1416330788000L, "1416330788000"))

      assertResult(result)(expected)
    }

    "parse unix to string" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unix"))
          .parse(input)

      val expected = Seq(Row(1416330788, "1416330788000"))

      assertResult(result)(expected)
    }

    "parse unix to string removing raw" in {
      val input = Row(1416330788)
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unix",
          "removeInputField" -> JsoneyString.apply("true")))
          .parse(input)

      val expected = Seq(Row("1416330788000"))

      assertResult(result)(expected)
    }

    "not parse anything if the field does not match" in {
      val input = Row("1212")
      val schema = StructType(Seq(StructField("otherField", StringType)))

      an[IllegalStateException] should be thrownBy new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "unixMillis")).parse(input)
    }

    "not parse anything and generate a new Date" in {
      val input = Row("anything")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "autoGenerated"))
          .parse(input)

      assertResult(result.head.size)(2)
    }

    "Auto generated if inputFormat does not exist" in {
      val input = Row("1416330788")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map()).parse(input)

      assertResult(result.head.size)(2)
    }

    "parse dateTime in hive format" in {
      val input = Row("2015-11-08 15:58:58")
      val schema = StructType(Seq(StructField("ts", StringType)))

      val result =
        new DateTimeParser(1, inputField, outputsFields, schema, Map("inputFormat" -> "hive"))
          .parse(input)

      val expected = Seq(Row("2015-11-08 15:58:58", "1446998338000"))

      assertResult(result)(expected)
    }
  }
}

Source File: StatisticsTest.scala From OAP with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources.oap.statistics

import java.io.ByteArrayOutputStream

import scala.collection.mutable.ArrayBuffer

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BaseOrdering
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateOrdering
import org.apache.spark.sql.execution.datasources.oap.filecache.FiberCache
import org.apache.spark.sql.execution.datasources.oap.index.RangeInterval
import org.apache.spark.sql.execution.datasources.oap.utils.{NonNullKeyReader, NonNullKeyWriter}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.Platform
import org.apache.spark.unsafe.memory.MemoryBlock
import org.apache.spark.unsafe.types.UTF8String

abstract class StatisticsTest extends SparkFunSuite with BeforeAndAfterEach {

  protected def rowGen(i: Int): InternalRow = InternalRow(i, UTF8String.fromString(s"test#$i"))

  protected lazy val schema: StructType = StructType(StructField("a", IntegerType)
    :: StructField("b", StringType) :: Nil)
  @transient
  protected lazy val nnkw: NonNullKeyWriter = new NonNullKeyWriter(schema)
  @transient
  protected lazy val nnkr: NonNullKeyReader = new NonNullKeyReader(schema)
  @transient
  protected lazy val ordering: BaseOrdering = GenerateOrdering.create(schema)
  @transient
  protected lazy val partialOrdering: BaseOrdering =
    GenerateOrdering.create(StructType(schema.dropRight(1)))
  protected var out: ByteArrayOutputStream = _

  protected var intervalArray: ArrayBuffer[RangeInterval] = new ArrayBuffer[RangeInterval]()

  override def beforeEach(): Unit = {
    out = new ByteArrayOutputStream(8000)
  }

  override def afterEach(): Unit = {
    out.close()
    intervalArray.clear()
  }

  protected def generateInterval(
      start: InternalRow, end: InternalRow,
      startInclude: Boolean, endInclude: Boolean): Unit = {
    intervalArray.clear()
    intervalArray.append(new RangeInterval(start, end, startInclude, endInclude))
  }

  protected def checkInternalRow(row1: InternalRow, row2: InternalRow): Unit = {
    val res = row1 == row2 // it works..
    assert(res, s"row1: $row1 does not match $row2")
  }

  protected def wrapToFiberCache(out: ByteArrayOutputStream): FiberCache = {
    val bytes = out.toByteArray
    FiberCache(bytes)
  }
}

Source File: DeltaByteArrayEncoderSuite.scala From OAP with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources.oap.io

import org.scalacheck.{Arbitrary, Gen, Properties}
import org.scalacheck.Prop.forAll
import org.scalatest.prop.Checkers

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter
import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

class DeltaByteArrayEncoderCheck extends Properties("DeltaByteArrayEncoder") {

  private val rowCountInEachGroup = Gen.choose(1, 1024)
  private val rowCountInLastGroup = Gen.choose(1, 1024)
  private val groupCount = Gen.choose(1, 100)

  property("Encoding/Decoding String Type") = forAll { (values: Array[String]) =>

    forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) {
      (rowCount, lastCount, groupCount) =>
        if (values.nonEmpty) {
          // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding'
          // Normally, the test case should be:
          // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class)
          // Validate if 'values' and 'decoded values' are identical.
          // But ColumnValues only support read value form DataFile. So, we have to use another way
          // to validate.
          val referenceFiberBuilder = StringFiberBuilder(rowCount, 0)
          val fiberBuilder = DeltaByteArrayFiberBuilder(rowCount, 0, StringType)
          val fiberParser = DeltaByteArrayDataFiberParser(
            new OapDataFileMetaV1(rowCountInEachGroup = rowCount), StringType)
          !(0 until groupCount).exists { group =>
            // If lastCount > rowCount, assume lastCount = rowCount
            val count = if (group < groupCount - 1) {
              rowCount
            } else if (lastCount > rowCount) {
              rowCount
            } else {
              lastCount
            }
            (0 until count).foreach { row =>
              fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length))))
              referenceFiberBuilder
                .append(InternalRow(UTF8String.fromString(values(row % values.length))))
            }
            val bytes = fiberBuilder.build().fiberData
            val parsedBytes = fiberParser.parse(bytes, count)
            val referenceBytes = referenceFiberBuilder.build().fiberData
            referenceFiberBuilder.clear()
            fiberBuilder.clear()
            assert(parsedBytes.length == referenceBytes.length)
            parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2)
          }
        } else true
    }
  }
}

class DeltaByteArrayEncoderSuite extends SparkFunSuite with Checkers {

  test("Check Encoding/Decoding") {
    check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck()))
  }
}

Source File: DictionaryBasedEncoderSuite.scala From OAP with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.parquet.bytes.BytesInput
import org.apache.parquet.column.page.DictionaryPage
import org.apache.parquet.column.values.dictionary.PlainValuesDictionary.PlainBinaryDictionary
import org.scalacheck.{Arbitrary, Gen, Properties}
import org.scalacheck.Prop.forAll
import org.scalatest.prop.Checkers

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.datasources.oap.adapter.PropertiesAdapter
import org.apache.spark.sql.execution.datasources.oap.filecache.StringFiberBuilder
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

class DictionaryBasedEncoderCheck extends Properties("DictionaryBasedEncoder") {
  private val rowCountInEachGroup = Gen.choose(1, 1024)
  private val rowCountInLastGroup = Gen.choose(1, 1024)
  private val groupCount = Gen.choose(1, 100)

  property("Encoding/Decoding String Type") = forAll { (values: Array[String]) =>

    forAll(rowCountInEachGroup, rowCountInLastGroup, groupCount) {
      (rowCount, lastCount, groupCount) =>
        if (values.nonEmpty) {
          // This is the 'PLAIN' FiberBuilder to validate the 'Encoding/Decoding'
          // Normally, the test case should be:
          // values => encoded bytes => decoded bytes => decoded values (Using ColumnValues class)
          // Validate if 'values' and 'decoded values' are identical.
          // But ColumnValues only support read value form DataFile. So, we have to use another way
          // to validate.
          val referenceFiberBuilder = StringFiberBuilder(rowCount, 0)
          val fiberBuilder = PlainBinaryDictionaryFiberBuilder(rowCount, 0, StringType)
          !(0 until groupCount).exists { group =>
            // If lastCount > rowCount, assume lastCount = rowCount
            val count =
              if (group < groupCount - 1) {
                rowCount
              } else if (lastCount > rowCount) {
                rowCount
              } else {
                lastCount
              }
            (0 until count).foreach { row =>
              fiberBuilder.append(InternalRow(UTF8String.fromString(values(row % values.length))))
              referenceFiberBuilder
                .append(InternalRow(UTF8String.fromString(values(row % values.length))))
            }
            val bytes = fiberBuilder.build().fiberData
            val dictionary = new PlainBinaryDictionary(
              new DictionaryPage(
                BytesInput.from(fiberBuilder.buildDictionary),
                fiberBuilder.getDictionarySize,
                org.apache.parquet.column.Encoding.PLAIN))
            val fiberParser = PlainDictionaryFiberParser(
              new OapDataFileMetaV1(rowCountInEachGroup = rowCount), dictionary, StringType)
            val parsedBytes = fiberParser.parse(bytes, count)
            val referenceBytes = referenceFiberBuilder.build().fiberData
            referenceFiberBuilder.clear()
            referenceFiberBuilder.resetDictionary()
            fiberBuilder.clear()
            fiberBuilder.resetDictionary()
            assert(parsedBytes.length == referenceBytes.length)
            parsedBytes.zip(referenceBytes).exists(byte => byte._1 != byte._2)
          }
        } else {
          true
        }
    }
  }
}

class DictionaryBasedEncoderSuite extends SparkFunSuite with Checkers {

  test("Check Encoding/Decoding") {
    check(PropertiesAdapter.getProp(new DictionaryBasedEncoderCheck()))
  }
}

Source File: MLPipelineTrackerIT.scala From spark-atlas-connector with Apache License 2.0

5 votes

package com.hortonworks.spark.atlas.ml

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.Matchers
import com.hortonworks.spark.atlas._
import com.hortonworks.spark.atlas.types._
import com.hortonworks.spark.atlas.TestUtils._

class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport {
  private val atlasClient = new RestAtlasClient(atlasClientConf)

  def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME)

  def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)
    internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition))
  }

  // Enable it to run integrated test.
  it("pipeline and pipeline model") {
    val uri = "hdfs://"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)

    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity))

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)

    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)

    atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)

    atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity))

    val tableEntities1 = getTableEntity("chris1")
    val tableEntities2 = getTableEntity("chris2")

    atlasClient.createEntitiesWithDependencies(tableEntities1)
    atlasClient.createEntitiesWithDependencies(tableEntities2)

  }
}

Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0

5 votes

package com.hortonworks.spark.atlas.types

import java.io.File

import org.apache.atlas.{AtlasClient, AtlasConstants}
import org.apache.atlas.model.instance.AtlasEntity
import org.apache.commons.io.FileUtils
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}
import org.scalatest.{FunSuite, Matchers}
import com.hortonworks.spark.atlas.TestUtils._
import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport}

class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport {

  def getTableEntity(tableName: String): AtlasEntity = {
    val dbDefinition = createDB("db1", "hdfs:///test/db/db1")
    val sd = createStorageFormat()
    val schema = new StructType()
      .add("user", StringType, false)
      .add("age", IntegerType, true)
    val tableDefinition = createTable("db1", s"$tableName", schema, sd)

    val tableEntities = internal.sparkTableToEntity(
      tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition))
    val tableEntity = tableEntities.entity

    tableEntity
  }

  test("pipeline, pipeline model, fit and transform") {
    val uri = "/"
    val pipelineDir = "tmp/pipeline"
    val modelDir = "tmp/model"

    val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir)
    pipelineDirEntity.entity.getAttribute("uri") should be (uri)
    pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir)
    pipelineDirEntity.dependencies.length should be (0)

    val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir)
    modelDirEntity.entity.getAttribute("uri") should be (uri)
    modelDirEntity.entity.getAttribute("directory") should be (modelDir)
    modelDirEntity.dependencies.length should be (0)

    val df = sparkSession.createDataFrame(Seq(
      (1, Vectors.dense(0.0, 1.0, 4.0), 1.0),
      (2, Vectors.dense(1.0, 0.0, 4.0), 2.0),
      (3, Vectors.dense(1.0, 0.0, 5.0), 3.0),
      (4, Vectors.dense(0.0, 0.0, 5.0), 4.0)
    )).toDF("id", "features", "label")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("features_scaled")
      .setMin(0.0)
      .setMax(3.0)
    val pipeline = new Pipeline().setStages(Array(scaler))

    val model = pipeline.fit(df)

    pipeline.write.overwrite().save(pipelineDir)

    val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity)
    pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING)
    pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (
      pipeline.uid)
    pipelineEntity.entity.getAttribute("name") should be (pipeline.uid)
    pipelineEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false))
    pipelineEntity.dependencies should be (Seq(pipelineDirEntity))

    val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity)
    val modelUid = model.uid.replaceAll("pipeline", "model")
    modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING)
    modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid)
    modelEntity.entity.getAttribute("name") should be (modelUid)
    modelEntity.entity.getRelationshipAttribute("directory") should be (
      AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false))

    modelEntity.dependencies should be (Seq(modelDirEntity))

    FileUtils.deleteDirectory(new File("tmp"))
  }
}

Source File: ProxyFeedback.scala From oni-ml with Apache License 2.0

5 votes

package org.opennetworkinsight.proxy

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StructType, StructField, StringType}
import scala.io.Source

import org.opennetworkinsight.proxy.ProxySchema._


object ProxyFeedback {

  
  def loadFeedbackDF(sc: SparkContext,
                     sqlContext: SQLContext,
                     feedbackFile: String,
                     duplicationFactor: Int): DataFrame = {


    val feedbackSchema = StructType(
      List(StructField(Date, StringType, nullable= true),
        StructField(Time, StringType, nullable= true),
        StructField(ClientIP, StringType, nullable= true),
        StructField(Host, StringType, nullable= true),
        StructField(ReqMethod, StringType, nullable= true),
        StructField(UserAgent, StringType, nullable= true),
        StructField(ResponseContentType, StringType, nullable= true),
        StructField(RespCode, StringType, nullable= true),
        StructField(FullURI, StringType, nullable= true)))

    if (new java.io.File(feedbackFile).exists) {

      val dateIndex = 0
      val timeIndex = 1
      val clientIpIndex = 2
      val hostIndex = 3
      val reqMethodIndex = 4
      val userAgentIndex = 5
      val resContTypeIndex = 6
      val respCodeIndex = 11
      val fullURIIndex = 18

      val fullURISeverityIndex = 22

      val lines = Source.fromFile(feedbackFile).getLines().toArray.drop(1)
      val feedback: RDD[String] = sc.parallelize(lines)

      sqlContext.createDataFrame(feedback.map(_.split("\t"))
        .filter(row => row(fullURISeverityIndex).trim.toInt == 3)
        .map(row => Row.fromSeq(List(row(dateIndex),
          row(timeIndex),
          row(clientIpIndex),
          row(hostIndex),
          row(reqMethodIndex),
          row(userAgentIndex),
          row(resContTypeIndex),
          row(respCodeIndex),
          row(fullURIIndex))))
        .flatMap(row => List.fill(duplicationFactor)(row)), feedbackSchema)
        .select(Date, Time, ClientIP, Host, ReqMethod, UserAgent, ResponseContentType, RespCode, FullURI)
    } else {
      sqlContext.createDataFrame(sc.emptyRDD[Row], feedbackSchema)
    }
  }
}

Source File: StreamingQueryListenerSampleJob.scala From spark-monitoring with MIT License

5 votes

package com.microsoft.pnp.samplejob

import com.microsoft.pnp.logging.Log4jConfiguration
import com.microsoft.pnp.util.TryWith
import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.UserMetricsSystems
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

object StreamingQueryListenerSampleJob extends Logging {

  private final val METRICS_NAMESPACE = "streamingquerylistenersamplejob"
  private final val COUNTER_NAME = "rowcounter"

  def main(args: Array[String]): Unit = {

    // Configure our logging
    TryWith(getClass.getResourceAsStream("/com/microsoft/pnp/samplejob/log4j.properties")) {
      stream => {
        Log4jConfiguration.configure(stream)
      }
    }

    logTrace("Trace message from StreamingQueryListenerSampleJob")
    logDebug("Debug message from StreamingQueryListenerSampleJob")
    logInfo("Info message from StreamingQueryListenerSampleJob")
    logWarning("Warning message from StreamingQueryListenerSampleJob")
    logError("Error message from StreamingQueryListenerSampleJob")

    val spark = SparkSession
      .builder
      .getOrCreate

    import spark.implicits._

    // this path has sample files provided by databricks for trying out purpose
    val inputPath = "/databricks-datasets/structured-streaming/events/"

    val jsonSchema = new StructType().add("time", TimestampType).add("action", StringType)

    val driverMetricsSystem = UserMetricsSystems
        .getMetricSystem(METRICS_NAMESPACE, builder => {
          builder.registerCounter(COUNTER_NAME)
        })

    driverMetricsSystem.counter(COUNTER_NAME).inc

    // Similar to definition of staticInputDF above, just using `readStream` instead of `read`
    val streamingInputDF =
      spark
        .readStream // `readStream` instead of `read` for creating streaming DataFrame
        .schema(jsonSchema) // Set the schema of the JSON data
        .option("maxFilesPerTrigger", 1) // Treat a sequence of files as a stream by picking one file at a time
        .json(inputPath)

    driverMetricsSystem.counter(COUNTER_NAME).inc(5)

    val streamingCountsDF =
      streamingInputDF
        .groupBy($"action", window($"time", "1 hour"))
        .count()

    // Is this DF actually a streaming DF?
    streamingCountsDF.isStreaming

    driverMetricsSystem.counter(COUNTER_NAME).inc(10)

    val query =
      streamingCountsDF
        .writeStream
        .format("memory") // memory = store in-memory table (for testing only in Spark 2.0)
        .queryName("counts") // counts = name of the in-memory table
        .outputMode("complete") // complete = all the counts should be in the table
        .start()
  }
}

Source File: UserData.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples.df
//import org.apache.spark.sql.SQLContext
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType};
package object UserData {
  def main(args: Array[String]): Unit = {
    val customSchema = StructType(Array(
      StructField("no", IntegerType, true),
      StructField("age", StringType, true),
      StructField("gender", StringType, true),
      StructField("occupation", StringType, true),
      StructField("zipCode", StringType, true)));
    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val spark = SparkSession
      .builder()
      .appName("SparkUserData").config(spConfig)
      .getOrCreate()

    val user_df = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load("/home/ubuntu/work/ml-resources/spark-ml/data/ml-100k/u.user")
    val first = user_df.first()
    println("First Record : " + first)

    val num_genders = user_df.groupBy("gender").count().count()
    val num_occupations = user_df.groupBy("occupation").count().count()
    val num_zipcodes = user_df.groupBy("zipCode").count().count()

    println("num_users : " + user_df.count())
    println("num_genders : "+ num_genders)
    println("num_occupations : "+ num_occupations)
    println("num_zipcodes: " + num_zipcodes)
    println("Distribution by Occupation")
    println(user_df.groupBy("occupation").count().show())

  }
}

Source File: Util.scala From Machine-Learning-with-Spark-Second-Edition with MIT License

5 votes

package org.sparksamples

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}


object Util {
  val PATH = "/home/ubuntu/work/spark-2.0.0-bin-hadoop2.7/"
  val DATA_PATH= "../../../data/ml-100k"
  val PATH_MOVIES = DATA_PATH + "/u.item"

  def reduceDimension2(x: Vector) : String= {
    var i = 0
    var l = x.toArray.size
    var l_2 = l/2.toInt
    var x_ = 0.0
    var y_ = 0.0

    for(i <- 0 until l_2) {
      x_ += x(i).toDouble
    }
    for(i <- (l_2 + 1) until l) {
      y_ += x(i).toDouble
    }
    var t = x_ + "," + y_
    return t
  }

  def getMovieDataDF(spark : SparkSession) : DataFrame = {

    //1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
    // |0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0
    val customSchema = StructType(Array(
      StructField("id", StringType, true),
      StructField("name", StringType, true),
      StructField("date", StringType, true),
      StructField("url", StringType, true)));
    val movieDf = spark.read.format("com.databricks.spark.csv")
      .option("delimiter", "|").schema(customSchema)
      .load(PATH_MOVIES)
    return movieDf
  }

}

Source File: PrettifyTest.scala From spark-testing-base with Apache License 2.0

5 votes

package com.holdenkarau.spark.testing

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.scalacheck.Gen
import org.scalacheck.Prop._
import org.scalacheck.util.Pretty
import org.scalatest.FunSuite
import org.scalatest.exceptions.GeneratorDrivenPropertyCheckFailedException
import org.scalatest.prop.Checkers

class PrettifyTest extends FunSuite with SharedSparkContext with Checkers with Prettify {
  implicit val propertyCheckConfig = PropertyCheckConfig(minSize = 2, maxSize = 2)

  test("pretty output of DataFrame's check") {
    val schema = StructType(List(StructField("name", StringType), StructField("age", IntegerType)))
    val sqlContext = new SQLContext(sc)
    val nameGenerator = new Column("name", Gen.const("Holden Hanafy"))
    val ageGenerator = new Column("age", Gen.const(20))

    val dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields(sqlContext, schema)(nameGenerator, ageGenerator)

    val actual = runFailingCheck(dataframeGen.arbitrary)
    val expected =
      Some("arg0 = <DataFrame: schema = [name: string, age: int], size = 2, values = ([Holden Hanafy,20], [Holden Hanafy,20])>")
    assert(actual == expected)
  }

  test("pretty output of RDD's check") {
    val rddGen = RDDGenerator.genRDD[(String, Int)](sc) {
      for {
        name <- Gen.const("Holden Hanafy")
        age <- Gen.const(20)
      } yield name -> age
    }

    val actual = runFailingCheck(rddGen)
    val expected =
      Some("""arg0 = <RDD: size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(actual == expected)
  }

  test("pretty output of Dataset's check") {
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val datasetGen = DatasetGenerator.genDataset[(String, Int)](sqlContext) {
      for {
        name <- Gen.const("Holden Hanafy")
        age <- Gen.const(20)
      } yield name -> age
    }

    val actual = runFailingCheck(datasetGen)
    val expected =
      Some("""arg0 = <Dataset: schema = [_1: string, _2: int], size = 2, values = ((Holden Hanafy,20), (Holden Hanafy,20))>""")
    assert(actual == expected)
  }

  private def runFailingCheck[T](genUnderTest: Gen[T])(implicit p: T => Pretty) = {
    val property = forAll(genUnderTest)(_ => false)
    val e = intercept[GeneratorDrivenPropertyCheckFailedException] {
      check(property)
    }
    takeSecondToLastLine(e.message)
  }

  private def takeSecondToLastLine(msg: Option[String]) =
    msg.flatMap(_.split("\n").toList.reverse.tail.headOption.map(_.trim))

}

Source File: PartitionAndSleepWorkload.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.workload.exercise

import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._
import com.ibm.sparktc.sparkbench.utils.SaveModes
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object PartitionAndSleepWorkload extends WorkloadDefaults {
  val name = "timedsleep"
  val partitions: Int = 48
  val sleepms: Long = 12000L

  def apply(m: Map[String, Any]) = new PartitionAndSleepWorkload(
    input = None,
    output = None,
    partitions = getOrDefault[Int](m, "partitions", partitions),
    sleepMS = getOrDefault[Long](m, "sleepms", sleepms, any2Long))
}

case class PartitionAndSleepWorkload(input: Option[String] = None,
                                     output: Option[String] = None,
                                     saveMode: String = SaveModes.error,
                                     partitions: Int,
                                     sleepMS: Long) extends Workload {

  def doStuff(spark: SparkSession): (Long, Unit) = time {

    val ms = sleepMS
    val stuff: RDD[Int] = spark.sparkContext.parallelize(0 until partitions * 100, partitions)

    val cool: RDD[(Int, Int)] = stuff.map { i =>
      Thread.sleep(ms)
      (i % 10, i + 42)
    }

    val yeah = cool.reduceByKey(_ + _)
    yeah.collect()
  }

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val (t, _) = doStuff(spark)

    val schema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("runtime", LongType, nullable = false)
      )
    )

    val timeList = spark.sparkContext.parallelize(Seq(Row("timedsleep", System.currentTimeMillis(), t)))

    spark.createDataFrame(timeList, schema)
  }
}

Source File: GraphDataGen.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.datageneration

import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{any2Long, getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.apache.spark.graphx.util.GraphGenerators

object GraphDataGen extends WorkloadDefaults {

  val name = "graph-data-generator"
  val defaultMu = 4.0
  val defaultSigma = 1.3
  val defaultSeed = -1L
  val defaultNumOfPartitions = 0

  override def apply(m: Map[String, Any]): GraphDataGen = {
      val numVertices = getOrThrow(m, "vertices").asInstanceOf[Int]
      val mu = getOrDefault[Double](m, "mu", defaultMu)
      val sigma = getOrDefault[Double](m, "sigma", defaultSigma)
      val numPartitions = getOrDefault[Int](m, "partitions", defaultNumOfPartitions)
      val seed = getOrDefault[Long](m, "seed", defaultSeed, any2Long)
      val output = {
        val str = getOrThrow(m, "output").asInstanceOf[String]
        val s = verifySuitabilityOfOutputFileFormat(str)
        Some(s)
      }
    val saveMode = getOrDefault[String](m, "save-mode", SaveModes.error)

    new GraphDataGen(
      numVertices = numVertices,
      input = None,
      output = output,
      saveMode = saveMode,
      mu = mu,
      sigma = sigma,
      seed = seed,
      numPartitions = numPartitions
    )
  }

  
  private[datageneration] def verifySuitabilityOfOutputFileFormat(str: String): String = {
    val strArr: Array[String] = str.split('.')

    (strArr.length, strArr.last) match {
      case (1, _) => throw SparkBenchException("Output file for GraphDataGen must have \".txt\" as the file extension." +
        "Please modify your config file.")
      case (2, "txt") => str
      case (_, _) => throw SparkBenchException("Due to limitations of the GraphX GraphLoader, " +
        "the graph data generators may only save files as \".txt\"." +
        "Please modify your config file.")
    }
  }

}

case class GraphDataGen (
                          numVertices: Int,
                          input: Option[String] = None,
                          output: Option[String],
                          saveMode: String,
                          mu: Double = 4.0,
                          sigma: Double = 1.3,
                          seed: Long = 1,
                          numPartitions: Int = 0
                        ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {
    val timestamp = System.currentTimeMillis()
    val (generateTime, graph) = time(GraphGenerators.logNormalGraph(spark.sparkContext, numVertices, numPartitions, mu, sigma))
    val (convertTime, out) = time(graph.edges.map(e => s"${e.srcId.toString} ${e.dstId}"))
    val (saveTime, _) = time(out.saveAsTextFile(output.get))

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )
    val total = generateTime + convertTime + saveTime
    val timeList = spark.sparkContext.parallelize(Seq(Row(GraphDataGen.name, timestamp, generateTime, convertTime, saveTime, total)))
    spark.createDataFrame(timeList, timeResultSchema)
  }
}

Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0

5 votes

package com.ibm.sparktc.sparkbench.datageneration.mlgenerator

import org.apache.spark.mllib.util.LinearDataGenerator
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException}
import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time}
import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk
import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LinearRegressionDataGen extends WorkloadDefaults {
  val name = "data-generation-lr"
  // Application parameters #1million points have 200M data size
  val numOfExamples: Int = 40000
  val numOfFeatures: Int = 4
  val eps: Double = 0.5
  val intercepts: Double = 0.1
  val numOfPartitions: Int = 10
  val maxIteration: Int = 3
  override def apply(m: Map[String, Any]) = new LinearRegressionDataGen(
    numRows = getOrThrow(m, "rows").asInstanceOf[Int],
    numCols = getOrThrow(m, "cols").asInstanceOf[Int],
    output = Some(getOrThrow(m, "output").asInstanceOf[String]),
    saveMode = getOrDefault[String](m, "save-mode", SaveModes.error),
    eps = getOrDefault[Double](m, "eps", eps),
    intercepts = getOrDefault[Double](m, "intercepts", intercepts),
    numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions)
  )
}

case class LinearRegressionDataGen (
                                      numRows: Int,
                                      numCols: Int,
                                      input: Option[String] = None,
                                      output: Option[String],
                                      saveMode: String,
                                      eps: Double,
                                      intercepts: Double,
                                      numPartitions: Int
                                   ) extends Workload {

  override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = {

    val timestamp = System.currentTimeMillis()

    val (generateTime, data): (Long, RDD[LabeledPoint]) = time {
      LinearDataGenerator.generateLinearRDD(
        spark.sparkContext,
        numRows,
        numCols,
        eps,
        numPartitions,
        intercepts
      )
    }

    import spark.implicits._
    val (convertTime, dataDF) = time {
      data.toDF
    }

    val (saveTime, _) = time {
      val outputstr = output.get
      if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.")
      writeToDisk(output.get, saveMode, dataDF, spark)
    }//TODO you can't output this to CSV. Parquet is fine

    val timeResultSchema = StructType(
      List(
        StructField("name", StringType, nullable = false),
        StructField("timestamp", LongType, nullable = false),
        StructField("generate", LongType, nullable = true),
        StructField("convert", LongType, nullable = true),
        StructField("save", LongType, nullable = true),
        StructField("total_runtime", LongType, nullable = false)
      )
    )

    val total = generateTime + convertTime + saveTime

    val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total)))

    spark.createDataFrame(timeList, timeResultSchema)

  }
}

Source File: NGram.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  @Since("1.5.0")
  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}

Source File: MapDataSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class MapDataSuite extends SparkFunSuite {

  test("inequality tests") {
    def u(str: String): UTF8String = UTF8String.fromString(str)

    // test data
    val testMap1 = Map(u("key1") -> 1)
    val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
    val testMap3 = Map(u("key1") -> 1)
    val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }
}

Source File: ScalaUDFSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.sql.types.{IntegerType, StringType}

class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("basic") {
    val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil)
    checkEvaluation(intUdf, 2)

    val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil)
    checkEvaluation(stringUdf, "ax")
  }

  test("better error message for NPE") {
    val udf = ScalaUDF(
      (s: String) => s.toLowerCase,
      StringType,
      Literal.create(null, StringType) :: Nil)

    val e1 = intercept[SparkException](udf.eval())
    assert(e1.getMessage.contains("Failed to execute user defined function"))

    val e2 = intercept[SparkException] {
      checkEvalutionWithUnsafeProjection(udf, null)
    }
    assert(e2.getMessage.contains("Failed to execute user defined function"))
  }

}

Source File: RewriteDistinctAggregatesSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{If, Literal}
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectSet, Count}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
import org.apache.spark.sql.types.{IntegerType, StringType}

class RewriteDistinctAggregatesSuite extends PlanTest {
  val conf = SimpleCatalystConf(caseSensitiveAnalysis = false, groupByOrdinal = false)
  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)

  val nullInt = Literal(null, IntegerType)
  val nullString = Literal(null, StringType)
  val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int)

  private def checkRewrite(rewrite: LogicalPlan): Unit = rewrite match {
    case Aggregate(_, _, Aggregate(_, _, _: Expand)) =>
    case _ => fail(s"Plan is not rewritten:\n$rewrite")
  }

  test("single distinct group") {
    val input = testRelation
      .groupBy('a)(countDistinct('e))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        max('b).as('agg2))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with non-partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        CollectSet('b).toAggregateExpression().as('agg2))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with partial aggregates") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d), sum('e))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with non-partial aggregates") {
    val input = testRelation
      .groupBy('a)(
        countDistinct('b, 'c),
        countDistinct('d),
        CollectSet('b).toAggregateExpression())
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }
}

Source File: resources.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: WholeStageCodegenSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions.{avg, broadcast, col, max}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {

  test("range/filter should be combined") {
    val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)
    assert(df.collect() === Array(Row(2)))
  }

  test("Aggregate should be included in WholeStageCodegen") {
    val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(9, 4.5)))
  }

  test("Aggregate with grouping keys should be included in WholeStageCodegen") {
    val df = spark.range(3).groupBy("id").count().orderBy("id")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1)))
  }

  test("BroadcastHashJoin should be included in WholeStageCodegen") {
    val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2")))
    val schema = new StructType().add("k", IntegerType).add("v", StringType)
    val smallDF = spark.createDataFrame(rdd, schema)
    val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id"))
    assert(df.queryExecution.executedPlan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined)
    assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2")))
  }

  test("Sort should be included in WholeStageCodegen") {
    val df = spark.range(3, 0, -1).toDF().sort(col("id"))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined)
    assert(df.collect() === Array(Row(1), Row(2), Row(3)))
  }

  test("MapElements should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = spark.range(10).map(_.toString)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined)
    assert(ds.collect() === 0.until(10).map(_.toString).toArray)
  }

  test("typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 2, 4, 6, 8))
  }

  test("back-to-back typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 6))
  }

  test("simple typed UDAF should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS()
      .groupByKey(_._1).agg(typed.sum(_._2))

    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0)))
  }
}

Source File: GroupedIteratorSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
}

Source File: DDLSourceLoadSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: ForecastPipelineStage.scala From uberdata with Apache License 2.0

5 votes

package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.shared.{HasNFutures, HasPredictionCol, HasValidationCol}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.types.{StructType, StringType, StructField, MapType}


trait ForecastPipelineStage
    extends PipelineStage
    with HasNFutures
    with HasPredictionCol
    with HasValidationCol {

  def setValidationCol(value: String): this.type = set(validationCol, value)

  override def transformSchema(schema: StructType): StructType = {
    schema
      .add(StructField($(validationCol), new VectorUDT))
      .add(StructField(IUberdataForecastUtil.ALGORITHM, StringType))
      .add(StructField(IUberdataForecastUtil.PARAMS, MapType(StringType, StringType)))
  }
}

Source File: CarbonCatalystOperators.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql

import scala.collection.mutable

import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Count}
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.types.StringType

abstract class CarbonProfile(attributes: Seq[Attribute]) extends Serializable {
  def isEmpty: Boolean = attributes.isEmpty
}

case class IncludeProfile(attributes: Seq[Attribute]) extends CarbonProfile(attributes)

case class ExcludeProfile(attributes: Seq[Attribute]) extends CarbonProfile(attributes)

case class ProjectForUpdate(
    table: UnresolvedRelation,
    columns: List[String],
    children: Seq[LogicalPlan]) extends LogicalPlan {
  override def output: Seq[Attribute] = Seq.empty
}

case class UpdateTable(
    table: UnresolvedRelation,
    columns: List[String],
    selectStmt: String,
    alias: Option[String] = None,
    filer: String) extends LogicalPlan {
  override def children: Seq[LogicalPlan] = Seq.empty
  override def output: Seq[Attribute] = Seq.empty
}

case class DeleteRecords(
    statement: String,
    alias: Option[String] = None,
    table: UnresolvedRelation) extends LogicalPlan {
  override def children: Seq[LogicalPlan] = Seq.empty
  override def output: Seq[AttributeReference] = Seq.empty
}


  def strictCountStar(groupingExpressions: Seq[Expression],
      partialComputation: Seq[NamedExpression],
      child: LogicalPlan): Boolean = {
    if (groupingExpressions.nonEmpty) {
      return false
    }
    if (partialComputation.isEmpty) {
      return false
    }
    if (partialComputation.size > 1 && partialComputation.nonEmpty) {
      return false
    }
    child collect {
      case cd: Filter => return false
    }
    true
  }
}

Source File: CarbonShowStreamsCommand.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command.stream

import java.util.Date
import java.util.concurrent.TimeUnit

import org.apache.spark.sql.{CarbonEnv, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.MetadataCommand
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.stream.StreamJobManager


case class CarbonShowStreamsCommand(
    tableOp: Option[TableIdentifier]
) extends MetadataCommand {
  override def output: Seq[Attribute] = {
    Seq(AttributeReference("Stream Name", StringType, nullable = false)(),
      AttributeReference("JobId", StringType, nullable = false)(),
      AttributeReference("Status", StringType, nullable = false)(),
      AttributeReference("Source", StringType, nullable = false)(),
      AttributeReference("Sink", StringType, nullable = false)(),
      AttributeReference("Start Time", StringType, nullable = false)(),
      AttributeReference("Time Elapse", StringType, nullable = false)())
  }

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    val jobs = tableOp match {
      case None => StreamJobManager.getAllJobs.toSeq
      case Some(table) =>
        val carbonTable = CarbonEnv.getCarbonTable(table.database, table.table)(sparkSession)
        setAuditTable(carbonTable)
        StreamJobManager.getAllJobs.filter { job =>
          job.sinkTable.equalsIgnoreCase(carbonTable.getTableName) &&
          job.sinkDb.equalsIgnoreCase(carbonTable.getDatabaseName)
        }.toSeq
    }

    jobs.map { job =>
      val elapsedTime = System.currentTimeMillis() - job.startTime
      Row(
        job.streamName,
        job.streamingQuery.id.toString,
        if (job.streamingQuery.isActive) "RUNNING" else "FAILED",
        s"${ job.sourceDb }.${ job.sourceTable }",
        s"${ job.sinkDb }.${ job.sinkTable }",
        new Date(job.startTime).toString,
        String.format(
          "%s days, %s hours, %s min, %s sec",
          TimeUnit.MILLISECONDS.toDays(elapsedTime).toString,
          TimeUnit.MILLISECONDS.toHours(elapsedTime).toString,
          TimeUnit.MILLISECONDS.toMinutes(elapsedTime).toString,
          TimeUnit.MILLISECONDS.toSeconds(elapsedTime).toString)
      )
    }
  }

  override protected def opName: String = "SHOW STREAMS"
}

Source File: CarbonShowMVCommand.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command.view

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.sql.{CarbonEnv, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.{Checker, DataCommand}
import org.apache.spark.sql.types.{BooleanType, StringType}

import org.apache.carbondata.core.view.{MVProperty, MVSchema}
import org.apache.carbondata.view.MVManagerInSpark


case class CarbonShowMVCommand(
    databaseNameOption: Option[String],
    relatedTableIdentifier: Option[TableIdentifier]) extends DataCommand {

  override def output: Seq[Attribute] = {
    Seq(
      AttributeReference("Database", StringType, nullable = false)(),
      AttributeReference("Name", StringType, nullable = false)(),
      AttributeReference("Status", StringType, nullable = false)(),
      AttributeReference("Refresh Mode", StringType, nullable = false)(),
      AttributeReference("Refresh Trigger Mode", StringType, nullable = false)(),
      AttributeReference("Properties", StringType, nullable = false)())
  }

  override def processData(session: SparkSession): Seq[Row] = {
    // Get mv schemas.
    val schemaList = new util.ArrayList[MVSchema]()
    val viewManager = MVManagerInSpark.get(session)
    relatedTableIdentifier match {
      case Some(table) =>
        val relatedTable = CarbonEnv.getCarbonTable(table)(session)
        setAuditTable(relatedTable)
        Checker.validateTableExists(table.database, table.table, session)
        if (databaseNameOption.isDefined) {
          schemaList.addAll(viewManager.getSchemasOnTable(
            databaseNameOption.get,
            relatedTable))
        } else {
          schemaList.addAll(viewManager.getSchemasOnTable(relatedTable))
        }
      case _ =>
        if (databaseNameOption.isDefined) {
          schemaList.addAll(viewManager.getSchemas(databaseNameOption.get))
        } else {
          schemaList.addAll(viewManager.getSchemas())
        }
    }
    // Convert mv schema to row.
    schemaList.asScala.map {
      schema =>
        Row(
          schema.getIdentifier.getDatabaseName,
          schema.getIdentifier.getTableName,
          schema.getStatus.name(),
          schema.getProperties.get(MVProperty.REFRESH_MODE),
          schema.getProperties.get(MVProperty.REFRESH_TRIGGER_MODE),
          schema.getPropertiesAsString
        )
    }
  }

  override protected def opName: String = "SHOW MATERIALIZED VIEW"
}

Source File: CarbonCliCommand.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command.management

import java.util

import scala.collection.JavaConverters._

import org.apache.spark.sql.{CarbonEnv, Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.{Checker, DataCommand}
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.tool.CarbonCli


case class CarbonCliCommand(
    databaseNameOp: Option[String],
    tableName: String,
    commandOptions: String)
  extends DataCommand {

  override def output: Seq[Attribute] = {
      Seq(AttributeReference("CarbonCli", StringType, nullable = false)())
  }

  override def processData(sparkSession: SparkSession): Seq[Row] = {
    Checker.validateTableExists(databaseNameOp, tableName, sparkSession)
    val carbonTable = CarbonEnv.getCarbonTable(databaseNameOp, tableName)(sparkSession)
    setAuditTable(carbonTable)
    setAuditInfo(Map("options" -> commandOptions))
    val commandArgs: Seq[String] = commandOptions.split("\\s+").map(_.trim)
    val finalCommands = commandArgs.exists(_.equalsIgnoreCase("-p")) match {
      case true =>
        commandArgs
      case false =>
        val needPath = commandArgs.exists { command =>
          command.equalsIgnoreCase("summary") || command.equalsIgnoreCase("benchmark")
        }
        needPath match {
          case true =>
            commandArgs ++ Seq("-p", carbonTable.getTablePath)
          case false =>
            commandArgs
        }
    }
    val summaryOutput = new util.ArrayList[String]()
    CarbonCli.run(finalCommands.toArray, summaryOutput, false)
    summaryOutput.asScala.map(x =>
      Row(x)
    )
  }

  override protected def opName: String = "CLI"
}

Source File: CarbonShowTablesCommand.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command.table

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.execution.command.MetadataCommand
import org.apache.spark.sql.types.{BooleanType, StringType}


private[sql] case class CarbonShowTablesCommand ( databaseName: Option[String],
    tableIdentifierPattern: Option[String])  extends MetadataCommand{

  // The result of SHOW TABLES has three columns: database, tableName and isTemporary.
  override val output: Seq[Attribute] = {
    AttributeReference("database", StringType, nullable = false)() ::
    AttributeReference("tableName", StringType, nullable = false)() ::
    AttributeReference("isTemporary", BooleanType, nullable = false)() :: Nil
  }

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    // Since we need to return a Seq of rows, we will call getTables directly
    // instead of calling tables in sparkSession.
    val catalog = sparkSession.sessionState.catalog
    val db = databaseName.getOrElse(catalog.getCurrentDatabase)
    val tables =
      tableIdentifierPattern.map(catalog.listTables(db, _)).getOrElse(catalog.listTables(db))
    val externalCatalog = sparkSession.sharedState.externalCatalog
    // this method checks whether the table is mainTable or MV based on property "isVisible"
    def isMainTable(tableIdent: TableIdentifier) = {
      var isMainTable = true
      try {
        isMainTable = externalCatalog.getTable(db, tableIdent.table).storage.properties
          .getOrElse("isVisible", true).toString.toBoolean
      } catch {
        case ex: Throwable =>
        // ignore the exception for show tables
      }
      isMainTable
    }
    // tables will be filtered for all the MVs to show only main tables
    tables.collect {
      case tableIdent if isMainTable(tableIdent) =>
        val isTemp = catalog.isTemporaryTable(tableIdent)
        Row(tableIdent.database.getOrElse("default"), tableIdent.table, isTemp)
    }

  }

  override protected def opName: String = "SHOW TABLES"
}

Source File: CarbonExplainCommand.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command.table

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan, Union}
import org.apache.spark.sql.execution.command.{ExplainCommand, MetadataCommand}
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.core.profiler.ExplainCollector

case class CarbonExplainCommand(
    child: LogicalPlan,
    override val output: Seq[Attribute] =
    Seq(AttributeReference("plan", StringType, nullable = true)()))
  extends MetadataCommand {

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    val explainCommand = child.asInstanceOf[ExplainCommand]
    setAuditInfo(Map("query" -> explainCommand.logicalPlan.simpleString))
    val isCommand = explainCommand.logicalPlan match {
      case _: Command => true
      case Union(childern) if childern.forall(_.isInstanceOf[Command]) => true
      case _ => false
    }

    if (explainCommand.logicalPlan.isStreaming || isCommand) {
      explainCommand.run(sparkSession)
    } else {
      CarbonExplainCommand.collectProfiler(explainCommand, sparkSession) ++
      explainCommand.run(sparkSession)
    }
  }

  override protected def opName: String = "EXPLAIN"
}

case class CarbonInternalExplainCommand(
    explainCommand: ExplainCommand,
    override val output: Seq[Attribute] =
    Seq(AttributeReference("plan", StringType, nullable = true)()))
  extends MetadataCommand {

  override def processMetadata(sparkSession: SparkSession): Seq[Row] = {
    CarbonExplainCommand
      .collectProfiler(explainCommand, sparkSession) ++ explainCommand.run(sparkSession)
  }

  override protected def opName: String = "Carbon EXPLAIN"
}

object CarbonExplainCommand {
  def collectProfiler(
      explain: ExplainCommand,
      sparkSession: SparkSession): Seq[Row] = {
    try {
      ExplainCollector.setup()
      if (ExplainCollector.enabled()) {
        val queryExecution =
          sparkSession.sessionState.executePlan(explain.logicalPlan)
        queryExecution.toRdd.partitions
        // For count(*) queries the explain collector will be disabled, so profiler
        // informations not required in such scenarios.
        if (null == ExplainCollector.getFormatedOutput) {
          Seq.empty
        }
        Seq(Row("== CarbonData Profiler ==\n" + ExplainCollector.getFormatedOutput))
      } else {
        Seq.empty
      }
    } finally {
      ExplainCollector.remove()
    }
  }
}

Source File: CarbonUDFTransformRule.scala From carbondata with Apache License 2.0

5 votes

package org.apache.spark.sql.optimizer

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, PredicateHelper,
ScalaUDF}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, Join, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.types.StringType

import org.apache.carbondata.core.constants.CarbonCommonConstants

class CarbonUDFTransformRule extends Rule[LogicalPlan] with PredicateHelper {
  override def apply(plan: LogicalPlan): LogicalPlan = {
      pushDownUDFToJoinLeftRelation(plan)
  }

  private def pushDownUDFToJoinLeftRelation(plan: LogicalPlan): LogicalPlan = {
    val output = plan.transform {
      case proj@Project(cols, Join(
      left, right, jointype: org.apache.spark.sql.catalyst.plans.JoinType, condition)) =>
        var projectionToBeAdded: Seq[org.apache.spark.sql.catalyst.expressions.Alias] = Seq.empty
        var udfExists = false
        val newCols = cols.map {
          case a@Alias(s: ScalaUDF, name)
            if name.equalsIgnoreCase(CarbonCommonConstants.POSITION_ID) ||
               name.equalsIgnoreCase(CarbonCommonConstants.CARBON_IMPLICIT_COLUMN_TUPLEID) =>
            udfExists = true
            projectionToBeAdded :+= a
            AttributeReference(name, StringType, nullable = true)().withExprId(a.exprId)
          case other => other
        }
        if (udfExists) {
          val newLeft = left match {
            case Project(columns, logicalPlan) =>
              Project(columns ++ projectionToBeAdded, logicalPlan)
            case filter: Filter =>
              Project(filter.output ++ projectionToBeAdded, filter)
            case relation: LogicalRelation =>
              Project(relation.output ++ projectionToBeAdded, relation)
            case other => other
          }
          Project(newCols, Join(newLeft, right, jointype, condition))
        } else {
          proj
        }
      case other => other
    }
    output
  }

}

Source File: CarbonDataFrameExample.scala From carbondata with Apache License 2.0

5 votes

package org.apache.carbondata.examples

import org.apache.spark.sql.{SaveMode, SparkSession}

import org.apache.carbondata.examples.util.ExampleUtils

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createSparkSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1")  // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
}

Source File: NestedTableExample.scala From spark_training with Apache License 2.0

5 votes

package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}
import org.apache.spark.sql.{Row, SparkSession}

object NestedTableExample {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)
    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
}

Source File: PopulateHiveTable.scala From spark_training with Apache License 2.0

5 votes

package com.malaska.spark.training.nested

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructType}


object PopulateHiveTable {
  Logger.getLogger("org").setLevel(Level.OFF)
  Logger.getLogger("akka").setLevel(Level.OFF)

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder
      .master("local")
      .appName("my-spark-app")
      .config("spark.some.config.option", "config-value")
      .config("spark.driver.host","127.0.0.1")
      .config("spark.sql.parquet.compression.codec", "gzip")
      .enableHiveSupport()
      .getOrCreate()


    spark.sql("create table IF NOT EXISTS nested_empty " +
      "( A int, " +
      "  B string, " +
      "  nested ARRAY<STRUCT< " +
      "     nested_C: int," +
      "     nested_D: string" +
      "  >>" +
      ") ")

    val rowRDD = spark.sparkContext.
      parallelize(Array(
        Row(1, "foo", Seq(Row(1, "barA"),Row(2, "bar"))),
        Row(2, "foo", Seq(Row(1, "barB"),Row(2, "bar"))),
        Row(3, "foo", Seq(Row(1, "barC"),Row(2, "bar")))))

    val emptyDf = spark.sql("select * from nested_empty limit 0")

    val tableSchema = emptyDf.schema

    val populated1Df = spark.sqlContext.createDataFrame(rowRDD, tableSchema)

    populated1Df.repartition(2).write.saveAsTable("nested_populated")

    println("----")
    populated1Df.collect().foreach(r => println(" emptySchemaExample:" + r))

    val nestedSchema = new StructType()
      .add("nested_C", IntegerType)
      .add("nested_D", StringType)

    val definedSchema = new StructType()
      .add("A", IntegerType)
      .add("B", StringType)
      .add("nested", ArrayType(nestedSchema))

    val populated2Df = spark.sqlContext.createDataFrame(rowRDD, definedSchema)

    println("----")
    populated1Df.collect().foreach(r => println(" BuiltExample:" + r))

    spark.stop()
  }
}

Source File: OutputMetricsTest.scala From memsql-spark-connector with Apache License 2.0

5 votes

package com.memsql.spark

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.types.{IntegerType, StringType}

class OutputMetricsTest extends IntegrationSuiteBase {
  it("records written") {
    var outputWritten = 0L
    spark.sparkContext.addSparkListener(new SparkListener() {
      override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
        val metrics = taskEnd.taskMetrics
        outputWritten += metrics.outputMetrics.recordsWritten
      }
    })

    val numRows = 100000
    val df1 = spark.createDF(
      List.range(0, numRows),
      List(("id", IntegerType, true))
    )

    df1.repartition(30)

    df1.write
      .format("memsql")
      .save("metricsInts")

    assert(outputWritten == numRows)
    outputWritten = 0

    val df2 = spark.createDF(
      List("st1", "", null),
      List(("st", StringType, true))
    )

    df2.write
      .format("memsql")
      .save("metricsStrings")

    assert(outputWritten == 3)
  }
}

Source File: SparkRecoverPartitionsCustomTest.scala From m3d-engine with Apache License 2.0

5 votes

package com.adidas.analytics.unit

import com.adidas.analytics.util.SparkRecoverPartitionsCustom
import com.adidas.utils.SparkSessionWrapper
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester}

import scala.collection.JavaConverters._

class SparkRecoverPartitionsCustomTest extends FunSuite
  with SparkSessionWrapper
  with PrivateMethodTester
  with Matchers
  with BeforeAndAfterAll{

  test("test conversion of String Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue")

    result should be("'theValue'")
  }

  test("test conversion of Short Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2"))

    result should be("2")
  }

  test("test conversion of Integer Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4"))

    result should be("4")
  }

  test("test conversion of null Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(null)
    }
  }

  test("test conversion of not supported Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(false)
    }
  }

  test("test HiveQL statements Generation") {
    val customSparkRecoverPartitions = SparkRecoverPartitionsCustom(
      tableName="test",
      targetPartitions = Seq("country","district")
    )

    val rowsInput = Seq(
      Row(1, "portugal", "porto"),
      Row(2, "germany", "herzogenaurach"),
      Row(3, "portugal", "coimbra")
    )

    val inputSchema = StructType(
      List(
        StructField("number", IntegerType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("district", StringType, nullable = true)
      )
    )

    val expectedStatements: Seq[String] = Seq(
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')"
    )

    val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema))

    val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements)

    val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset))
      .collectAsList()
      .asScala

    expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet)
  }

  override def afterAll(): Unit = {
    spark.stop()
  }

}

Source File: RecoverPartitionsCustomTest.scala From m3d-engine with Apache License 2.0

5 votes

package com.adidas.analytics.unit

import com.adidas.analytics.util.RecoverPartitionsCustom
import com.adidas.utils.SparkSessionWrapper
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.{Dataset, Row}
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers, PrivateMethodTester}

import scala.collection.JavaConverters._

class RecoverPartitionsCustomTest extends FunSuite
  with SparkSessionWrapper
  with PrivateMethodTester
  with Matchers
  with BeforeAndAfterAll{

  test("test conversion of String Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue("theValue")

    result should be("'theValue'")
  }

  test("test conversion of Short Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Short.valueOf("2"))

    result should be("2")
  }

  test("test conversion of Integer Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    val result = customSparkRecoverPartitions invokePrivate createParameterValue(java.lang.Integer.valueOf("4"))

    result should be("4")
  }

  test("test conversion of null Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(null)
    }
  }

  test("test conversion of not supported Value to HiveQL Partition Parameter") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(tableName="", targetPartitions = Seq())
    val createParameterValue = PrivateMethod[String]('createParameterValue)
    an [Exception] should be thrownBy {
      customSparkRecoverPartitions invokePrivate createParameterValue(false)
    }
  }

  test("test HiveQL statements Generation") {
    val customSparkRecoverPartitions = RecoverPartitionsCustom(
      tableName="test",
      targetPartitions = Seq("country","district")
    )

    val rowsInput = Seq(
      Row(1, "portugal", "porto"),
      Row(2, "germany", "herzogenaurach"),
      Row(3, "portugal", "coimbra")
    )

    val inputSchema = StructType(
      List(
        StructField("number", IntegerType, nullable = true),
        StructField("country", StringType, nullable = true),
        StructField("district", StringType, nullable = true)
      )
    )

    val expectedStatements: Seq[String] = Seq(
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='porto')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='germany',district='herzogenaurach')",
      "ALTER TABLE test ADD IF NOT EXISTS PARTITION(country='portugal',district='coimbra')"
    )

    val testDataset: Dataset[Row] = spark.createDataset(rowsInput)(RowEncoder(inputSchema))

    val createParameterValue = PrivateMethod[Dataset[String]]('generateAddPartitionStatements)

    val producedStatements: Seq[String] = (customSparkRecoverPartitions invokePrivate createParameterValue(testDataset))
      .collectAsList()
      .asScala

    expectedStatements.sorted.toSet should equal(producedStatements.sorted.toSet)
  }

  override def afterAll(): Unit = {
    spark.stop()
  }

}

Source File: TemporalDataSuite.scala From datasource-receiver with Apache License 2.0

5 votes

package org.apache.spark.streaming.datasource

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.datasource.config.ConfigParameters._
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.BeforeAndAfter

private[datasource] trait TemporalDataSuite extends DatasourceSuite
  with BeforeAndAfter {

  val conf = new SparkConf()
    .setAppName("datasource-receiver-example")
    .setIfMissing("spark.master", "local[*]")
  var sc: SparkContext = null
  var ssc: StreamingContext = null
  val tableName = "tableName"
  val datasourceParams = Map(
    StopGracefully -> "true",
    StopSparkContext -> "false",
    StorageLevelKey -> "MEMORY_ONLY",
    RememberDuration -> "15s"
  )
  val schema = new StructType(Array(
    StructField("id", StringType, nullable = true),
    StructField("idInt", IntegerType, nullable = true)
  ))
  val totalRegisters = 10000
  val registers = for (a <- 1 to totalRegisters) yield Row(a.toString, a)

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
    if (sc != null) {
      sc.stop()
      sc = null
    }
  }
}

Source File: SparkEsBulkWriterSpec.scala From Spark2Elasticsearch with Apache License 2.0

5 votes

package com.github.jparkie.spark.elasticsearch

import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.sql.{ SparkEsDataFrameMapper, SparkEsDataFrameSerializer }
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }
import org.scalatest.{ MustMatchers, WordSpec }

class SparkEsBulkWriterSpec extends WordSpec with MustMatchers with SharedSparkContext {
  val esServer = new ElasticSearchServer()

  override def beforeAll(): Unit = {
    super.beforeAll()

    esServer.start()
  }

  override def afterAll(): Unit = {
    esServer.stop()

    super.afterAll()
  }

  "SparkEsBulkWriter" must {
    "execute write() successfully" in {
      esServer.createAndWaitForIndex("test_index")

      val sqlContext = new SQLContext(sc)

      val inputSparkEsWriteConf = SparkEsWriteConf(
        bulkActions = 10,
        bulkSizeInMB = 1,
        concurrentRequests = 0,
        flushTimeoutInSeconds = 1
      )
      val inputMapperConf = SparkEsMapperConf(
        esMappingId = Some("id"),
        esMappingParent = None,
        esMappingVersion = None,
        esMappingVersionType = None,
        esMappingRouting = None,
        esMappingTTLInMillis = None,
        esMappingTimestamp = None
      )
      val inputSchema = StructType(
        Array(
          StructField("id", StringType, true),
          StructField("parent", StringType, true),
          StructField("version", LongType, true),
          StructField("routing", StringType, true),
          StructField("ttl", LongType, true),
          StructField("timestamp", StringType, true),
          StructField("value", LongType, true)
        )
      )
      val inputData = sc.parallelize {
        Array(
          Row("TEST_ID_1", "TEST_PARENT_1", 1L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 1L),
          Row("TEST_ID_1", "TEST_PARENT_2", 2L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 2L),
          Row("TEST_ID_1", "TEST_PARENT_3", 3L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 3L),
          Row("TEST_ID_1", "TEST_PARENT_4", 4L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 4L),
          Row("TEST_ID_1", "TEST_PARENT_5", 5L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 5L),
          Row("TEST_ID_5", "TEST_PARENT_6", 6L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 6L),
          Row("TEST_ID_6", "TEST_PARENT_7", 7L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 7L),
          Row("TEST_ID_7", "TEST_PARENT_8", 8L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 8L),
          Row("TEST_ID_8", "TEST_PARENT_9", 9L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 9L),
          Row("TEST_ID_9", "TEST_PARENT_10", 10L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 10L),
          Row("TEST_ID_10", "TEST_PARENT_11", 11L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 11L)
        )
      }
      val inputDataFrame = sqlContext.createDataFrame(inputData, inputSchema)
      val inputDataIterator = inputDataFrame.rdd.toLocalIterator
      val inputSparkEsBulkWriter = new SparkEsBulkWriter[Row](
        esIndex = "test_index",
        esType = "test_type",
        esClient = () => esServer.client,
        sparkEsSerializer = new SparkEsDataFrameSerializer(inputSchema),
        sparkEsMapper = new SparkEsDataFrameMapper(inputMapperConf),
        sparkEsWriteConf = inputSparkEsWriteConf
      )

      inputSparkEsBulkWriter.write(null, inputDataIterator)

      val outputGetResponse = esServer.client.prepareGet("test_index", "test_type", "TEST_ID_1").get()

      outputGetResponse.isExists mustEqual true
      outputGetResponse.getSource.get("parent").asInstanceOf[String] mustEqual "TEST_PARENT_5"
      outputGetResponse.getSource.get("version").asInstanceOf[Integer] mustEqual 5
      outputGetResponse.getSource.get("routing").asInstanceOf[String] mustEqual "TEST_ROUTING_1"
      outputGetResponse.getSource.get("ttl").asInstanceOf[Integer] mustEqual 86400000
      outputGetResponse.getSource.get("timestamp").asInstanceOf[String] mustEqual "TEST_TIMESTAMP_1"
      outputGetResponse.getSource.get("value").asInstanceOf[Integer] mustEqual 5
    }
  }
}

Source File: NullValuesTest.scala From spark-dynamodb with Apache License 2.0

5 votes

package com.audienceproject.spark.dynamodb

import com.amazonaws.services.dynamodbv2.model.{AttributeDefinition, CreateTableRequest, KeySchemaElement, ProvisionedThroughput}
import com.audienceproject.spark.dynamodb.implicits._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

class NullValuesTest extends AbstractInMemoryTest {

    test("Insert nested StructType with null values") {
        dynamoDB.createTable(new CreateTableRequest()
            .withTableName("NullTest")
            .withAttributeDefinitions(new AttributeDefinition("name", "S"))
            .withKeySchema(new KeySchemaElement("name", "HASH"))
            .withProvisionedThroughput(new ProvisionedThroughput(5L, 5L)))

        val schema = StructType(
            Seq(
                StructField("name", StringType, nullable = false),
                StructField("info", StructType(
                    Seq(
                        StructField("age", IntegerType, nullable = true),
                        StructField("address", StringType, nullable = true)
                    )
                ), nullable = true)
            )
        )

        val rows = spark.sparkContext.parallelize(Seq(
            Row("one", Row(30, "Somewhere")),
            Row("two", null),
            Row("three", Row(null, null))
        ))

        val newItemsDs = spark.createDataFrame(rows, schema)

        newItemsDs.write.dynamodb("NullTest")

        val validationDs = spark.read.dynamodb("NullTest")

        validationDs.show(false)
    }

}

Source File: BigQuerySource.scala From spark-bigquery with Apache License 2.0

5 votes

package com.samelamin.spark.bigquery.streaming

import java.math.BigInteger
import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.BigQueryClient
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.execution.streaming.{Offset, _}
import org.apache.spark.sql.types.{BinaryType, StringType, StructField, StructType}
import com.samelamin.spark.bigquery._
import com.samelamin.spark.bigquery.converters.SchemaConverters
import org.joda.time.DateTime
import org.slf4j.LoggerFactory


  override def getBatch(start: Option[Offset], end: Offset): DataFrame = {
    val startIndex = start.getOrElse(LongOffset(0L)).asInstanceOf[LongOffset].offset.toLong
    val endIndex = end.asInstanceOf[LongOffset].offset.toLong
    val startPartitionTime = new DateTime(startIndex).toLocalDate
    val endPartitionTime = new DateTime(endIndex).toLocalDate.toString
    logger.info(s"Fetching data between $startIndex and $endIndex")
    val query =
      s"""
         |SELECT
         |  *
         |FROM
         |  `${fullyQualifiedOutputTableId.replace(':','.')}`
         |WHERE
         |  $timestampColumn BETWEEN TIMESTAMP_MILLIS($startIndex) AND TIMESTAMP_MILLIS($endIndex)
         |  AND _PARTITIONTIME BETWEEN TIMESTAMP('$startPartitionTime') AND TIMESTAMP('$endPartitionTime')
         |  """.stripMargin
    val bigQuerySQLContext = new BigQuerySQLContext(sqlContext)
    val df = bigQuerySQLContext.bigQuerySelect(query)
    df
  }

  override def stop(): Unit = {}
  def getConvertedSchema(sqlContext: SQLContext): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(fullyQualifiedOutputTableId)
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }
}

object BigQuerySource {
  val DEFAULT_SCHEMA = StructType(
    StructField("Sample Column", StringType) ::
      StructField("value", BinaryType) :: Nil
  )
}

Source File: Cleaner.scala From CkoocNLP with Apache License 2.0

5 votes

package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}



  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
}

Source File: ExtAggregatesSpec.scala From spark-ext with Apache License 2.0

5 votes

package org.apache.spark.sql

import com.collective.TestSparkContext
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.scalatest.FlatSpec
import org.apache.spark.sql.functions._
import org.apache.spark.sql.ext.functions._

import scala.collection.mutable

class ExtAggregatesSpec extends FlatSpec with TestSparkContext {

  val schema = StructType(Seq(
    StructField("cookie_id", StringType),
    StructField("site", StringType),
    StructField("impressions", LongType)
  ))

  val cookie1 = "cookie1"
  val cookie2 = "cookie2"
  val cookie3 = "cookie3"

  val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq(
    Row(cookie1, "google.com", 10L),
    Row(cookie1, "cnn.com", 14L),
    Row(cookie1, "google.com", 2L),
    Row(cookie2, "bbc.com", 20L),
    Row(cookie2, "auto.com", null),
    Row(cookie2, "auto.com", 1L),
    Row(cookie3, "sport.com", 100L)
  )), schema)

  "Ext Aggregates" should "collect column values as array" in {
    val cookies = impressionLog
      .select(collectArray(col("cookie_id")))
      .first().getAs[mutable.WrappedArray[String]](0)
    assert(cookies.length == 7)
    assert(cookies.toSet.size == 3)
  }

  it should "collect distinct values as array" in {
    val distinctCookies = impressionLog.select(col("cookie_id"))
      .distinct()
      .select(collectArray(col("cookie_id")))
      .first().getAs[mutable.WrappedArray[String]](0)
    assert(distinctCookies.length == 3)
  }

  it should "collect values after group by" in {
    val result = impressionLog
      .groupBy(col("cookie_id"))
      .agg(collectArray(col("site")))

    val cookieSites = result.collect().map { case Row(cookie: String, sites: mutable.WrappedArray[_]) =>
      cookie -> sites.toSeq
    }.toMap

    assert(cookieSites(cookie1).length == 3)
    assert(cookieSites(cookie2).length == 3)
    assert(cookieSites(cookie3).length == 1)

  }

}

Source File: VacuumTableCommand.scala From delta with Apache License 2.0

5 votes

package io.delta.tables.execution

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier, DeltaTableUtils}
import org.apache.spark.sql.delta.commands.VacuumCommand
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.types.StringType


case class VacuumTableCommand(
    path: Option[String],
    table: Option[TableIdentifier],
    horizonHours: Option[Double],
    dryRun: Boolean) extends RunnableCommand {

  override val output: Seq[Attribute] =
    Seq(AttributeReference("path", StringType, nullable = true)())

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val pathToVacuum =
      if (path.nonEmpty) {
        new Path(path.get)
      } else if (table.nonEmpty) {
        DeltaTableIdentifier(sparkSession, table.get) match {
          case Some(id) if id.path.nonEmpty =>
            new Path(id.path.get)
          case _ =>
            new Path(sparkSession.sessionState.catalog.getTableMetadata(table.get).location)
        }
      } else {
        throw DeltaErrors.missingTableIdentifierException("VACUUM")
      }
    val baseDeltaPath = DeltaTableUtils.findDeltaTableRoot(sparkSession, pathToVacuum)
    if (baseDeltaPath.isDefined) {
      if (baseDeltaPath.get != pathToVacuum) {
        throw DeltaErrors.vacuumBasePathMissingException(baseDeltaPath.get)
      }
    }
    val deltaLog = DeltaLog.forTable(sparkSession, pathToVacuum)
    if (deltaLog.snapshot.version == -1) {
      throw DeltaErrors.notADeltaTableException(
        "VACUUM",
        DeltaTableIdentifier(path = Some(pathToVacuum.toString)))
    }
    VacuumCommand.gc(sparkSession, deltaLog, dryRun, horizonHours).collect()
  }
}

Source File: Lambda.scala From mmlspark with MIT License

5 votes

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.SparkContext
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Lambda extends ComplexParamsReadable[Lambda] {
  def apply(f: Dataset[_] => DataFrame): Lambda = {
    new Lambda().setTransform(f)
  }
}

class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable {
  def this() = this(Identifiable.randomUID("Lambda"))

  val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function")

  def setTransform(f: Dataset[_] => DataFrame): this.type = {
    set(transformFunc, udf(f, StringType))
  }

  def getTransform: Dataset[_] => DataFrame = {
    $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame]
  }

  val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation")

  def setTransformSchema(f: StructType => StructType): this.type = {
    set(transformSchemaFunc, udf(f, StringType))
  }

  def getTransformSchema: StructType => StructType = {
    $(transformSchemaFunc).f.asInstanceOf[StructType => StructType]
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    getTransform(dataset)
  }

  def transformSchema(schema: StructType): StructType = {
    if (get(transformSchemaFunc).isEmpty) {
      val sc = SparkContext.getOrCreate()
      val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema)
      transform(df).schema
    } else {
      getTransformSchema(schema)
    }
  }

  def copy(extra: ParamMap): Lambda = defaultCopy(extra)

}

Source File: StratifiedRepartitionSuite.scala From mmlspark with MIT License

5 votes

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.TaskContext
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class StratifiedRepartitionSuite extends TestBase with TransformerFuzzing[StratifiedRepartition] {

  import session.implicits._

  val values = "values"
  val colors = "colors"
  val const = "const"

  lazy val input = Seq(
    (0, "Blue", 2),
    (0, "Red", 2),
    (0, "Green", 2),
    (1, "Purple", 2),
    (1, "Orange", 2),
    (1, "Indigo", 2),
    (2, "Violet", 2),
    (2, "Black", 2),
    (2, "White", 2),
    (3, "Gray", 2),
    (3, "Yellow", 2),
    (3, "Cerulean", 2)
  ).toDF(values, colors, const)

  test("Assert doing a stratified repartition will ensure all keys exist across all partitions") {
    val inputSchema = new StructType()
      .add(values, IntegerType).add(colors, StringType).add(const, IntegerType)
    val inputEnc = RowEncoder(inputSchema)
    val valuesFieldIndex = inputSchema.fieldIndex(values)
    val numPartitions = 3
    val trainData = input.repartition(numPartitions).select(values, colors, const)
      .mapPartitions(iter => {
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        // Remove all instances of 0 class on partition 1
        if (partId == 1) {
          iter.flatMap(row => {
            if (row.getInt(valuesFieldIndex) <= 0)
              None
            else Some(row)
          })
        } else {
          // Add back at least 3 instances on other partitions
          val oneOfEachExample = List(Row(0, "Blue", 2), Row(1, "Purple", 2), Row(2, "Black", 2), Row(3, "Gray", 2))
          (iter.toList.union(oneOfEachExample).union(oneOfEachExample).union(oneOfEachExample)).toIterator
        }
      })(inputEnc).cache()
    // Some debug to understand what data is on which partition
    trainData.foreachPartition { rows =>
      rows.foreach { row =>
        val ctx = TaskContext.get
        val partId = ctx.partitionId
        println(s"Row: $row partition id: $partId")
      }
    }
    val stratifiedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Equal).transform(trainData)
    // Assert stratified data contains all keys across all partitions, with extra count
    // for it to be evaluated
    stratifiedInputData
      .mapPartitions(iter => {
        val actualLabels = iter.map(row => row.getInt(valuesFieldIndex))
          .toArray.distinct.sorted.toList
        val expectedLabels = (0 to 3).toList
        if (actualLabels != expectedLabels)
          throw new Exception(s"Missing labels, actual: $actualLabels, expected: $expectedLabels")
        iter
      })(inputEnc).count()
    val stratifiedMixedInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Mixed).transform(trainData)
    assert(stratifiedMixedInputData.count() >= trainData.count())
    val stratifiedOriginalInputData = new StratifiedRepartition().setLabelCol(values)
      .setMode(SPConstants.Original).transform(trainData)
    assert(stratifiedOriginalInputData.count() == trainData.count())
  }

  def testObjects(): Seq[TestObject[StratifiedRepartition]] = List(new TestObject(
    new StratifiedRepartition().setLabelCol(values).setMode(SPConstants.Equal), input))

  def reader: MLReadable[_] = StratifiedRepartition
}

Source File: HTTPSuite.scala From mmlspark with MIT License

5 votes

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split2

import java.io.File

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.io.http.HTTPSchema.string_to_response
import org.apache.http.impl.client.HttpClientBuilder
import org.apache.spark.sql.execution.streaming.{HTTPSinkProvider, HTTPSourceProvider}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StringType

class HTTPSuite extends TestBase with HTTPTestUtils {

  test("stream from HTTP", TestBase.Extended) {
    val q1 = session.readStream.format(classOf[HTTPSourceProvider].getName)
      .option("host", host)
      .option("port", port.toString)
      .option("path", apiPath)
      .load()
      .withColumn("contentLength", col("request.entity.contentLength"))
      .withColumn("reply", string_to_response(col("contentLength").cast(StringType)))
      .writeStream
      .format(classOf[HTTPSinkProvider].getName)
      .option("name", "foo")
      .queryName("foo")
      .option("replyCol", "reply")
      .option("checkpointLocation", new File(tmpDir.toFile, "checkpoints").toString)
      .start()

    Thread.sleep(5000)
    val client = HttpClientBuilder.create().build()
    val p1 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "here"), url)
    val p2 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "heree"), url)
    val p3 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereee"), url)
    val p4 = sendJsonRequest(client, Map("foo" -> 1, "bar" -> "hereeee"), url)
    val posts = List(p1, p2, p3, p4)
    val correctResponses = List(27, 28, 29, 30)

    posts.zip(correctResponses).foreach { p =>
      assert(p._1 === p._2.toString)
    }
    q1.stop()
    client.close()
  }

}

Source File: SimpleHTTPTransformerSuite.scala From mmlspark with MIT License

5 votes

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http.{HandlingUtils, JSONOutputParser, SimpleHTTPTransformer}
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{StringType, StructType}

class SimpleHTTPTransformerSuite
  extends TransformerFuzzing[SimpleHTTPTransformer] with WithServer {

  import session.implicits._

  lazy val df: DataFrame = sc.parallelize((1 to 10).map(Tuple1(_))).toDF("data")

  def simpleTransformer: SimpleHTTPTransformer =
    new SimpleHTTPTransformer()
      .setInputCol("data")
      .setOutputParser(new JSONOutputParser()
        .setDataType(new StructType().add("blah", StringType)))
      .setUrl(url)
      .setOutputCol("results")

  test("HttpTransformerTest") {
    val results = simpleTransformer.transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("HttpTransformerTest with Flaky Connection") {
    lazy val df2: DataFrame = sc.parallelize((1 to 5).map(Tuple1(_))).toDF("data")
    val results = simpleTransformer
      .setUrl(url + "/flaky")
      .setTimeout(1)
      .transform(df2).collect
    assert(results.length == 5)
  }

  test("Basic Handling") {
    val results = simpleTransformer
      .setHandler(HandlingUtils.basic)
      .transform(df).collect
    assert(results.length == 10)
    results.foreach(r =>
      assert(r.getStruct(2).getString(0) === "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  test("Concurrent HttpTransformerTest") {
    val results =
      new SimpleHTTPTransformer()
        .setInputCol("data")
        .setOutputParser(new JSONOutputParser()
          .setDataType(new StructType().add("blah", StringType)))
        .setUrl(url)
        .setOutputCol("results")
        .setConcurrency(3)
        .transform(df)
        .collect
    assert(results.length == 10)
    assert(results.forall(_.getStruct(2).getString(0) == "more blah"))
    assert(results(0).schema.fields.length == 3)
  }

  override def testObjects(): Seq[TestObject[SimpleHTTPTransformer]] =
    Seq(new TestObject(simpleTransformer, df))

  override def reader: MLReadable[_] = SimpleHTTPTransformer

}

Source File: ParserSuite.scala From mmlspark with MIT License

5 votes

// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import com.microsoft.ml.spark.io.http._
import org.apache.http.client.methods.HttpPost
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util.MLReadable
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, SparkSession}

trait ParserUtils extends WithServer {

  def sampleDf(spark: SparkSession): DataFrame = {
    val df = spark.createDataFrame((1 to 10).map(Tuple1(_)))
      .toDF("data")
    val df2 = new JSONInputParser().setInputCol("data")
      .setOutputCol("parsedInput").setUrl(url)
      .transform(df)
      .withColumn("unparsedOutput", udf({ x: Int =>
        HTTPResponseData(
          Array(),
          Some(EntityData(
            "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)),
          StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"),
          "en")
      }).apply(col("data"))
      )

    new JSONOutputParser()
      .setDataType(new StructType().add("foo", StringType))
      .setInputCol("unparsedOutput")
      .setOutputCol("parsedOutput")
      .transform(df2)
  }

  def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = {
    Seq(new TestObject(t, sampleDf(session)))
  }

}

class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject(
    new JSONInputParser().setInputCol("data").setOutputCol("out")
      .setUrl(url), session)

  override def reader: MLReadable[_] = JSONInputParser
}

class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject(
    new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setDataType(new StructType().add("foo", StringType)), session)

  override def reader: MLReadable[_] = JSONOutputParser
}

class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject(
    new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session)

  override def reader: MLReadable[_] = StringOutputParser
}

class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject(
    new CustomInputParser().setInputCol("data").setOutputCol("out")
      .setUDF({ x: Int => new HttpPost(s"http://$x") }), session)

  override def reader: MLReadable[_] = CustomInputParser
}

class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils {
  override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject(
    new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out")
      .setUDF({ x: HTTPResponseData => x.locale }), session)

  override def reader: MLReadable[_] = CustomOutputParser
}

Source File: Locus.scala From hail with MIT License

5 votes

package is.hail.variant

import is.hail.annotations.Annotation
import is.hail.check.Gen
import is.hail.expr.Parser
import is.hail.utils._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.json4s._

import scala.collection.JavaConverters._
import scala.language.implicitConversions

object Locus {
  val simpleContigs: Seq[String] = (1 to 22).map(_.toString) ++ Seq("X", "Y", "MT")

  def apply(contig: String, position: Int, rg: ReferenceGenome): Locus = {
    rg.checkLocus(contig, position)
    Locus(contig, position)
  }

  def annotation(contig: String, position: Int, rg: Option[ReferenceGenome]): Annotation = {
    rg match {
      case Some(ref) => Locus(contig, position, ref)
      case None => Annotation(contig, position)
    }
  }

  def sparkSchema: StructType =
    StructType(Array(
      StructField("contig", StringType, nullable = false),
      StructField("position", IntegerType, nullable = false)))

  def fromRow(r: Row): Locus = {
    Locus(r.getAs[String](0), r.getInt(1))
  }

  def gen(rg: ReferenceGenome): Gen[Locus] = for {
    (contig, length) <- Contig.gen(rg)
    pos <- Gen.choose(1, length)
  } yield Locus(contig, pos)

  def parse(str: String, rg: ReferenceGenome): Locus = {
    val elts = str.split(":")
    val size = elts.length
    if (size < 2)
      fatal(s"Invalid string for Locus. Expecting contig:pos -- found '$str'.")

    val contig = elts.take(size - 1).mkString(":")
    Locus(contig, elts(size - 1).toInt, rg)
  }

  def parseInterval(str: String, rg: ReferenceGenome, invalidMissing: Boolean = false): Interval =
    Parser.parseLocusInterval(str, rg, invalidMissing)

  def parseIntervals(arr: Array[String], rg: ReferenceGenome, invalidMissing: Boolean): Array[Interval] = arr.map(parseInterval(_, rg, invalidMissing))

  def parseIntervals(arr: java.util.List[String], rg: ReferenceGenome, invalidMissing: Boolean = false): Array[Interval] = parseIntervals(arr.asScala.toArray, rg, invalidMissing)

  def makeInterval(contig: String, start: Int, end: Int, includesStart: Boolean, includesEnd: Boolean,
    rgBase: ReferenceGenome, invalidMissing: Boolean = false): Interval = {
    val rg = rgBase.asInstanceOf[ReferenceGenome]
    rg.toLocusInterval(Interval(Locus(contig, start), Locus(contig, end), includesStart, includesEnd), invalidMissing)
  }
}

case class Locus(contig: String, position: Int) {
  def toRow: Row = Row(contig, position)

  def toJSON: JValue = JObject(
    ("contig", JString(contig)),
    ("position", JInt(position)))

  def copyChecked(rg: ReferenceGenome, contig: String = contig, position: Int = position): Locus = {
    rg.checkLocus(contig, position)
    Locus(contig, position)
  }

  def isAutosomalOrPseudoAutosomal(rg: ReferenceGenome): Boolean = isAutosomal(rg) || inXPar(rg) || inYPar(rg)

  def isAutosomal(rg: ReferenceGenome): Boolean = !(inX(rg) || inY(rg) || isMitochondrial(rg))

  def isMitochondrial(rg: ReferenceGenome): Boolean = rg.isMitochondrial(contig)

  def inXPar(rg: ReferenceGenome): Boolean = rg.inXPar(this)

  def inYPar(rg: ReferenceGenome): Boolean = rg.inYPar(this)

  def inXNonPar(rg: ReferenceGenome): Boolean = inX(rg) && !inXPar(rg)

  def inYNonPar(rg: ReferenceGenome): Boolean = inY(rg) && !inYPar(rg)

  private def inX(rg: ReferenceGenome): Boolean = rg.inX(contig)

  private def inY(rg: ReferenceGenome): Boolean = rg.inY(contig)

  override def toString: String = s"$contig:$position"
}

Source File: ConcatColumnBenchmark.scala From morpheus with Apache License 2.0

5 votes

package org.opencypher.morpheus.jmh

import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{DataFrame, SparkSession, functions}
import org.apache.spark.storage.StorageLevel
import org.opencypher.morpheus.impl.MorpheusFunctions
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.openjdk.jmh.annotations._

@State(Scope.Benchmark)
@BenchmarkMode(Array(Mode.AverageTime))
class ConcatColumnBenchmark {

  implicit var sparkSession: SparkSession = _

  var df: DataFrame = _

  @Setup
  def setUp(): Unit = {
    sparkSession = SparkSession.builder().master("local[*]").getOrCreate()
    val fromRow = 100000000L
    val numRows = 1000000
    val rangeDf = sparkSession.range(fromRow, fromRow + numRows).toDF("i")
    val indexCol = rangeDf.col("i")
    df = rangeDf
      .withColumn("s", indexCol.cast(StringType))
      .withColumn("b", indexCol.encodeLongAsMorpheusId)
      .partitionAndCache
  }

  @Benchmark
  def concatWs(): Int = {
    val result = df.withColumn("c", functions.concat_ws("|", df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  @Benchmark
  def serialize(): Int = {
    val result = df.withColumn("c", MorpheusFunctions.serialize(df.col("i"), df.col("s"), df.col("b")))
    result.select("c").collect().length
  }

  implicit class DataFrameSetup(df: DataFrame) {

    def partitionAndCache: DataFrame = {
      val cached = df.repartition(10).persist(StorageLevel.MEMORY_ONLY)
      cached.count()
      cached
    }
  }

}

Source File: LoadInteractionsInHive.scala From morpheus with Apache License 2.0

5 votes

package org.opencypher.morpheus.util

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}
import org.opencypher.morpheus.api.MorpheusSession

object LoadInteractionsInHive {

  val databaseName = "customers"
  val baseTableName = s"$databaseName.csv_input"

  def load(show: Boolean = false)(implicit session: MorpheusSession): DataFrame = {

    val datafile = getClass.getResource("/customer-interactions/csv/customer-interactions.csv").toURI.getPath
    val structType = StructType(Seq(
      StructField("interactionId", LongType, nullable = false),
      StructField("date", StringType, nullable = false),
      StructField("customerIdx", LongType, nullable = false),
      StructField("empNo", LongType, nullable = false),
      StructField("empName", StringType, nullable = false),
      StructField("type", StringType, nullable = false),
      StructField("outcomeScore", StringType, nullable = false),
      StructField("accountHolderId", StringType, nullable = false),
      StructField("policyAccountNumber", StringType, nullable = false),
      StructField("customerId", StringType, nullable = false),
      StructField("customerName", StringType, nullable = false)
    ))

    val baseTable: DataFrame = session.sparkSession.read
      .format("csv")
      .option("header", "true")
      .schema(structType)
      .load(datafile)

    if (show) baseTable.show()

    session.sql(s"DROP DATABASE IF EXISTS $databaseName CASCADE")
    session.sql(s"CREATE DATABASE $databaseName")

    baseTable.write.saveAsTable(s"$baseTableName")

    // Create views for nodes
    createView(baseTableName, "interactions", true, "interactionId", "date", "type", "outcomeScore")
    createView(baseTableName, "customers", true, "customerIdx", "customerId", "customerName")
    createView(baseTableName, "account_holders", true, "accountHolderId")
    createView(baseTableName, "policies", true, "policyAccountNumber")
    createView(baseTableName, "customer_reps", true, "empNo", "empName")

    // Create views for relationships
    createView(baseTableName, "has_customer_reps", false, "interactionId", "empNo")
    createView(baseTableName, "has_customers", false, "interactionId", "customerIdx")
    createView(baseTableName, "has_policies", false, "interactionId", "policyAccountNumber")
    createView(baseTableName, "has_account_holders", false, "interactionId", "accountHolderId")

    baseTable
  }

  def createView(fromTable: String, viewName: String, distinct: Boolean, columns: String*)
    (implicit session: MorpheusSession): Unit = {
    val distinctString = if (distinct) "DISTINCT" else ""

    session.sql(
      s"""
         |CREATE VIEW $databaseName.${viewName}_SEED AS
         | SELECT $distinctString ${columns.mkString(", ")}
         | FROM $fromTable
         | WHERE date < '2017-01-01'
      """.stripMargin)

    session.sql(
      s"""
         |CREATE VIEW $databaseName.${viewName}_DELTA AS
         | SELECT $distinctString ${columns.mkString(", ")}
         | FROM $fromTable
         | WHERE date >= '2017-01-01'
      """.stripMargin)
  }

}

Source File: MorpheusRecordHeaderTest.scala From morpheus with Apache License 2.0

5 votes

package org.opencypher.morpheus.impl.table

import org.apache.spark.sql.types.{ArrayType, StringType, StructField}
import org.opencypher.morpheus.impl.convert.SparkConversions._
import org.opencypher.okapi.api.types.{CTList, CTString}
import org.opencypher.okapi.ir.api.expr.Var
import org.opencypher.okapi.relational.impl.table.RecordHeader
import org.opencypher.okapi.testing.BaseTestSuite

class MorpheusRecordHeaderTest extends BaseTestSuite {

  it("computes a struct type from a given record header") {
    val header = RecordHeader.empty
      .withExpr(Var("a")(CTString))
      .withExpr(Var("b")(CTString.nullable))
      .withExpr(Var("c")(CTList(CTString.nullable)))

    header.toStructType.fields.toSet should equal(Set(
      StructField(header.column(Var("a")()), StringType, nullable = false),
      StructField(header.column(Var("b")()), StringType, nullable = true),
      StructField(header.column(Var("c")()), ArrayType(StringType, containsNull = true), nullable = false)
    ))
  }

}

Source File: NGram.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  @Since("1.5.0")
  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}

Source File: MapDataSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class MapDataSuite extends SparkFunSuite {

  test("inequality tests") {
    def u(str: String): UTF8String = UTF8String.fromString(str)

    // test data
    val testMap1 = Map(u("key1") -> 1)
    val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
    val testMap3 = Map(u("key1") -> 1)
    val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }
}

Source File: ScalaUDFSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.sql.types.{IntegerType, StringType}

class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("basic") {
    val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil)
    checkEvaluation(intUdf, 2)

    val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil)
    checkEvaluation(stringUdf, "ax")
  }

  test("better error message for NPE") {
    val udf = ScalaUDF(
      (s: String) => s.toLowerCase,
      StringType,
      Literal.create(null, StringType) :: Nil)

    val e1 = intercept[SparkException](udf.eval())
    assert(e1.getMessage.contains("Failed to execute user defined function"))

    val e2 = intercept[SparkException] {
      checkEvalutionWithUnsafeProjection(udf, null)
    }
    assert(e2.getMessage.contains("Failed to execute user defined function"))
  }

}

Source File: RewriteDistinctAggregatesSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{If, Literal}
import org.apache.spark.sql.catalyst.expressions.aggregate.{CollectSet, Count}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
import org.apache.spark.sql.types.{IntegerType, StringType}

class RewriteDistinctAggregatesSuite extends PlanTest {
  val conf = SimpleCatalystConf(caseSensitiveAnalysis = false, groupByOrdinal = false)
  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)

  val nullInt = Literal(null, IntegerType)
  val nullString = Literal(null, StringType)
  val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int)

  private def checkRewrite(rewrite: LogicalPlan): Unit = rewrite match {
    case Aggregate(_, _, Aggregate(_, _, _: Expand)) =>
    case _ => fail(s"Plan is not rewritten:\n$rewrite")
  }

  test("single distinct group") {
    val input = testRelation
      .groupBy('a)(countDistinct('e))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        max('b).as('agg2))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with non-partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        CollectSet('b).toAggregateExpression().as('agg2))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with partial aggregates") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d), sum('e))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with non-partial aggregates") {
    val input = testRelation
      .groupBy('a)(
        countDistinct('b, 'c),
        countDistinct('d),
        CollectSet('b).toAggregateExpression())
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }
}

Source File: resources.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: WholeStageCodegenSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.expressions.scalalang.typed
import org.apache.spark.sql.functions.{avg, broadcast, col, max}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{IntegerType, StringType, StructType}

class WholeStageCodegenSuite extends SparkPlanTest with SharedSQLContext {

  test("range/filter should be combined") {
    val df = spark.range(10).filter("id = 1").selectExpr("id + 1")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)
    assert(df.collect() === Array(Row(2)))
  }

  test("Aggregate should be included in WholeStageCodegen") {
    val df = spark.range(10).groupBy().agg(max(col("id")), avg(col("id")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(9, 4.5)))
  }

  test("Aggregate with grouping keys should be included in WholeStageCodegen") {
    val df = spark.range(3).groupBy("id").count().orderBy("id")
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(df.collect() === Array(Row(0, 1), Row(1, 1), Row(2, 1)))
  }

  test("BroadcastHashJoin should be included in WholeStageCodegen") {
    val rdd = spark.sparkContext.makeRDD(Seq(Row(1, "1"), Row(1, "1"), Row(2, "2")))
    val schema = new StructType().add("k", IntegerType).add("v", StringType)
    val smallDF = spark.createDataFrame(rdd, schema)
    val df = spark.range(10).join(broadcast(smallDF), col("k") === col("id"))
    assert(df.queryExecution.executedPlan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[BroadcastHashJoinExec]).isDefined)
    assert(df.collect() === Array(Row(1, 1, "1"), Row(1, 1, "1"), Row(2, 2, "2")))
  }

  test("Sort should be included in WholeStageCodegen") {
    val df = spark.range(3, 0, -1).toDF().sort(col("id"))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SortExec]).isDefined)
    assert(df.collect() === Array(Row(1), Row(2), Row(3)))
  }

  test("MapElements should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = spark.range(10).map(_.toString)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[SerializeFromObjectExec]).isDefined)
    assert(ds.collect() === 0.until(10).map(_.toString).toArray)
  }

  test("typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 2, 4, 6, 8))
  }

  test("back-to-back typed filter should be included in WholeStageCodegen") {
    val ds = spark.range(10).filter(_ % 2 == 0).filter(_ % 3 == 0)
    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
      p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[FilterExec]).isDefined)
    assert(ds.collect() === Array(0, 6))
  }

  test("simple typed UDAF should be included in WholeStageCodegen") {
    import testImplicits._

    val ds = Seq(("a", 10), ("b", 1), ("b", 2), ("c", 1)).toDS()
      .groupByKey(_._1).agg(typed.sum(_._2))

    val plan = ds.queryExecution.executedPlan
    assert(plan.find(p =>
      p.isInstanceOf[WholeStageCodegenExec] &&
        p.asInstanceOf[WholeStageCodegenExec].child.isInstanceOf[HashAggregateExec]).isDefined)
    assert(ds.collect() === Array(("a", 10.0), ("b", 3.0), ("c", 1.0)))
  }
}

Source File: GroupedIteratorSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
}

Source File: DDLSourceLoadSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: Tokenizer.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  def getPattern: String = $(pattern)

  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

  override protected def createTransformFunc: String => Seq[String] = { str =>
    val re = $(pattern).r
    val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
    val minLength = $(minTokenLength)
    tokens.filter(_.length >= minLength)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)

  override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}

Source File: GenerateOrdering.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.Logging
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.{BinaryType, StringType, NumericType}


object GenerateOrdering extends CodeGenerator[Seq[SortOrder], Ordering[Row]] with Logging {
  import scala.reflect.runtime.{universe => ru}
  import scala.reflect.runtime.universe._

 protected def canonicalize(in: Seq[SortOrder]): Seq[SortOrder] =
    in.map(ExpressionCanonicalizer.execute(_).asInstanceOf[SortOrder])

  protected def bind(in: Seq[SortOrder], inputSchema: Seq[Attribute]): Seq[SortOrder] =
    in.map(BindReferences.bindReference(_, inputSchema))

  protected def create(ordering: Seq[SortOrder]): Ordering[Row] = {
    val a = newTermName("a")
    val b = newTermName("b")
    val comparisons = ordering.zipWithIndex.map { case (order, i) =>
      val evalA = expressionEvaluator(order.child)
      val evalB = expressionEvaluator(order.child)

      val compare = order.child.dataType match {
        case BinaryType =>
          q"""
          val x = ${if (order.direction == Ascending) evalA.primitiveTerm else evalB.primitiveTerm}
          val y = ${if (order.direction != Ascending) evalB.primitiveTerm else evalA.primitiveTerm}
          var i = 0
          while (i < x.length && i < y.length) {
            val res = x(i).compareTo(y(i))
            if (res != 0) return res
            i = i+1
          }
          return x.length - y.length
          """
        case _: NumericType =>
          q"""
          val comp = ${evalA.primitiveTerm} - ${evalB.primitiveTerm}
          if(comp != 0) {
            return ${if (order.direction == Ascending) q"comp.toInt" else q"-comp.toInt"}
          }
          """
        case StringType =>
          if (order.direction == Ascending) {
            q"""return ${evalA.primitiveTerm}.compare(${evalB.primitiveTerm})"""
          } else {
            q"""return ${evalB.primitiveTerm}.compare(${evalA.primitiveTerm})"""
          }
      }

      q"""
        i = $a
        ..${evalA.code}
        i = $b
        ..${evalB.code}
        if (${evalA.nullTerm} && ${evalB.nullTerm}) {
          // Nothing
        } else if (${evalA.nullTerm}) {
          return ${if (order.direction == Ascending) q"-1" else q"1"}
        } else if (${evalB.nullTerm}) {
          return ${if (order.direction == Ascending) q"1" else q"-1"}
        } else {
          $compare
        }
      """
    }

    val q"class $orderingName extends $orderingType { ..$body }" = reify {
      class SpecificOrdering extends Ordering[Row] {
        val o = ordering
      }
    }.tree.children.head

    val code = q"""
      class $orderingName extends $orderingType {
        ..$body
        def compare(a: $rowType, b: $rowType): Int = {
          var i: $rowType = null // Holds current row being evaluated.
          ..$comparisons
          return 0
        }
      }
      new $orderingName()
      """
    logDebug(s"Generated Ordering: $code")
    toolBox.eval(code).asInstanceOf[Ordering[Row]]
  }
}

Source File: SparkSQLParser.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.sql

import scala.util.parsing.combinator.RegexParsers

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution._
import org.apache.spark.sql.types.StringType



private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser {

  // A parser for the key-value part of the "SET [key = [value ]]" syntax
  private object SetCommandParser extends RegexParsers {
    private val key: Parser[String] = "(?m)[^=]+".r

    private val value: Parser[String] = "(?m).*$".r

    private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)())

    private val pair: Parser[LogicalPlan] =
      (key ~ ("=".r ~> value).?).? ^^ {
        case None => SetCommand(None, output)
        case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)), output)
      }

    def apply(input: String): LogicalPlan = parseAll(pair, input) match {
      case Success(plan, _) => plan
      case x => sys.error(x.toString)
    }
  }

  protected val AS = Keyword("AS")
  protected val CACHE = Keyword("CACHE")
  protected val CLEAR = Keyword("CLEAR")
  protected val IN = Keyword("IN")
  protected val LAZY = Keyword("LAZY")
  protected val SET = Keyword("SET")
  protected val SHOW = Keyword("SHOW")
  protected val TABLE = Keyword("TABLE")
  protected val TABLES = Keyword("TABLES")
  protected val UNCACHE = Keyword("UNCACHE")

  override protected lazy val start: Parser[LogicalPlan] = cache | uncache | set | show | others

  private lazy val cache: Parser[LogicalPlan] =
    CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ {
      case isLazy ~ tableName ~ plan =>
        CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined)
    }

  private lazy val uncache: Parser[LogicalPlan] =
    ( UNCACHE ~ TABLE ~> ident ^^ {
        case tableName => UncacheTableCommand(tableName)
      }
    | CLEAR ~ CACHE ^^^ ClearCacheCommand
    )

  private lazy val set: Parser[LogicalPlan] =
    SET ~> restInput ^^ {
      case input => SetCommandParser(input)
    }

  private lazy val show: Parser[LogicalPlan] =
    SHOW ~> TABLES ~ (IN ~> ident).? ^^ {
      case _ ~ dbName => ShowTablesCommand(dbName)
    }

  private lazy val others: Parser[LogicalPlan] =
    wholeInput ^^ {
      case input => fallback(input)
    }

}

Source File: ListTablesSuite.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.sql

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.test.TestSQLContext
import org.apache.spark.sql.test.TestSQLContext._
import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}

class ListTablesSuite extends QueryTest with BeforeAndAfter {

  import org.apache.spark.sql.test.TestSQLContext.implicits._

  val df =
    sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  before {
    df.registerTempTable("ListTablesSuiteTable")
  }

  after {
    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
  }

  test("get all tables") {
    checkAnswer(
      tables().filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    checkAnswer(
      sql("SHOW tables").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("getting all Tables with a database name has no impact on returned table names") {
    checkAnswer(
      tables("DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    checkAnswer(
      sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    assert(tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("query the returned DataFrame of tables") {
    val expectedSchema = StructType(
      StructField("tableName", StringType, false) ::
      StructField("isTemporary", BooleanType, false) :: Nil)

    Seq(tables(), sql("SHOW TABLes")).foreach {
      case tableDF =>
        assert(expectedSchema === tableDF.schema)

        tableDF.registerTempTable("tables")
        checkAnswer(
          sql("SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
          Row(true, "ListTablesSuiteTable")
        )
        checkAnswer(
          tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
          Row("tables", true))
        dropTempTable("tables")
    }
  }
}

Source File: DiscreteDistributionBuilder.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables.dataframe.report.distribution.discrete

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{BooleanType, StringType, StructField}

import ai.deepsense.deeplang.doperables.dataframe.report.DataFrameReportGenerator
import ai.deepsense.deeplang.doperables.dataframe.report.distribution.{DistributionBuilder, NoDistributionReasons}
import ai.deepsense.deeplang.doperables.report.ReportUtils
import ai.deepsense.deeplang.utils.aggregators.Aggregator
import ai.deepsense.deeplang.utils.aggregators.AggregatorBatch.BatchedResult
import ai.deepsense.reportlib.model.{DiscreteDistribution, Distribution, NoDistribution}

case class DiscreteDistributionBuilder(
    categories: Aggregator[Option[scala.collection.mutable.Map[String, Long]], Row],
    missing: Aggregator[Long, Row],
    field: StructField)
  extends DistributionBuilder {

  def allAggregators: Seq[Aggregator[_, Row]] = Seq(categories, missing)

  override def build(results: BatchedResult): Distribution = {
    val categoriesMap = results.forAggregator(categories)
    val nullsCount = results.forAggregator(missing)

    categoriesMap match {
      case Some(occurrencesMap) => {
        val labels = field.dataType match {
          case StringType => occurrencesMap.keys.toSeq.sorted
          // We always want two labels, even when all elements are true or false
          case BooleanType => Seq(false.toString, true.toString)
        }
        val counts = labels.map(occurrencesMap.getOrElse(_, 0L))
        DiscreteDistribution(
          field.name,
          s"Discrete distribution for ${field.name} column",
          nullsCount,
          labels.map(ReportUtils.shortenLongStrings(_,
            DataFrameReportGenerator.StringPreviewMaxLength)),
          counts)
      }
      case None => NoDistribution(
        field.name,
        NoDistributionReasons.TooManyDistinctCategoricalValues
      )
    }
  }
}

Source File: CountVectorizerExample.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperations.examples

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, StructType}

import ai.deepsense.deeplang.doperables.dataframe.{DataFrame, DataFrameBuilder}
import ai.deepsense.deeplang.doperations.spark.wrappers.estimators.CountVectorizer

class CountVectorizerExample extends AbstractOperationExample[CountVectorizer]{
  override def dOperation: CountVectorizer = {
    val op = new CountVectorizer()
    op.estimator
      .setInputColumn("lines")
      .setNoInPlace("lines_out")
      .setMinTF(3)
    op.set(op.estimator.extractParamMap())
  }

  override def inputDataFrames: Seq[DataFrame] = {
    val rows = Seq(
      Row("a a a b b c c c d ".split(" ").toSeq),
      Row("c c c c c c".split(" ").toSeq),
      Row("a".split(" ").toSeq),
      Row("e e e e e".split(" ").toSeq))
    val rdd = sparkContext.parallelize(rows)
    val schema = StructType(Seq(StructField("lines", ArrayType(StringType, containsNull = true))))
    Seq(DataFrameBuilder(sparkSQLSession).buildDataFrame(schema, rdd))
  }
}

Source File: UnionIntegSpec.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperations

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.doperations.exceptions.SchemaMismatchException
import ai.deepsense.deeplang.inference.{InferContext, InferenceWarnings}
import ai.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}

class UnionIntegSpec extends DeeplangIntegTestSupport {

  import DeeplangIntegTestSupport._
  val schema1 = StructType(List(
    StructField("column1", DoubleType),
    StructField("column2", DoubleType)))

  val rows1_1 = Seq(
    Row(1.0, 2.0),
    Row(2.0, 3.0)
  )

  "Union" should {
    "return a union of two DataFrames" in {
      val rows1_2 = Seq(
        Row(2.0, 4.0),
        Row(4.0, 6.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows1_2, schema1)

      val merged = Union()
        .executeUntyped(Vector(df1, df2))(executionContext)
        .head.asInstanceOf[DataFrame]

      assertDataFramesEqual(
        merged, createDataFrame(rows1_1 ++ rows1_2, schema1))
    }

    "throw for mismatching types in DataFrames" in {
      val schema2 = StructType(List(
        StructField("column1", StringType),
        StructField("column2", DoubleType)))

      val rows2_1 = Seq(
        Row("a", 1.0),
        Row("b", 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }

    "throw for mismatching column names in DataFrames" in {
      val schema2 = StructType(List(
        StructField("column1", DoubleType),
        StructField("different_column_name", DoubleType)))

      val rows2_1 = Seq(
        Row(1.1, 1.0),
        Row(1.1, 1.0)
      )

      val df1 = createDataFrame(rows1_1, schema1)
      val df2 = createDataFrame(rows2_1, schema2)

      a [SchemaMismatchException] should be thrownBy {
        Union().executeUntyped(Vector(df1, df2))(executionContext)
      }
    }
  }

  it should {
    "propagate schema when both schemas match" in {
      val structType = StructType(Seq(
        StructField("x", DoubleType),
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType))
      Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]) shouldBe
        (Vector(knowledgeDF1), InferenceWarnings())
    }
    "generate error when schemas don't match" in {
      val structType1 = StructType(Seq(
        StructField("x", DoubleType)))
      val structType2 = StructType(Seq(
        StructField("y", DoubleType)))
      val knowledgeDF1 = DKnowledge(DataFrame.forInference(structType1))
      val knowledgeDF2 = DKnowledge(DataFrame.forInference(structType2))
      an [SchemaMismatchException] shouldBe thrownBy(
        Union().inferKnowledgeUntyped(Vector(knowledgeDF1, knowledgeDF2))(mock[InferContext]))
    }
  }
}

Source File: AbstractEvaluatorSmokeTest.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.ParamPair
import ai.deepsense.deeplang.{DKnowledge, DeeplangIntegTestSupport}
import ai.deepsense.sparkutils.Linalg.Vectors

abstract class AbstractEvaluatorSmokeTest extends DeeplangIntegTestSupport {

  def className: String

  val evaluator: Evaluator

  val evaluatorParams: Seq[ParamPair[_]]

  val inputDataFrameSchema = StructType(Seq(
    StructField("s", StringType),
    StructField("prediction", DoubleType),
    StructField("rawPrediction", new ai.deepsense.sparkutils.Linalg.VectorUDT),
    StructField("label", DoubleType)
  ))

  val inputDataFrame: DataFrame = {
    val rowSeq = Seq(
      Row("aAa bBb cCc dDd eEe f", 1.0, Vectors.dense(2.1, 2.2, 2.3), 3.0),
      Row("das99213 99721 8i!#@!", 4.0, Vectors.dense(5.1, 5.2, 5.3), 6.0)
    )
    createDataFrame(rowSeq, inputDataFrameSchema)
  }

  def setUpStubs(): Unit = ()

  className should {
    "successfully run _evaluate()" in {
      setUpStubs()
      evaluator.set(evaluatorParams: _*)._evaluate(executionContext, inputDataFrame)
    }
    "successfully run _infer()" in {
      evaluator.set(evaluatorParams: _*)._infer(DKnowledge(inputDataFrame))
    }
    "successfully run report" in {
      evaluator.set(evaluatorParams: _*).report()
    }
  }
}

Source File: StringTokenizerSmokeTest.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class StringTokenizerSmokeTest
  extends AbstractTransformerWrapperSmokeTest[StringTokenizer]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: StringTokenizer = {
     val inPlace = NoInPlaceChoice()
      .setOutputColumn("tokenized")

    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("s"))
      .setInPlace(inPlace)

    val transformer = new StringTokenizer()
    transformer.set(Seq(
      transformer.singleOrMultiChoiceParam -> single
    ): _*)
  }

  override def testValues: Seq[(Any, Any)] = {
    val strings = Seq(
      "this is a test",
      "this values should be separated",
      "Bla bla bla!"
    )

    val tokenized = strings.map { _.toLowerCase.split("\\s") }
    strings.zip(tokenized)
  }

  override def inputType: DataType = StringType

  override def outputType: DataType = new ArrayType(StringType, true)
}

Source File: RegexTokenizerSmokeTest.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class RegexTokenizerSmokeTest
  extends AbstractTransformerWrapperSmokeTest[RegexTokenizer]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: RegexTokenizer = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("tokenized")

    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("s"))
      .setInPlace(inPlace)

    val transformer = new RegexTokenizer()
    transformer.set(Seq(
      transformer.singleOrMultiChoiceParam -> single,
      transformer.gaps -> false,
      transformer.minTokenLength -> 1,
      transformer.pattern -> "\\d+"
    ): _*)
  }

  override def testValues: Seq[(Any, Any)] = {
    val strings = Seq(
      "100 200 300",
      "400 500 600",
      "700 800 900"
    )

    val tokenized = strings.map { _.toLowerCase.split(" ") }
    strings.zip(tokenized)
  }

  override def inputType: DataType = StringType

  override def outputType: DataType = new ArrayType(StringType, true)
}

Source File: NGramTransformerSmokeTest.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class NGramTransformerSmokeTest
  extends AbstractTransformerWrapperSmokeTest[NGramTransformer]
  with MultiColumnTransformerWrapperTestSupport {

  override def transformerWithParams: NGramTransformer = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("ngrams")

    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("as"))
      .setInPlace(inPlace)

    val transformer = new NGramTransformer()
    transformer.set(Seq(
      transformer.singleOrMultiChoiceParam -> single,
      transformer.n -> 2
    ): _*)
  }

  override def testValues: Seq[(Any, Any)] = {
    val strings = Seq(
      Array("a", "b", "c"),
      Array("d", "e", "f")
    )

    val ngrams = Seq(
      Array("a b", "b c"),
      Array("d e", "e f")
    )
    strings.zip(ngrams)
  }

  override def inputType: DataType = new ArrayType(StringType, true)

  override def outputType: DataType = new ArrayType(StringType, false)
}

Source File: StopWordsRemoverSmokeTest.scala From seahorse with Apache License 2.0

5 votes

package ai.deepsense.deeplang.doperables.spark.wrappers.transformers

import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice
import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice
import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection

class StopWordsRemoverSmokeTest
    extends AbstractTransformerWrapperSmokeTest[StopWordsRemover]
    with MultiColumnTransformerWrapperTestSupport  {

  override def transformerWithParams: StopWordsRemover = {
    val inPlace = NoInPlaceChoice()
      .setOutputColumn("stopWordsRemoverOutput")
    val single = SingleColumnChoice()
      .setInputColumn(NameSingleColumnSelection("as"))
      .setInPlace(inPlace)

    val stopWordsRemover = new StopWordsRemover()
    stopWordsRemover.set(
      stopWordsRemover.singleOrMultiChoiceParam -> single,
      stopWordsRemover.caseSensitive -> false)
  }

  override def testValues: Seq[(Any, Any)] = {
    val inputNumbers = Seq(Array("a", "seahorse", "The", "Horseshoe", "Crab"))
    val outputNumbers = Seq(Array("seahorse", "Horseshoe", "Crab"))
    inputNumbers.zip(outputNumbers)
  }

  override def inputType: DataType = ArrayType(StringType)

  override def outputType: DataType = ArrayType(StringType)
}

Source File: HBaseLoader.scala From DataQuality with GNU Lesser General Public License v3.0

5 votes

package it.agilelab.bigdata.DataQuality.utils.io.db.readers

import it.agilelab.bigdata.DataQuality.sources.HBaseSrcConfig
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import it.nerdammer.spark.hbase._
import it.nerdammer.spark.hbase.conversion.FieldReader
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}


object HBaseLoader {

  private def seqReader[T]()(implicit m1: FieldReader[Option[T]]): FieldReader[Seq[Option[T]]] = new FieldReader[Seq[Option[T]]] {
    def map(data: HBaseData): Seq[Option[T]] = data.map(optArr => m1.map(Iterable(optArr))).toSeq
  }

  
  def loadToDF(conf: HBaseSrcConfig)(implicit sqlContext: SQLContext): DataFrame = {

    implicit val stringSeqReader: FieldReader[Seq[Option[String]]] = seqReader[String]()

    val header: Seq[String] = Seq("key") ++ conf.hbaseColumns
    val rdd: RDD[Row] =
      sqlContext.sparkContext
        .hbaseTable[Seq[Option[String]]](conf.table)
        .select(conf.hbaseColumns: _*)
        .map {
          case (s: Seq[Option[String]]) => Row.fromSeq(s)
        }
    val struct = StructType(header.map(StructField(_, StringType)))
    sqlContext.createDataFrame(rdd, struct)
  }

}

Source File: ColumnDescriptorSuite.scala From kyuubi with Apache License 2.0

5 votes

package yaooqinn.kyuubi.schema

import org.apache.hive.service.cli.thrift.{TCLIServiceConstants, TTypeId}
import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{DecimalType, StringType, StructType}

class ColumnDescriptorSuite extends SparkFunSuite {

  test("Column Descriptor basic test") {
    val col1 = "a"
    val col2 = "b"
    val comments = "no comments"
    val schema = new StructType()
      .add(col1, StringType, nullable = true, comments)
      .add(col2, DecimalType(10, 9), nullable = true, "")

    val tColumnDescs =
      (0 until schema.length).map(i => ColumnDescriptor(schema(i), i)).map(_.toTColumnDesc)
    assert(tColumnDescs.head.getColumnName === col1)
    assert(tColumnDescs.head.getComment === comments)
    assert(tColumnDescs.head.getPosition === 0)
    assert(tColumnDescs.head.getTypeDesc === TypeDescriptor(StringType).toTTypeDesc)
    assert(tColumnDescs.head.getTypeDesc.getTypesSize === 1)
    assert(tColumnDescs.head.getTypeDesc.getTypes.get(0)
      .getPrimitiveEntry.getTypeQualifiers === null)
    assert(tColumnDescs.head.getTypeDesc.getTypes.get(0)
      .getPrimitiveEntry.getType === TTypeId.STRING_TYPE)

    assert(tColumnDescs(1).getColumnName === col2)
    assert(tColumnDescs(1).getComment === "")
    assert(tColumnDescs(1).getPosition === 1)
    assert(tColumnDescs(1).getTypeDesc.getTypesSize === 1)
    assert(tColumnDescs(1)
      .getTypeDesc
      .getTypes.get(0)
      .getPrimitiveEntry
      .getTypeQualifiers
      .getQualifiers
      .get(TCLIServiceConstants.PRECISION).getI32Value === 10)
    assert(tColumnDescs(1)
      .getTypeDesc
      .getTypes.get(0)
      .getPrimitiveEntry
      .getTypeQualifiers
      .getQualifiers
      .get(TCLIServiceConstants.SCALE).getI32Value === 9)
    assert(tColumnDescs(1).getTypeDesc.getTypes.get(0)
      .getPrimitiveEntry.getType === TTypeId.DECIMAL_TYPE)

  }

  test("field is null") {
    val tColumnDesc = ColumnDescriptor(null, 0).toTColumnDesc
    assert(tColumnDesc.isSetPosition)
    assert(!tColumnDesc.isSetColumnName)
    assert(!tColumnDesc.isSetTypeDesc)
    assert(!tColumnDesc.isSetComment)
  }
}

Source File: FilterSpec.scala From jgit-spark-connector with Apache License 2.0

5 votes

package tech.sourced.engine.util

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types.StringType
import org.scalatest.{FlatSpec, Matchers}

class FilterSpec extends FlatSpec with Matchers {
  "CompiledFilters" should "filter properly depending of his type" in {
    val eq = EqualFilter(Attr("test", ""), "a")

    eq.eval("a") should be(true)
    eq.eval("b") should be(false)

    val notEq = NotFilter(EqualFilter(Attr("test", ""), "a"))

    notEq.eval("a") should be(false)
    notEq.eval("b") should be(true)

    val in = InFilter(Attr("test", ""), Array("a", "b", "c"))

    in.eval("a") should be(true)
    in.eval("b") should be(true)
    in.eval("c") should be(true)
    in.eval("d") should be(false)

    val gt = GreaterThanFilter(Attr("test", ""), 5)

    gt.eval(4) should be(false)
    gt.eval(5) should be(false)
    gt.eval(6) should be(true)

    val gte = GreaterThanOrEqualFilter(Attr("test", ""), 5)

    gte.eval(4) should be(false)
    gte.eval(5) should be(true)
    gte.eval(6) should be(true)

    val lt = LessThanFilter(Attr("test", ""), 5)

    lt.eval(4) should be(true)
    lt.eval(5) should be(false)
    lt.eval(6) should be(false)

    val lte = LessThanOrEqualFilter(Attr("test", ""), 5)

    lte.eval(4) should be(true)
    lte.eval(5) should be(true)
    lte.eval(6) should be(false)
  }

  "ColumnFilter" should "process correctly columns" in {
    // test = 'val' AND test IS NOT NULL AND test2 = 'val2' AND test3 IN ('a', 'b')
    val f = Filter.compile(And(
      And(
        And(
          EqualTo(AttributeReference("test", StringType)(), Literal("val")),
          IsNotNull(AttributeReference("test", StringType)())
        ),
        EqualTo(AttributeReference("test2", StringType)(), Literal("val2"))
      ),
      In(AttributeReference("test3", StringType)(), Seq(Literal("a"), Literal("b")))
    ))

    f.length should be(4)
    val filters = Filters(f)
    filters.matches(Seq("test"), "val") should be(true)
    filters.matches(Seq("test2"), "val") should be(false)
    filters.matches(Seq("test3"), "b") should be(true)
  }

  "ColumnFilter" should "handle correctly unsupported filters" in {
    val f = Filter.compile(StartsWith(AttributeReference("test", StringType)(), Literal("a")))

    f.length should be(0)
  }
}

Source File: MetadataIteratorSpec.scala From jgit-spark-connector with Apache License 2.0

5 votes

package tech.sourced.engine.iterator

import java.nio.file.Paths
import java.util.{Properties, UUID}

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{Metadata, StringType, StructType}
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}
import tech.sourced.engine.{BaseSparkSpec, Schema}

class JDBCQueryIteratorSpec
  extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec {
  private val tmpPath = Paths.get(
    System.getProperty("java.io.tmpdir"),
    UUID.randomUUID.toString
  )

  private val dbPath = tmpPath.resolve("test.db")

  override def beforeAll(): Unit = {
    super.beforeAll()
    tmpPath.toFile.mkdir()
    val rdd = ss.sparkContext.parallelize(Seq(
      Row("id1"),
      Row("id2"),
      Row("id3")
    ))

    val properties = new Properties()
    properties.put("driver", "org.sqlite.JDBC")
    val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head)))
    df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties)
  }

  override def afterAll(): Unit = {
    super.afterAll()
    FileUtils.deleteQuietly(tmpPath.toFile)
  }

  "JDBCQueryIterator" should "return all rows for the query" in {
    val iter = new JDBCQueryIterator(
      Seq(attr("id")),
      dbPath.toString,
      "SELECT id FROM repositories ORDER BY id"
    )

    // calling hasNext more than one time does not cause rows to be lost
    iter.hasNext
    iter.hasNext
    val rows = (for (row <- iter) yield row).toArray
    rows.length should be(3)
    rows(0).length should be(1)
    rows(0)(0).toString should be("id1")
    rows(1)(0).toString should be("id2")
    rows(2)(0).toString should be("id3")
  }

  private def attr(name: String): Attribute = AttributeReference(
    name, StringType, nullable = false, Metadata.empty
  )()
}

Source File: MultilayerPerceptronClassifierExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}


    result.show(5)
    val predictionAndLabels = result.select("prediction", "label")
    //多分类评估
    val evaluator = new MulticlassClassificationEvaluator()
      .setMetricName("precision")
    //准确率 Accuracy: 0.9636363636363636
    println("Accuracy: " + evaluator.evaluate(predictionAndLabels))
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: GradientBoostedTreeRegressorExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{GBTRegressionModel, GBTRegressor}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")//标签列名
      //预测结果列名
      .setPredictionCol("prediction")
       //rmse均方根误差说明样本的离散程度
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
     //rmse均方根误差说明样本的离散程度
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val gbtModel = model.stages(1).asInstanceOf[GBTRegressionModel]
    println("Learned regression GBT model:\n" + gbtModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: LinearRegressionWithElasticNetExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    trainingSummary.residuals.show()
    //rmse均方根误差说明样本的离散程度
    //RMSE: 10.189126225286143
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    //R2平方系统也称判定系数,用来评估模型拟合数据的好坏
    //r2: 0.02285205756871944
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: NaiveBayesExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.classification.NaiveBayes
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}
// $example off$


    predictions.show(5)

    // Select (prediction, true label) and compute test error
    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("label")//标签列名
      .setPredictionCol("prediction")//预测结果列名
      .setMetricName("precision")//准确率
    //Accuracy: 1.0
    val accuracy = evaluator.evaluate(predictions)
    println("Accuracy: " + accuracy)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: VectorSlicerExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    output.show()
    println(output.select("userFeatures", "features").first())
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: CountVectorizerExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{CountVectorizer, CountVectorizerModel}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

     //transform()方法将DataFrame转化为另外一个DataFrame的算法
    cvm.transform(df).select("features").show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: Word2VecExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec

// $example off$

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    result.show()
    result.select("result").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: TokenizerExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{RegexTokenizer, Tokenizer}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    regexTokenized.show()
    regexTokenized.select("words", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: StringIndexerExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StringIndexer
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    val indexer = new StringIndexer()
      .setInputCol("category")
      .setOutputCol("categoryIndex")
    //fit()方法将DataFrame转化为一个Transformer的算法
    //transform()方法将DataFrame转化为另外一个DataFrame的算法
    val indexed = indexer.fit(df).transform(df)
    indexed.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: DCTExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.mllib.linalg.Vectors
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

object DCTExample {
  def main(args: Array[String]): Unit = {
    
    val conf = new SparkConf().setAppName("DCTExample").setMaster("local[4]")
    val sc = new SparkContext(conf)
  
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    
   

    // $example on$
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0))

    val df = sqlContext.createDataFrame(data.map(Tuple1.apply)).toDF("features")
    //离散余弦变换(DCT)
    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)
    //transform()方法将DataFrame转化为另外一个DataFrame的算法
    val dctDf = dct.transform(df)
    
    dctDf.select("featuresDCT").show(3)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: RandomForestRegressorExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.ml.regression.{RandomForestRegressionModel, RandomForestRegressor}
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    predictions.select("prediction", "label", "features").show(5)

    // Select (prediction, true label) and compute test error.
    val evaluator = new RegressionEvaluator()
      .setLabelCol("label")
	//算法预测结果的存储列的名称, 默认是”prediction”
      .setPredictionCol("prediction")
      //rmse均方根误差说明样本的离散程度
      .setMetricName("rmse")
    val rmse = evaluator.evaluate(predictions)
    //Root Mean Squared Error (RMSE) on test data = 0.09854713827168428
    println("Root Mean Squared Error (RMSE) on test data = " + rmse)

    val rfModel = model.stages(1).asInstanceOf[RandomForestRegressionModel]
    println("Learned regression forest model:\n" + rfModel.toDebugString)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: StopWordsRemoverExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.StopWordsRemover
// $example off$
import org.apache.spark.sql.Row
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    //transform()方法将DataFrame转化为另外一个DataFrame的算法
    remover.transform(dataSet).show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: TfIdfExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

    rescaledData.show()
    rescaledData.select("features", "label").take(3).foreach(println)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: VectorIndexerExample.scala From spark1.52 with Apache License 2.0

5 votes

// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorIndexer
// $example off$
import org.apache.spark.mllib.linalg.Vectors
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}
import org.apache.spark.mllib.util._

    indexedData.show()
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println

Source File: Tokenizer.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  def getPattern: String = $(pattern)

  setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+")

  override protected def createTransformFunc: String => Seq[String] = { str =>
    val re = $(pattern).r
    val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
    val minLength = $(minTokenLength)
    tokens.filter(_.length >= minLength)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, true)

  override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}

Source File: MiscFunctionsSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.codec.digest.DigestUtils

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, StringType, BinaryType}

class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("md5") {
    checkEvaluation(Md5(Literal("ABC".getBytes)), "902fbdd2b1df0c4f70b4a5d23525e932")
    checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "6ac1e56bc78f031059be7be854522c4c")
    checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Md5, BinaryType)
  }

  test("sha1") {
    checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
    checkEvaluation(Sha1(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "5d211bad8f4ee70e16c7d343a838fc344a1ed961")
    checkEvaluation(Sha1(Literal.create(null, BinaryType)), null)
    checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
    checkConsistencyBetweenInterpretedAndCodegen(Sha1, BinaryType)
  }

  test("sha2") {
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal(256)), DigestUtils.sha256Hex("ABC"))
    checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)),
      DigestUtils.sha384Hex(Array[Byte](1, 2, 3, 4, 5, 6)))
    // unsupported bit length
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(1024)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(512)), null)
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal.create(null, IntegerType)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal.create(null, IntegerType)), null)
  }

  test("crc32") {
    checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264L)
    checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      2180413220L)
    checkEvaluation(Crc32(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Crc32, BinaryType)
  }
}

Source File: ColumnPruningSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.expressions.Explode
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Project, LocalRelation, Generate, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.types.StringType

class ColumnPruningSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("Column pruning", FixedPoint(100),
      ColumnPruning) :: Nil
  }

  test("Column pruning for Generate when Generate.join = false") {
    val input = LocalRelation('a.int, 'b.array(StringType))

    val query = Generate(Explode('b), false, false, None, 's.string :: Nil, input).analyze
    val optimized = Optimize.execute(query)

    val correctAnswer =
      Generate(Explode('b), false, false, None, 's.string :: Nil,
        Project('b.attr :: Nil, input)).analyze

    comparePlans(optimized, correctAnswer)
  }
  //生成Generate.join = true时的列修剪
  test("Column pruning for Generate when Generate.join = true") {
    val input = LocalRelation('a.int, 'b.int, 'c.array(StringType))

    val query =
      Project(Seq('a, 's),
        Generate(Explode('c), true, false, None, 's.string :: Nil,
          input)).analyze
    val optimized = Optimize.execute(query)

    val correctAnswer =
      Project(Seq('a, 's),
        Generate(Explode('c), true, false, None, 's.string :: Nil,
          Project(Seq('a, 'c),
            input))).analyze

    comparePlans(optimized, correctAnswer)
  }
  //如果可能,将Generate.join转换为false
  test("Turn Generate.join to false if possible") {
    val input = LocalRelation('b.array(StringType))

    val query =
      Project(('s + 1).as("s+1") :: Nil,
        Generate(Explode('b), true, false, None, 's.string :: Nil,
          input)).analyze
    val optimized = Optimize.execute(query)

    val correctAnswer =
      Project(('s + 1).as("s+1") :: Nil,
        Generate(Explode('b), false, false, None, 's.string :: Nil,
          input)).analyze

    comparePlans(optimized, correctAnswer)
  }

  // todo: add more tests for column pruning
}

Source File: SparkSQLParser.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql

import scala.util.parsing.combinator.RegexParsers

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{DescribeFunction, LogicalPlan, ShowFunctions}
import org.apache.spark.sql.execution._
import org.apache.spark.sql.types.StringType



private[sql] class SparkSQLParser(fallback: String => LogicalPlan) extends AbstractSparkSQLParser {

  // A parser for the key-value part of the "SET [key = [value ]]" syntax
  //用于“SET [key = [value]]”语法的键值部分的解析器
  private object SetCommandParser extends RegexParsers {
    private val key: Parser[String] = "(?m)[^=]+".r

    private val value: Parser[String] = "(?m).*$".r

    private val output: Seq[Attribute] = Seq(AttributeReference("", StringType, nullable = false)())

    private val pair: Parser[LogicalPlan] =
      (key ~ ("=".r ~> value).?).? ^^ {
        case None => SetCommand(None)
        case Some(k ~ v) => SetCommand(Some(k.trim -> v.map(_.trim)))
      }

    def apply(input: String): LogicalPlan = parseAll(pair, input) match {
      case Success(plan, _) => plan
      case x => sys.error(x.toString)
    }
  }

  protected val AS = Keyword("AS")
  protected val CACHE = Keyword("CACHE")
  protected val CLEAR = Keyword("CLEAR")
  protected val DESCRIBE = Keyword("DESCRIBE")
  protected val EXTENDED = Keyword("EXTENDED")
  protected val FUNCTION = Keyword("FUNCTION")
  protected val FUNCTIONS = Keyword("FUNCTIONS")
  protected val IN = Keyword("IN")
  protected val LAZY = Keyword("LAZY")
  protected val SET = Keyword("SET")
  protected val SHOW = Keyword("SHOW")
  protected val TABLE = Keyword("TABLE")
  protected val TABLES = Keyword("TABLES")
  protected val UNCACHE = Keyword("UNCACHE")

  override protected lazy val start: Parser[LogicalPlan] =
    cache | uncache | set | show | desc | others

  private lazy val cache: Parser[LogicalPlan] =
    CACHE ~> LAZY.? ~ (TABLE ~> ident) ~ (AS ~> restInput).? ^^ {
      case isLazy ~ tableName ~ plan =>
        CacheTableCommand(tableName, plan.map(fallback), isLazy.isDefined)
    }

  private lazy val uncache: Parser[LogicalPlan] =
    ( UNCACHE ~ TABLE ~> ident ^^ {
        case tableName => UncacheTableCommand(tableName)
      }
    | CLEAR ~ CACHE ^^^ ClearCacheCommand
    )

  private lazy val set: Parser[LogicalPlan] =
    SET ~> restInput ^^ {
      case input => SetCommandParser(input)
    }

  // It can be the following patterns:
  // SHOW FUNCTIONS;显示函数
  // SHOW FUNCTIONS mydb.func1;
  // SHOW FUNCTIONS func1;
  // SHOW FUNCTIONS `mydb.a`.`func1.aa`;
  private lazy val show: Parser[LogicalPlan] =
    ( SHOW ~> TABLES ~ (IN ~> ident).? ^^ {
        case _ ~ dbName => ShowTablesCommand(dbName)
      }
    | SHOW ~ FUNCTIONS ~> ((ident <~ ".").? ~ (ident | stringLit)).? ^^ {
        case Some(f) => ShowFunctions(f._1, Some(f._2))
        case None => ShowFunctions(None, None)
      }
    )

  private lazy val desc: Parser[LogicalPlan] =
    DESCRIBE ~ FUNCTION ~> EXTENDED.? ~ (ident | stringLit) ^^ {
      case isExtended ~ functionName => DescribeFunction(functionName, isExtended.isDefined)
    }

  private lazy val others: Parser[LogicalPlan] =
    wholeInput ^^ {
      case input => fallback(input)
    }

}

Source File: NullableColumnAccessorSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.GenericMutableRow
import org.apache.spark.sql.types.{StringType, ArrayType, DataType}
//试验可为空的列的访问
class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor
//试验可为空的列的访问
object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    // Skips the column type ID
    buffer.getInt()
    new TestNullableColumnAccessor(buffer, columnType)
  }
}
//空列存取器套件
class NullableColumnAccessorSuite extends SparkFunSuite {
  import ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, DATE, LONG, TIMESTAMP, FLOAT, DOUBLE,
    STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC(ArrayType(StringType)))
    .foreach {
    testNullableColumnAccessor(_)
  }
  //试验可为空的列的访问
  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    //stripSuffix去掉<string>字串中结尾的字符
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)
    //空值
    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }
    //访问空值
    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        builder.appendFrom(randomRow, 0)
        builder.appendFrom(nullRow, 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericMutableRow(1)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.get(0, columnType.dataType) === randomRow.get(0, columnType.dataType))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}

Source File: ListTablesSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{BooleanType, StringType, StructField, StructType}
//列表测试套件
class ListTablesSuite extends QueryTest with BeforeAndAfter with SharedSQLContext {
  import testImplicits._

  private lazy val df = (1 to 10).map(i => (i, s"str$i")).toDF("key", "value")

  before {
    df.registerTempTable("ListTablesSuiteTable")
  }

  after {
    ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable"))
  }

  test("get all tables") {//获得所有的表
      
      
    ctx.tables("DB").show()
    
    checkAnswer(
      //使用数据库,查找表名
      ctx.tables("DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    checkAnswer(
      //使用命令查询表名
      sql("show TABLES in DB").filter("tableName = 'ListTablesSuiteTable'"),
      Row("ListTablesSuiteTable", true))

    ctx.catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    assert(ctx.tables().filter("tableName = 'ListTablesSuiteTable'").count() === 0)
  }

  test("query the returned DataFrame of tables") {//查询返回的数据集的表名
  //StructType代表一张表,StructField代表一个字段
    val expectedSchema = StructType(
      StructField("tableName", StringType, false) ::
      StructField("isTemporary", BooleanType, false) :: Nil)

    Seq(ctx.tables(), sql("SHOW TABLes")).foreach {
      case tableDF =>
        assert(expectedSchema === tableDF.schema)

        tableDF.registerTempTable("tables")
        checkAnswer(
          sql(
            "SELECT isTemporary, tableName from tables WHERE tableName = 'ListTablesSuiteTable'"),
          Row(true, "ListTablesSuiteTable")
        )
        checkAnswer(
          ctx.tables().filter("tableName = 'tables'").select("tableName", "isTemporary"),
          Row("tables", true))
        ctx.dropTempTable("tables")
    }
  }
}

Source File: DDLSourceLoadSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.sql.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
//请注意,这个服务必须改进这项工作的测试目录
//数据库定义语言数据源加载
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {//相同的名字的数据源
    intercept[RuntimeException] {
      caseInsensitiveContext.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {//从格式化别名加载数据源
    caseInsensitiveContext.read.format("gathering quorum").load().schema ==
    //StructType代表一张表,StructField代表一个字段
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {//重复的格式指定完整的类名
    caseInsensitiveContext.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without HiveContext") {//
    intercept[ClassNotFoundException] {
      caseInsensitiveContext.read.format("orc").load()
    }
  }
}

//假数据源之一
class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
//假数据源之二
class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
//假数据源之三
class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }

}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
}

Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map


  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

}

Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }


  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}
object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}

Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}


  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("RegexpReplaceTransformer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement)))
  }
  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType)
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), StringType)
    }
  }

}

object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] {
  override def load(path: String): RegexpReplaceTransformer = super.load(path)
}

Source File: URLElimminator.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}


  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("URLEliminator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), StringType)
    } else {
      schema
    }
  }
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
}

Source File: NameAssigner.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}


class NameAssigner(override val uid: String) extends Transformer with HasInputCols{

  def setInputCols(column: String*) : this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if(x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
}

Source File: RegressionEvaluator.scala From pravda-ml with Apache License 2.0

5 votes

package org.apache.spark.ml.odkl


class RegressionEvaluator(override val uid: String) extends Evaluator[RegressionEvaluator](uid) {

  val throughOrigin = new BooleanParam(this, "throughOrigin",
    "True if the regression is through the origin. For example, in " +
      "linear regression, it will be true without fitting intercept.")

  def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value)

  def getThroughOrigin: Boolean = $(throughOrigin)

  def this() = this(Identifiable.randomUID("regressionEvaluator"))


  override def transform(dataset: Dataset[_]): DataFrame = {

    try {
      val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol))
        .rdd.map { case Row(score: Double, label: Double) => (score, label) }

      val metrics = Try(new RegressionMetrics(predictions))


      val rows = metrics.toOption.map(m => Seq(
        "r2" -> m.r2,
        "rmse" -> m.rootMeanSquaredError,
        "explainedVariance" -> m.explainedVariance,
        "meanAbsoluteError" -> m.meanAbsoluteError,
        "meanSquaredError" -> m.meanSquaredError
      ).map(Row.fromTuple)).getOrElse(Seq())

      SparkSqlUtils.reflectionLock.synchronized(
        dataset.sqlContext.createDataFrame(
          dataset.sparkSession.sparkContext.parallelize(rows, 1), transformSchema(dataset.schema)))
    } catch {
      // Most probably evaluation dataset is empty
      case e: Exception =>
        logWarning("Failed to calculate metrics due to " + e.getMessage)
        SparkSqlUtils.reflectionLock.synchronized(
          dataset.sqlContext.createDataFrame(
            dataset.sparkSession.sparkContext.emptyRDD[Row], transformSchema(dataset.schema)))
    }
  }

  override def copy(extra: ParamMap): RegressionEvaluator = {
    copyValues(new RegressionEvaluator(), extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    new StructType()
      .add("metric", StringType, nullable = false)
      .add("value", DoubleType, nullable = false)
  }
}

Source File: EWStatsTransformerSpec.scala From pravda-ml with Apache License 2.0

5 votes

package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.EWStatsTransformer
import org.apache.spark.ml.odkl.texts.EWStatsTransformer.EWStruct
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}
import org.scalatest.FlatSpec




class EWStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {

  import sqlc.implicits._

  case class dummyCase(Term: String, sig: Double, ewma: Double, ewmvar: Double)

  case class ewStruct(sig: Double, ewma: Double, ewmvar: Double) extends Serializable

  "CorrectEWFreqStatsTransformer" should "count existing and non-existing today words" in {

    val oldData = Seq(Seq("a", 0.0, 0.1, 0.01), Seq("b", 0.0, 0.2, 0.02), Seq("c", 0.0, 0.3, 0.015))

    val oldDF =
      sqlc.createDataFrame(sc.parallelize(oldData).map(f => {
        Row.fromSeq(f)
      }), new StructType().add("term", StringType)
        .add("sig", DoubleType).add("ewma", DoubleType).add("ewmvar", DoubleType))
    val rddRes = oldDF.rdd.
      map { case Row(term, sig, ewma, ewmvar) => Row(term, Row(sig, ewma, ewmvar)) }

    val schemaRes = StructType(
      StructField("term", StringType, false) ::
        StructField("ewStruct", StructType(
          StructField("sig", DoubleType, false) ::
          StructField("ewma", DoubleType, false) ::
          StructField("ewmvar", DoubleType, false) :: Nil
        ), true) :: Nil
    )
    val modernOldDF = sqlc.createDataFrame(rddRes, schemaRes)
      .withColumnRenamed("ewStruct", "old_EWStruct").withColumnRenamed("term", "old_Term")

    oldDF.collect()
    val fTransformer =
      new EWStatsTransformer()
        .setAlpha(0.7)
        .setBeta(0.055)
        .setInputFreqColName("Freq")
        .setInputTermColName("Term")
        .setOldEWStructColName("old_EWStruct")
        .setNewEWStructColName("EWStruct")
        .setOldTermColName("old_Term")
    val schema = new StructType().add("Term", StringType).add("Freq", DoubleType)

    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(("a", 0.2), ("b", 0.1), ("d", 0.05)))
        .map(f => {
          Row.fromSeq(Seq(f._1, f._2))
        }), schema)
    val joined = inDF.join(modernOldDF, $"Term" === $"old_Term", "outer")
    val outDF = fTransformer.transform(joined)
    val ans: Array[Row] = outDF.sort("Term").collect()
    assertResult(4)(ans.size)
  }

  "CorrectEWStatsTransformer" should "count EWStats correct" in {

    val mathTransformFun: (String, Double, Double, Double) => EWStruct = EWStatsTransformer.termEWStatsComputing(_:String,_:Double,_:Double,_:Double,0.7,0.005)
    val input = ("test", 0.01, 0.006, 0.003)
    val expected = (0.0669, 0.0088, 0.0009)
    val real = mathTransformFun(input._1, input._2, input._3, input._4)
    val realRounded = (Math.round(real.sig * 10000D) / 10000D, Math.round(real.ewma * 10000D) / 10000D, Math.round(real.ewmvar * 10000D) / 10000D)
    assertResult(expected)(realRounded)
  }
}

Source File: NGramExtractorSpec.scala From pravda-ml with Apache License 2.0

5 votes

package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.NGramExtractor
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}
import org.scalatest.FlatSpec


class NGramExtractorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {

  "NGramExtractor" should "extract NGrams upTo=true" in {
    val nGramExtractor =
      new NGramExtractor()
        .setUpperN(2)
        .setInputCol("textTokenized")
        .setOutputCol("nGram")

    val schema = new StructType().add("textTokenized",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("ab","bc","cd"),Seq[String]("a","b")))
        .map(f => {Row(f)}), schema)

    val outDF = nGramExtractor.transform(inDF)

    val outArrays = outDF.collect().map(_.getAs[Seq[String]]("nGram")).toSeq

    val correctArrays = Seq(Seq("ab","bc","cd","ab bc","bc cd"),Seq("a","b", "a b"))
    assertResult(correctArrays)(outArrays)
  }

  "NGramExtractor" should "extract NGrams upTo=false" in {
    val nGramExtractor =
      new NGramExtractor()
        .setUpperN(3)
        .setLowerN(3)
        .setInputCol("textTokenized")
        .setOutputCol("nGram")

    val schema = new StructType().add("textTokenized",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c","d")).map(f => {Row(f)})),
      schema)

    val outDF = nGramExtractor.transform(inDF)

    val outArrays = outDF.collect().map(_.getAs[Seq[String]]("nGram")).toSeq

    val correctArrays = Seq(Seq("a b c", "b c d"))
    assertResult(correctArrays)(outArrays)
  }
  "NGramExtractor" should "extract NGrams with the same col" in {
    val nGramExtractor =
      new NGramExtractor()
        .setUpperN(3)
        .setLowerN(3)
        .setInputCol("textTokenized")
        .setOutputCol("textTokenized")

    val schema = new StructType().add("textTokenized",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c","d")).map(f => {Row(f)})),
      schema)

    val outDF = nGramExtractor.transform(inDF)

    val outArrays = outDF.collect().map(_.getAs[Seq[String]]("textTokenized")).toSeq

    val correctArrays = Seq(Seq("a b c", "b c d"))
    assertResult(correctArrays)(outArrays)
  }

}

Source File: HashBasedDeduplicatorSpec.scala From pravda-ml with Apache License 2.0

5 votes

package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.ml.odkl.MatrixUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructType}
import org.scalatest.FlatSpec

class HashBasedDeduplicatorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {
  "cotrect HashBasedDeduplicator " should " remove similar vectors based on hash " in {

    val vectorsSize = 10000

    val vector1 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 1.0)), 1L, "vector1")
    val vector2 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 0.0)), 1L, "vector2")
    val vector3 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 0.0, 1.0)), 2L, "vector3") //pretty similar, but in 2nd bucket
    val vector4 = (Vectors.sparse(vectorsSize, Array(1, 2), Array(1.0, 1.0)), 1L, "vector4") //completly another but in 1-st bucket

    val schema = new StructType()
      .add("vector", MatrixUtils.vectorUDT)
      .add("hash", LongType)
      .add("alias", StringType)

    val dataFrame = sqlc.createDataFrame(sc.parallelize(Seq(vector1, vector2, vector3, vector4).map(Row.fromTuple(_))), schema)
    val deduplicator = new HashBasedDeduplicator()
      .setInputColHash("hash")
      .setInputColVector("vector")
      .setSimilarityTreshold(0.80)

   val answer = deduplicator.transform(dataFrame)
        .collect().map(row => (row.getLong(1), row.getString(2)))

    assert(answer.exists(_._2 == "vector1")) //should stay
    assert(!answer.exists(_._2 == "vector2")) //should be removed
    assert(answer.exists(_._2 == "vector3")) //should stay cause in other bucket (FalseNegative)
    assert(answer.exists(_._2 == "vector4")) //should stay cause different (FalsePositive)
  }
}

Source File: FreqStatsTransformerSpec.scala From pravda-ml with Apache License 2.0

5 votes

package odkl.analysis.spark.texts

import odkl.analysis.spark.TestEnv
import org.apache.spark.ml.odkl.texts.FreqStatsTransformer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructType}
import org.scalatest.FlatSpec


class FreqStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers {

  "FreqStatsTransformer" should "count freq" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")

    val schema = new StructType().add("data",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c"),Seq[String]("a","b","a")))
        .map(f => {Row(f)}), schema)

    val correctAns = Array[(String,Double)](("a",2D/5D),("b",2D/5D),("c",1D/5D))
    val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))})
    assertResult(correctAns)(realAns)

  }
  "FreqStatsTransformer" should "filter freq by uni and bi treshold" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")
      .setTresholdArr(Array[Double](1.5D/8D,1.1D/8D))

    val schema = new StructType().add("data",ArrayType(StringType,true))
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq[String]("a","b","c","c a", "c a"),Seq[String]("a","b","a", "c a", "a b")))
        .map(f => {Row(f)}), schema)

    val correctAns = Array[(String,Double)](("a",2D/8D),("b",2D/8D),("c a",2D/8D))
    val realAnsDF = fTransformer.transform(inDF).sort("Term")
      val realAns = realAnsDF.collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))})
    assertResult(correctAns)(realAns)

  }

  "FreqStatsTransformer" should "extract max timestamp by term" in {
    val fTransformer =  new FreqStatsTransformer()
      .setInputDataCol("data")
      .setOutputColFreq("Freq")
      .setOutputColTerm("Term")
        .setWithTimestamp(true)
        .setTimestampColumnName("timestamp")
      .setTresholdArr(Array[Double](1D/8D,1.1D/8D))

    val schema =
      new StructType().add("data",ArrayType(StringType,true)).add("timestamp",LongType)
    val inDF = sqlc.createDataFrame(
      sc.parallelize(Seq(Seq(Seq[String]("a","c","c a", "c a"),100L),Seq(Seq[String]("c a", "a b"),150L),Seq(Seq[String]("b"),200L)))
        .map(f => {Row.fromSeq(f)}), schema)

    inDF.collect()
    val correctAns = Array[(String,Double,Long)](("a",1D/6D,100L),("a b",1D/6D, 150L),("b",1D/6D,200L),
      ("c",1D/6D, 100L),("c a",2D/6D, 150L))
    val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"),f.getAs[Long]("timestamp"))})
    assertResult(correctAns)(realAns)
    assertResult(correctAns(1))(realAns(1))

  }
}

Source File: CustomSchemaTest.scala From spark-sftp with Apache License 2.0

5 votes

package com.springml.spark.sftp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _}
import org.scalatest.{BeforeAndAfterEach, FunSuite}


class CustomSchemaTest extends FunSuite with BeforeAndAfterEach {
  var ss: SparkSession = _

  val csvTypesMap = Map("ProposalId" -> IntegerType,
    "OpportunityId" -> StringType,
    "Clicks" -> LongType,
    "Impressions" -> LongType
  )

  val jsonTypesMap = Map("name" -> StringType,
    "age" -> IntegerType
  )

  override def beforeEach() {
    ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate()
  }

  private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = {
    val expectedType = typeMap(field.name)
    assert(expectedType == field.dataType)
  }

  private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = {
    val columns = typeMap.map(x => new StructField(x._1, x._2, true))

    val columnStruct = Array[StructField] ()
    columns.copyToArray(columnStruct)

    columnStruct
  }

  test ("Read CSV with custom schema") {
    val columnStruct = columnArray(csvTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/sample.csv").getPath
    val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap))
  }

  test ("Read Json with custom schema") {
    val columnStruct = columnArray(jsonTypesMap)
    val expectedSchema = StructType(columnStruct)

    val fileLocation = getClass.getResource("/people.json").getPath
    val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext)
    val rdd = dsr.buildScan()

    assert(dsr.schema.fields.length == columnStruct.length)
    dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap))
  }

}

Source File: NGram.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  @Since("1.5.0")
  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}

Source File: KafkaWriter.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
import org.apache.spark.sql.types.{BinaryType, StringType}
import org.apache.spark.util.Utils


private[kafka010] object KafkaWriter extends Logging {
  val TOPIC_ATTRIBUTE_NAME: String = "topic"
  val KEY_ATTRIBUTE_NAME: String = "key"
  val VALUE_ATTRIBUTE_NAME: String = "value"

  override def toString: String = "KafkaWriter"

  def validateQuery(
      schema: Seq[Attribute],
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse(
      if (topic.isEmpty) {
        throw new AnalysisException(s"topic option required when no " +
          s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " +
          s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.")
      } else {
        Literal(topic.get, StringType)
      }
    ).dataType match {
      case StringType => // good
      case _ =>
        throw new AnalysisException(s"Topic type must be a String")
    }
    schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse(
      Literal(null, StringType)
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
    schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse(
      throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found")
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, kafkaParameters, topic)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close())
    }
  }
}

Source File: SparkExecuteStatementOperationSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.hive.thriftserver

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, NullType, StringType, StructField, StructType}

class SparkExecuteStatementOperationSuite extends SparkFunSuite {
  test("SPARK-17112 `select null` via JDBC triggers IllegalArgumentException in ThriftServer") {
    val field1 = StructField("NULL", NullType)
    val field2 = StructField("(IF(true, NULL, NULL))", NullType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.NULL_TYPE)
  }

  test("SPARK-20146 Comment should be preserved") {
    val field1 = StructField("column1", StringType).withComment("comment 1")
    val field2 = StructField("column2", IntegerType)
    val tableSchema = StructType(Seq(field1, field2))
    val columns = SparkExecuteStatementOperation.getTableSchema(tableSchema).getColumnDescriptors()
    assert(columns.size() == 2)
    assert(columns.get(0).getType() == org.apache.hive.service.cli.Type.STRING_TYPE)
    assert(columns.get(0).getComment() == "comment 1")
    assert(columns.get(1).getType() == org.apache.hive.service.cli.Type.INT_TYPE)
    assert(columns.get(1).getComment() == "")
  }
}

Source File: StatsEstimationTestBase.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.statsEstimation

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{ColumnStat, LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{IntegerType, StringType}


trait StatsEstimationTestBase extends SparkFunSuite {

  var originalValue: Boolean = false

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Enable stats estimation based on CBO.
    originalValue = SQLConf.get.getConf(SQLConf.CBO_ENABLED)
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, true)
  }

  override def afterAll(): Unit = {
    SQLConf.get.setConf(SQLConf.CBO_ENABLED, originalValue)
    super.afterAll()
  }

  def getColSize(attribute: Attribute, colStat: ColumnStat): Long = attribute.dataType match {
    // For UTF8String: base + offset + numBytes
    case StringType => colStat.avgLen + 8 + 4
    case _ => colStat.avgLen
  }

  def attr(colName: String): AttributeReference = AttributeReference(colName, IntegerType)()

  
case class StatsTestPlan(
    outputList: Seq[Attribute],
    rowCount: BigInt,
    attributeStats: AttributeMap[ColumnStat],
    size: Option[BigInt] = None) extends LeafNode {
  override def output: Seq[Attribute] = outputList
  override def computeStats(): Statistics = Statistics(
    // If sizeInBytes is useless in testing, we just use a fake value
    sizeInBytes = size.getOrElse(Int.MaxValue),
    rowCount = Some(rowCount),
    attributeStats = attributeStats)
}

Source File: ScalaUDFSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import java.util.Locale

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
import org.apache.spark.sql.types.{IntegerType, StringType}

class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("basic") {
    val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil)
    checkEvaluation(intUdf, 2)

    val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil)
    checkEvaluation(stringUdf, "ax")
  }

  test("better error message for NPE") {
    val udf = ScalaUDF(
      (s: String) => s.toLowerCase(Locale.ROOT),
      StringType,
      Literal.create(null, StringType) :: Nil)

    val e1 = intercept[SparkException](udf.eval())
    assert(e1.getMessage.contains("Failed to execute user defined function"))

    val e2 = intercept[SparkException] {
      checkEvalutionWithUnsafeProjection(udf, null)
    }
    assert(e2.getMessage.contains("Failed to execute user defined function"))
  }

  test("SPARK-22695: ScalaUDF should not use global variables") {
    val ctx = new CodegenContext
    ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx)
    assert(ctx.inlinedMutableStates.isEmpty)
  }
}

Source File: GenerateUnsafeProjectionSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.BoundReference
import org.apache.spark.sql.catalyst.util.{ArrayData, MapData}
import org.apache.spark.sql.types.{DataType, Decimal, StringType, StructType}
import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String}

class GenerateUnsafeProjectionSuite extends SparkFunSuite {
  test("Test unsafe projection string access pattern") {
    val dataType = (new StructType).add("a", StringType)
    val exprs = BoundReference(0, dataType, nullable = true) :: Nil
    val projection = GenerateUnsafeProjection.generate(exprs)
    val result = projection.apply(InternalRow(AlwaysNull))
    assert(!result.isNullAt(0))
    assert(result.getStruct(0, 1).isNullAt(0))
  }
}

object AlwaysNull extends InternalRow {
  override def numFields: Int = 1
  override def setNullAt(i: Int): Unit = {}
  override def copy(): InternalRow = this
  override def anyNull: Boolean = true
  override def isNullAt(ordinal: Int): Boolean = true
  override def update(i: Int, value: Any): Unit = notSupported
  override def getBoolean(ordinal: Int): Boolean = notSupported
  override def getByte(ordinal: Int): Byte = notSupported
  override def getShort(ordinal: Int): Short = notSupported
  override def getInt(ordinal: Int): Int = notSupported
  override def getLong(ordinal: Int): Long = notSupported
  override def getFloat(ordinal: Int): Float = notSupported
  override def getDouble(ordinal: Int): Double = notSupported
  override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = notSupported
  override def getUTF8String(ordinal: Int): UTF8String = notSupported
  override def getBinary(ordinal: Int): Array[Byte] = notSupported
  override def getInterval(ordinal: Int): CalendarInterval = notSupported
  override def getStruct(ordinal: Int, numFields: Int): InternalRow = notSupported
  override def getArray(ordinal: Int): ArrayData = notSupported
  override def getMap(ordinal: Int): MapData = notSupported
  override def get(ordinal: Int, dataType: DataType): AnyRef = notSupported
  private def notSupported: Nothing = throw new UnsupportedOperationException
}

Source File: ComplexDataSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.util

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow, SpecificInternalRow, UnsafeMapData, UnsafeProjection}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class ComplexDataSuite extends SparkFunSuite {
  def utf8(str: String): UTF8String = UTF8String.fromString(str)

  test("inequality tests for MapData") {
    // test data
    val testMap1 = Map(utf8("key1") -> 1)
    val testMap2 = Map(utf8("key1") -> 1, utf8("key2") -> 2)
    val testMap3 = Map(utf8("key1") -> 1)
    val testMap4 = Map(utf8("key1") -> 1, utf8("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }

  test("GenericInternalRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))

    val genericRow = new GenericInternalRow(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericRow = genericRow.copy()
    assert(copiedGenericRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedGenericRow.getString(0) == "a")
  }

  test("SpecificMutableRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))

    val mutableRow = new SpecificInternalRow(Seq(StringType))
    mutableRow(0) = unsafeRow.getUTF8String(0)
    val copiedMutableRow = mutableRow.copy()
    assert(copiedMutableRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedMutableRow.getString(0) == "a")
  }

  test("GenericArrayData.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))

    val genericArray = new GenericArrayData(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericArray = genericArray.copy()
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied array data should not be changed externally.
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
  }

  test("copy on nested complex type") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))

    val arrayOfRow = new GenericArrayData(Array[Any](InternalRow(unsafeRow.getUTF8String(0))))
    val copied = arrayOfRow.copy()
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied data should not be changed externally.
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
  }
}

Source File: LikeSimplificationSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{BooleanType, StringType}

class LikeSimplificationSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Like Simplification", Once,
        LikeSimplification) :: Nil
  }

  val testRelation = LocalRelation('a.string)

  test("simplify Like into StartsWith") {
    val originalQuery =
      testRelation
        .where(('a like "abc%") || ('a like "abc\\%"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(StartsWith('a, "abc") || ('a like "abc\\%"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into EndsWith") {
    val originalQuery =
      testRelation
        .where('a like "%xyz")

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(EndsWith('a, "xyz"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into startsWith and EndsWith") {
    val originalQuery =
      testRelation
        .where(('a like "abc\\%def") || ('a like "abc%def"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(('a like "abc\\%def") ||
        (Length('a) >= 6 && (StartsWith('a, "abc") && EndsWith('a, "def"))))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into Contains") {
    val originalQuery =
      testRelation
        .where(('a like "%mn%") || ('a like "%mn\\%"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(Contains('a, "mn") || ('a like "%mn\\%"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("simplify Like into EqualTo") {
    val originalQuery =
      testRelation
        .where(('a like "") || ('a like "abc"))

    val optimized = Optimize.execute(originalQuery.analyze)
    val correctAnswer = testRelation
      .where(('a === "") || ('a === "abc"))
      .analyze

    comparePlans(optimized, correctAnswer)
  }

  test("null pattern") {
    val originalQuery = testRelation.where('a like Literal(null, StringType)).analyze
    val optimized = Optimize.execute(originalQuery)
    comparePlans(optimized, testRelation.where(Literal(null, BooleanType)).analyze)
  }
}

Source File: RewriteDistinctAggregatesSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.analysis.{Analyzer, EmptyFunctionRegistry}
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.catalyst.expressions.aggregate.CollectSet
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.internal.SQLConf.{CASE_SENSITIVE, GROUP_BY_ORDINAL}
import org.apache.spark.sql.types.{IntegerType, StringType}

class RewriteDistinctAggregatesSuite extends PlanTest {
  override val conf = new SQLConf().copy(CASE_SENSITIVE -> false, GROUP_BY_ORDINAL -> false)
  val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
  val analyzer = new Analyzer(catalog, conf)

  val nullInt = Literal(null, IntegerType)
  val nullString = Literal(null, StringType)
  val testRelation = LocalRelation('a.string, 'b.string, 'c.string, 'd.string, 'e.int)

  private def checkRewrite(rewrite: LogicalPlan): Unit = rewrite match {
    case Aggregate(_, _, Aggregate(_, _, _: Expand)) =>
    case _ => fail(s"Plan is not rewritten:\n$rewrite")
  }

  test("single distinct group") {
    val input = testRelation
      .groupBy('a)(countDistinct('e))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("single distinct group with partial aggregates") {
    val input = testRelation
      .groupBy('a, 'd)(
        countDistinct('e, 'c).as('agg1),
        max('b).as('agg2))
      .analyze
    val rewrite = RewriteDistinctAggregates(input)
    comparePlans(input, rewrite)
  }

  test("multiple distinct groups") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with partial aggregates") {
    val input = testRelation
      .groupBy('a)(countDistinct('b, 'c), countDistinct('d), sum('e))
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }

  test("multiple distinct groups with non-partial aggregates") {
    val input = testRelation
      .groupBy('a)(
        countDistinct('b, 'c),
        countDistinct('d),
        CollectSet('b).toAggregateExpression())
      .analyze
    checkRewrite(RewriteDistinctAggregates(input))
  }
}

Source File: resources.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.execution.command

import java.io.File
import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}


case class ListJarsCommand(jars: Seq[String] = Seq.empty[String]) extends RunnableCommand {
  override val output: Seq[Attribute] = {
    AttributeReference("Results", StringType, nullable = false)() :: Nil
  }
  override def run(sparkSession: SparkSession): Seq[Row] = {
    val jarList = sparkSession.sparkContext.listJars()
    if (jars.nonEmpty) {
      for {
        jarName <- jars.map(f => new Path(f).getName)
        jarPath <- jarList if jarPath.contains(jarName)
      } yield Row(jarPath)
    } else {
      jarList.map(Row(_))
    }
  }
}

Source File: GroupedIteratorSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType}

class GroupedIteratorSuite extends SparkFunSuite {

  test("basic") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 1)
        key.getInt(0) -> data.map(encoder.fromRow).toSeq
    }.toSeq

    assert(result ==
      1 -> Seq(input(0), input(1)) ::
      2 -> Seq(input(2)) :: Nil)
  }

  test("group by 2 columns") {
    val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()

    val input = Seq(
      Row(1, 2L, "a"),
      Row(1, 2L, "b"),
      Row(1, 3L, "c"),
      Row(2, 1L, "d"),
      Row(3, 2L, "e"))

    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes)

    val result = grouped.map {
      case (key, data) =>
        assert(key.numFields == 2)
        (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq)
    }.toSeq

    assert(result ==
      (1, 2L, Seq(input(0), input(1))) ::
      (1, 3L, Seq(input(2))) ::
      (2, 1L, Seq(input(3))) ::
      (3, 2L, Seq(input(4))) :: Nil)
  }

  test("do nothing to the value iterator") {
    val schema = new StructType().add("i", IntegerType).add("s", StringType)
    val encoder = RowEncoder(schema).resolveAndBind()
    val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c"))
    val grouped = GroupedIterator(input.iterator.map(encoder.toRow),
      Seq('i.int.at(0)), schema.toAttributes)

    assert(grouped.length == 2)
  }
}

Source File: SpreadsheetRelation.scala From mimir with Apache License 2.0

5 votes

package mimir.exec.spark.datasource.google.spreadsheet

import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService.SparkSpreadsheetContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
                                                  context:SparkSpreadsheetContext,
                                                  spreadsheetName: String,
                                                  worksheetName: String,
                                                  userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService._

  private val fieldMap = scala.collection.mutable.Map[String, String]()
  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheet] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    val schemaMap = fieldMap.toMap
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while(index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else if (schemaMap.contains(field.name) && m.contains(schemaMap(field.name))) {
            TypeCast.castTo(m(schemaMap(field.name)), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if(!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) =>
        w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) =>
        throw e
    }
  }

  def sanitizeColumnName(name: String): String =
  {
    name
      .replaceAll("[^a-zA-Z0-9]+", "_")    // Replace sequences of non-alphanumeric characters with underscores
      .replaceAll("_+$", "")               // Strip trailing underscores
      .replaceAll("^[0-9_]+", "")          // Strip leading underscores and digits
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName => {
      val sanitizedName = sanitizeColumnName(fieldName)
      fieldMap.put(sanitizedName, fieldName)
      StructField(sanitizedName, StringType, true)
    }})

}

Source File: JsonGroupArray.scala From mimir with Apache License 2.0

5 votes

package mimir.exec.spark.udf

import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{ DataType, StringType }
import org.apache.spark.sql.catalyst.expressions.{ 
  AttributeReference, 
  If, 
  StartsWith, 
  Literal, 
  IsNull, 
  Concat, 
  Substring
}

case class JsonGroupArray(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate {
  override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil
  override def nullable: Boolean = false
  // Return data type.
  override def dataType: DataType = StringType
  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function json_group_array")
  private lazy val json_group_array = AttributeReference("json_group_array", StringType)()
  override lazy val aggBufferAttributes: Seq[AttributeReference] = json_group_array :: Nil
  override lazy val initialValues: Seq[Literal] = Seq(
    Literal.create("", StringType)
  )
  override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq(
    If(IsNull(child),
      Concat(Seq(json_group_array, Literal(","), Literal("null"))),
      Concat(Seq(json_group_array, Literal(","), org.apache.spark.sql.catalyst.expressions.Cast(child,StringType,None)))) 
  )
  override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = {
    Seq(
      Concat(Seq(json_group_array.left, json_group_array.right))
    )
  }
  override lazy val evaluateExpression = Concat(Seq(Literal("["), If(StartsWith(json_group_array,Literal(",")),Substring(json_group_array,Literal(2),Literal(Integer.MAX_VALUE)),json_group_array), Literal("]")))
}

Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField(
    featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }


  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}

Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert ("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs =
      new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row =
      new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }


  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))

    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))

    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}

Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0

5 votes

package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {
  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {

    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert ("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
        StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema),
        labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}

Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0

5 votes

package org.apache.spark.sql.kinesis

import java.nio.ByteBuffer

import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult}
import com.google.common.util.concurrent.{FutureCallback, Futures}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection}
import org.apache.spark.sql.types.{BinaryType, StringType}

private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String],
                                        inputSchema: Seq[Attribute]) extends Logging {

  private var producer: KinesisProducer = _
  private val projection = createProjection
  private val streamName = producerConfiguration.getOrElse(
    KinesisSourceProvider.SINK_STREAM_NAME_KEY, "")

  def execute(iterator: Iterator[InternalRow]): Unit = {
    producer = CachedKinesisProducer.getOrCreate(producerConfiguration)
    while (iterator.hasNext) {
      val currentRow = iterator.next()
      val projectedRow = projection(currentRow)
      val partitionKey = projectedRow.getString(0)
      val data = projectedRow.getBinary(1)

      sendData(partitionKey, data)
    }
  }

  def sendData(partitionKey: String, data: Array[Byte]): String = {
    var sentSeqNumbers = new String

    val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data))

    val kinesisCallBack = new FutureCallback[UserRecordResult]() {

      override def onFailure(t: Throwable): Unit = {
        logError(s"Writing to  $streamName failed due to ${t.getCause}")
      }

      override def onSuccess(result: UserRecordResult): Unit = {
        val shardId = result.getShardId
        sentSeqNumbers = result.getSequenceNumber
      }
    }
    Futures.addCallback(future, kinesisCallBack)

    producer.flushSync()
    sentSeqNumbers
  }

  def close(): Unit = {
    if (producer != null) {
      producer.flush()
      producer = null
    }
  }

  private def createProjection: UnsafeProjection = {

    val partitionKeyExpression = inputSchema
      .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse(
      throw new IllegalStateException("Required attribute " +
        s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found"))

    partitionKeyExpression.dataType match {
      case StringType | BinaryType => // ok
      case t =>
        throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " +
          "attribute type must be a String or BinaryType")
    }

    val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse(
      throw new IllegalStateException("Required attribute " +
        s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found")
    )

    dataExpression.dataType match {
      case StringType | BinaryType => // ok
      case t =>
        throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " +
          "attribute type must be a String or BinaryType")
    }

    UnsafeProjection.create(
      Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema)
  }

}

Source File: NGram.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}


  def getN: Int = $(n)

  setDefault(n -> 2)

  override protected def createTransformFunc: Seq[String] => Seq[String] = {
    _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.sameType(ArrayType(StringType)),
      s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override protected def outputDataType: DataType = new ArrayType(StringType, false)
}

@Since("1.6.0")
object NGram extends DefaultParamsReadable[NGram] {

  @Since("1.6.0")
  override def load(path: String): NGram = super.load(path)
}

Source File: MiscFunctionsSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.codec.digest.DigestUtils

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, StringType, BinaryType}

class MiscFunctionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("md5") {
    checkEvaluation(Md5(Literal("ABC".getBytes)), "902fbdd2b1df0c4f70b4a5d23525e932")
    checkEvaluation(Md5(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "6ac1e56bc78f031059be7be854522c4c")
    checkEvaluation(Md5(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Md5, BinaryType)
  }

  test("sha1") {
    checkEvaluation(Sha1(Literal("ABC".getBytes)), "3c01bdbb26f358bab27f267924aa2c9a03fcfdb8")
    checkEvaluation(Sha1(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      "5d211bad8f4ee70e16c7d343a838fc344a1ed961")
    checkEvaluation(Sha1(Literal.create(null, BinaryType)), null)
    checkEvaluation(Sha1(Literal("".getBytes)), "da39a3ee5e6b4b0d3255bfef95601890afd80709")
    checkConsistencyBetweenInterpretedAndCodegen(Sha1, BinaryType)
  }

  test("sha2") {
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal(256)), DigestUtils.sha256Hex("ABC"))
    checkEvaluation(Sha2(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType), Literal(384)),
      DigestUtils.sha384Hex(Array[Byte](1, 2, 3, 4, 5, 6)))
    // unsupported bit length
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(1024)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal(512)), null)
    checkEvaluation(Sha2(Literal("ABC".getBytes), Literal.create(null, IntegerType)), null)
    checkEvaluation(Sha2(Literal.create(null, BinaryType), Literal.create(null, IntegerType)), null)
  }

  test("crc32") {
    checkEvaluation(Crc32(Literal("ABC".getBytes)), 2743272264L)
    checkEvaluation(Crc32(Literal.create(Array[Byte](1, 2, 3, 4, 5, 6), BinaryType)),
      2180413220L)
    checkEvaluation(Crc32(Literal.create(null, BinaryType)), null)
    checkConsistencyBetweenInterpretedAndCodegen(Crc32, BinaryType)
  }
}

org.apache.spark.sql.types.StringType Scala Examples