org.apache.spark.sql.hive.test.TestHive Scala Examples
The following examples show how to use org.apache.spark.sql.hive.test.TestHive.
You can go to the original project or source file by following the links above each example.
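Before the suites below, here is a minimal, hypothetical sketch of the typical usage pattern they share. It is not taken from any of the listed projects; it only assumes the spark-hive test artifact (which provides TestHive and its bundled sample table `src`) is on the test classpath, as it is for every example on this page.

// Minimal sketch (assumption: the spark-hive test jar is on the test classpath).
// TestHive exposes a Hive-enabled context backed by a temporary metastore/warehouse,
// so a suite can run HiveQL against the bundled sample table `src` without extra setup.
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._  // brings sql(...), table(...) and implicits into scope

class TestHiveSmokeSuite extends org.scalatest.FunSuite {
  test("query the bundled src table") {
    // Referencing `src` triggers TestHive to load the small kv1.txt test data set.
    val rows = TestHive.sql("SELECT key, value FROM src LIMIT 3").collect()
    assert(rows.length <= 3)
  }
}

The examples that follow use exactly this pattern, differing mainly in which Spark fork they come from and which feature of the Hive integration they exercise.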
Example 1
Source File: HiveTypeCoercionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.Project
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq("1", "1.0", "1L", "1S", "1Y", "'1'")

  baseTypes.foreach { i =>
    baseTypes.foreach { j =>
      createQueryTest(s"$i + $j", s"SELECT $i + $j FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { i =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $i else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $i end FROM src limit 1")
  }

  // A boolean cast applied to an already-boolean value should be removed
  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.executedPlan.collect {
      case e: Project => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 2
Source File: HiveTableScanSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.apache.spark.util.Utils

class HiveTableScanSuite extends HiveComparisonTest {

  createQueryTest("partition_based_table_scan_with_different_serde",
    """
      |CREATE TABLE part_scan_test (key STRING, value STRING) PARTITIONED BY (ds STRING)
      |ROW FORMAT SERDE
      |'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
      |STORED AS RCFILE;
      |
      |FROM src
      |INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-01')
      |SELECT 100,100 LIMIT 1;
      |
      |ALTER TABLE part_scan_test SET SERDE
      |'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe';
      |
      |FROM src INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-02')
      |SELECT 200,200 LIMIT 1;
      |
      |SELECT * from part_scan_test;
    """.stripMargin)

  // In unit test, kv1.txt is a small file and will be loaded as table src
  // Since the small file will be considered as a single split, we assume
  // Hive / SparkSQL HQL has the same output even for SORT BY
  createQueryTest("file_split_for_small_table",
    """
      |SELECT key, value FROM src SORT BY key, value
    """.stripMargin)

  test("Spark-4041: lowercase issue") {
    TestHive.sql("CREATE TABLE tb (KEY INT, VALUE STRING) STORED AS ORC")
    TestHive.sql("insert into table tb select key, value from src")
    TestHive.sql("select KEY from tb where VALUE='just_for_test' limit 5").collect()
    TestHive.sql("drop table tb")
  }

  test("Spark-4077: timestamp query for null value") {
    TestHive.sql("DROP TABLE IF EXISTS timestamp_query_null")
    TestHive.sql(
      """
        CREATE EXTERNAL TABLE timestamp_query_null (time TIMESTAMP,id INT)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        LINES TERMINATED BY '\n'
      """.stripMargin)
    val location =
      Utils.getSparkClassLoader.getResource("data/files/issue-4077-data.txt").getFile()

    TestHive.sql(s"LOAD DATA LOCAL INPATH '$location' INTO TABLE timestamp_query_null")
    assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect()
      === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")), Row(null)))
    TestHive.sql("DROP TABLE timestamp_query_null")
  }

  test("Spark-4959 Attributes are case sensitive when using a select query from a projection") {
    sql("create table spark_4959 (col1 string)")
    sql("""insert into table spark_4959 select "hi" from src limit 1""")
    table("spark_4959").select(
      'col1.as("CaseSensitiveColName"),
      'col1.as("CaseSensitiveColName2")).registerTempTable("spark_4959_2")

    assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi"))
    assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi"))
  }
}
Example 3
Source File: HiveSerDeSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.cacheTables = false
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 4
Source File: HiveTypeCoercionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.Project
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq("1", "1.0", "1L", "1S", "1Y", "'1'")

  baseTypes.foreach { i =>
    baseTypes.foreach { j =>
      createQueryTest(s"$i + $j", s"SELECT $i + $j FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { i =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $i else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $i end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.executedPlan.collect {
      case e: Project => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }

  test("COALESCE with different types") {
    intercept[RuntimeException] {
      TestHive.sql("""SELECT COALESCE(1, true, "abc") FROM src limit 1""").collect()
    }
  }
}
Example 5
Source File: ListTablesSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row

class ListTablesSuite extends QueryTest with BeforeAndAfterAll {

  import org.apache.spark.sql.hive.test.TestHive.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    // The catalog in HiveContext is a case insensitive one.
    catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan)
    catalog.registerTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"), df.logicalPlan)
    sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)")
    sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB")
    sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)")
  }

  override def afterAll(): Unit = {
    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    catalog.unregisterTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"))
    sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable")
    sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable")
    sql("DROP DATABASE IF EXISTS ListTablesSuiteDB")
  }

  test("get all tables of current database") {
    Seq(tables(), sql("SHOW TABLes")).foreach {
      case allTables =>
        // We are using default DB.
        checkAnswer(
          allTables.filter("tableName = 'listtablessuitetable'"),
          Row("listtablessuitetable", true))
        assert(allTables.filter("tableName = 'indblisttablessuitetable'").count() === 0)
        checkAnswer(
          allTables.filter("tableName = 'hivelisttablessuitetable'"),
          Row("hivelisttablessuitetable", false))
        assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0)
    }
  }

  test("getting all tables with a database name") {
    Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb")).foreach {
      case allTables =>
        checkAnswer(
          allTables.filter("tableName = 'listtablessuitetable'"),
          Row("listtablessuitetable", true))
        checkAnswer(
          allTables.filter("tableName = 'indblisttablessuitetable'"),
          Row("indblisttablessuitetable", true))
        assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0)
        checkAnswer(
          allTables.filter("tableName = 'hiveindblisttablessuitetable'"),
          Row("hiveindblisttablessuitetable", false))
    }
  }
}
Example 6
Source File: HiveDataFrameAnalyticsSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.scalatest.BeforeAndAfterAll

// TODO ideally we should put the test suite into the package `sql`, as
// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
// support the `cube` or `rollup` yet.
// Hive DataFrame analytics suite
class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
  private var testData: DataFrame = _

  override def beforeAll() {
    testData = Seq((1, 2), (2, 4)).toDF("a", "b")
    TestHive.registerDataFrameAsTable(testData, "mytable")
  }

  override def afterAll(): Unit = {
    TestHive.dropTempTable("mytable")
  }

  test("cube") {
    checkAnswer(
      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
    )

    checkAnswer(
      testData.cube("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
    )
  }
}
Example 7
Source File: HiveParquetSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.{QueryTest, Row, SQLContext}

case class Cases(lower: String, UPPER: String)

class HiveParquetSuite extends QueryTest with ParquetTest {
  private val ctx = TestHive
  override def _sqlContext: SQLContext = ctx

  // Case-insensitive attribute names
  test("Case insensitive attribute names") {
    withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
      val expected = (1 to 4).map(i => Row(i.toString))
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  // SELECT query against a Parquet table
  test("SELECT on Parquet table") {
    val data = (1 to 4).map(i => (i, s"val_$i"))
    withParquetTable(data, "t") {
      checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
    }
  }

  // Simple column projection + filter on a Parquet table
  test("Simple column projection + filter on Parquet table") {
    withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
      checkAnswer(
        sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
        Seq(Row(true, "val_2"), Row(true, "val_4")))
    }
  }

  // Converting a Hive table to a Parquet table via saveAsParquetFile
  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
      ctx.read.parquet(dir.getCanonicalPath).registerTempTable("p")
      withTempTable("p") {
        checkAnswer(
          sql("SELECT * FROM src ORDER BY key"),
          sql("SELECT * from p ORDER BY key").collect().toSeq)
      }
    }
  }

  // INSERT OVERWRITE into a Parquet table
  test("INSERT OVERWRITE TABLE Parquet table") {
    withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
      withTempPath { file =>
        sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
        ctx.read.parquet(file.getCanonicalPath).registerTempTable("p")
        withTempTable("p") {
          // let's do three overwrites for good measure
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
        }
      }
    }
  }
}
Example 8
Source File: HiveTableScanSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.apache.spark.util.Utils

class HiveTableScanSuite extends HiveComparisonTest {

  // Partition-based table scan with different SerDes
  createQueryTest("partition_based_table_scan_with_different_serde",
    """
      |CREATE TABLE part_scan_test (key STRING, value STRING) PARTITIONED BY (ds STRING)
      |ROW FORMAT SERDE
      |'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
      |STORED AS RCFILE;
      |
      |FROM src
      |INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-01')
      |SELECT 100,100 LIMIT 1;
      |
      |ALTER TABLE part_scan_test SET SERDE
      |'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe';
      |
      |FROM src INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-02')
      |SELECT 200,200 LIMIT 1;
      |
      |SELECT * from part_scan_test;
    """.stripMargin)

  // In unit test, kv1.txt is a small file and will be loaded as table src
  // Since the small file will be considered as a single split, we assume
  // Hive / SparkSQL HQL has the same output even for SORT BY
  // File split for a small table
  createQueryTest("file_split_for_small_table",
    """
      |SELECT key, value FROM src SORT BY key, value
    """.stripMargin)

  // Lowercase issue
  test("Spark-4041: lowercase issue") {
    TestHive.sql("CREATE TABLE tb (KEY INT, VALUE STRING) STORED AS ORC")
    TestHive.sql("insert into table tb select key, value from src")
    TestHive.sql("select KEY from tb where VALUE='just_for_test' limit 5").collect()
    TestHive.sql("drop table tb")
  }

  // Timestamp query for a null value
  test("Spark-4077: timestamp query for null value") {
    TestHive.sql("DROP TABLE IF EXISTS timestamp_query_null")
    TestHive.sql(
      """
        CREATE EXTERNAL TABLE timestamp_query_null (time TIMESTAMP,id INT)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        LINES TERMINATED BY '\n'
      """.stripMargin)
    val location =
      Utils.getSparkClassLoader.getResource("data/files/issue-4077-data.txt").getFile()

    TestHive.sql(s"LOAD DATA LOCAL INPATH '$location' INTO TABLE timestamp_query_null")
    assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect()
      === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")), Row(null)))
    TestHive.sql("DROP TABLE timestamp_query_null")
  }

  // Attributes are case sensitive when using a select query from a projection
  test("Spark-4959 Attributes are case sensitive when using a select query from a projection") {
    sql("create table spark_4959 (col1 string)")
    sql("""insert into table spark_4959 select "hi" from src limit 1""")
    table("spark_4959").select(
      'col1.as("CaseSensitiveColName"),
      'col1.as("CaseSensitiveColName2")).registerTempTable("spark_4959_2")

    assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi"))
    assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi"))
  }
}
Example 9
Source File: HiveSerDeSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.cacheTables = false
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 10
Source File: HiveMetastoreCatalogSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.test.ExamplePointUDT
import org.apache.spark.sql.types.StructType

class HiveMetastoreCatalogSuite extends SparkFunSuite {

  test("struct field should accept underscore in sub-column name") {
    val metastr = "struct<a: int, b_1: string, c: string>"

    val datatype = HiveMetastoreTypes.toDataType(metastr)
    assert(datatype.isInstanceOf[StructType])
  }

  test("udt to metastore type conversion") {
    val udt = new ExamplePointUDT
    assert(HiveMetastoreTypes.toMetastoreType(udt) ===
      HiveMetastoreTypes.toMetastoreType(udt.sqlType))
  }

  test("duplicated metastore relations") {
    import TestHive.implicits._
    val df = TestHive.sql("SELECT * FROM src")
    println(df.queryExecution)
    df.as('a).join(df.as('b), $"a.key" === $"b.key")
  }
}
Example 11
Source File: HivePlanTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHive

class HivePlanTest extends QueryTest {
  import TestHive._
  import TestHive.implicits._

  // UDF constant folding
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan

    comparePlans(optimized, correctAnswer)
  }

  // Window expressions sharing the same partition by and order by clause
  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")

    val window = Window.
      partitionBy($"grp").
      orderBy($"val")

    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )

    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 12
Source File: ListTablesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row

class ListTablesSuite extends QueryTest with BeforeAndAfterAll {

  import org.apache.spark.sql.hive.test.TestHive.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    // The catalog in HiveContext is a case insensitive one.
    catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan)
    sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)")
    sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB")
    sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)")
  }

  override def afterAll(): Unit = {
    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable")
    sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable")
    sql("DROP DATABASE IF EXISTS ListTablesSuiteDB")
  }

  // Get all tables of the current database
  test("get all tables of current database") {
    Seq(tables(), sql("SHOW TABLes")).foreach {
      case allTables =>
        // We are using default DB.
        checkAnswer(
          allTables.filter("tableName = 'listtablessuitetable'"),
          Row("listtablessuitetable", true))
        checkAnswer(
          allTables.filter("tableName = 'hivelisttablessuitetable'"),
          Row("hivelisttablessuitetable", false))
        assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0)
    }
  }

  // Get all tables given a database name
  test("getting all tables with a database name") {
    Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb")).foreach {
      case allTables =>
        checkAnswer(
          allTables.filter("tableName = 'listtablessuitetable'"),
          Row("listtablessuitetable", true))
        assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0)
        checkAnswer(
          allTables.filter("tableName = 'hiveindblisttablessuitetable'"),
          Row("hiveindblisttablessuitetable", false))
    }
  }
}
Example 13
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.test.SQLTestUtils

class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
  override def _sqlContext: SQLContext = TestHive
  private val sqlContext = _sqlContext

  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName

  // A commitTask() failure should fall back to abortTask()
  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
    withTempPath { file =>
      // Here we coalesce partition number to 1 to ensure that only a single task is issued. This
      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
      // directory while committing/aborting the job. See SPARK-8513 for more details.
      val df = sqlContext.range(0, 10).coalesce(1)
      intercept[SparkException] {
        df.write.format(dataSourceName).save(file.getCanonicalPath)
      }

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }
}
Example 14
Source File: HiveTypeCoercionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.ProjectExec
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq(
    ("1", "1"), ("1.0", "CAST(1.0 AS DOUBLE)"), ("1L", "1L"),
    ("1S", "1S"), ("1Y", "1Y"), ("'1'", "'1'"))

  baseTypes.foreach { case (ni, si) =>
    baseTypes.foreach { case (nj, sj) =>
      createQueryTest(s"$ni + $nj", s"SELECT $si + $sj FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { case (i, s) =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $s else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $s end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.sparkPlan.collect {
      case e: ProjectExec => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 15
Source File: HiveTableScanSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.Row
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.apache.spark.util.Utils

class HiveTableScanSuite extends HiveComparisonTest {

  createQueryTest("partition_based_table_scan_with_different_serde",
    """
      |CREATE TABLE part_scan_test (key STRING, value STRING) PARTITIONED BY (ds STRING)
      |ROW FORMAT SERDE
      |'org.apache.hadoop.hive.serde2.columnar.LazyBinaryColumnarSerDe'
      |STORED AS RCFILE;
      |
      |FROM src
      |INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-01')
      |SELECT 100,100 LIMIT 1;
      |
      |ALTER TABLE part_scan_test SET SERDE
      |'org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe';
      |
      |FROM src INSERT INTO TABLE part_scan_test PARTITION (ds='2010-01-02')
      |SELECT 200,200 LIMIT 1;
      |
      |SELECT * from part_scan_test;
    """.stripMargin)

  // In unit test, kv1.txt is a small file and will be loaded as table src
  // Since the small file will be considered as a single split, we assume
  // Hive / SparkSQL HQL has the same output even for SORT BY
  createQueryTest("file_split_for_small_table",
    """
      |SELECT key, value FROM src SORT BY key, value
    """.stripMargin)

  test("Spark-4041: lowercase issue") {
    TestHive.sql("CREATE TABLE tb (KEY INT, VALUE STRING) STORED AS ORC")
    TestHive.sql("insert into table tb select key, value from src")
    TestHive.sql("select KEY from tb where VALUE='just_for_test' limit 5").collect()
    TestHive.sql("drop table tb")
  }

  test("Spark-4077: timestamp query for null value") {
    TestHive.sql("DROP TABLE IF EXISTS timestamp_query_null")
    TestHive.sql(
      """
        CREATE EXTERNAL TABLE timestamp_query_null (time TIMESTAMP,id INT)
        ROW FORMAT DELIMITED
        FIELDS TERMINATED BY ','
        LINES TERMINATED BY '\n'
      """.stripMargin)
    val location =
      Utils.getSparkClassLoader.getResource("data/files/issue-4077-data.txt").getFile()

    TestHive.sql(s"LOAD DATA LOCAL INPATH '$location' INTO TABLE timestamp_query_null")
    assert(TestHive.sql("SELECT time from timestamp_query_null limit 2").collect()
      === Array(Row(java.sql.Timestamp.valueOf("2014-12-11 00:00:00")), Row(null)))
    TestHive.sql("DROP TABLE timestamp_query_null")
  }

  test("Spark-4959 Attributes are case sensitive when using a select query from a projection") {
    sql("create table spark_4959 (col1 string)")
    sql("""insert into table spark_4959 select "hi" from src limit 1""")
    table("spark_4959").select(
      'col1.as("CaseSensitiveColName"),
      'col1.as("CaseSensitiveColName2")).registerTempTable("spark_4959_2")

    assert(sql("select CaseSensitiveColName from spark_4959_2").head() === Row("hi"))
    assert(sql("select casesensitivecolname from spark_4959_2").head() === Row("hi"))
  }
}
Example 16
Source File: HiveSerDeSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.cacheTables = false
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 17
Source File: HiveTypeCoercionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.Project
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq("1", "1.0", "1L", "1S", "1Y", "'1'")

  baseTypes.foreach { i =>
    baseTypes.foreach { j =>
      createQueryTest(s"$i + $j", s"SELECT $i + $j FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { i =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $i else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $i end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.executedPlan.collect {
      case e: Project => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 18
Source File: HiveOperatorQueryableSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton}

class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton {
  import hiveContext._

  test("SPARK-5324 query result of describe command") {
    hiveContext.loadTestTable("src")

    // register a describe command to be a temp table
    sql("desc src").registerTempTable("mydesc")
    checkAnswer(
      sql("desc mydesc"),
      Seq(
        Row("col_name", "string", "name of the column"),
        Row("data_type", "string", "data type of the column"),
        Row("comment", "string", "comment of the column")))

    checkAnswer(
      sql("select * from mydesc"),
      Seq(
        Row("key", "int", null),
        Row("value", "string", null)))

    checkAnswer(
      sql("select col_name, data_type, comment from mydesc"),
      Seq(
        Row("key", "int", null),
        Row("value", "string", null)))
  }
}
Example 19
Source File: RangerSparkMaskingExtensionTest.scala From spark-ranger with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.RangerSparkTestUtils._
import org.apache.spark.sql.catalyst.expressions.Alias
import org.apache.spark.sql.catalyst.plans.logical.{Project, RangerSparkMasking}
import org.scalatest.FunSuite

class RangerSparkMaskingExtensionTest extends FunSuite {

  private val spark = TestHive.sparkSession

  test("data masking for bob show last 4") {
    val extension = RangerSparkMaskingExtension(spark)
    val plan = spark.sql("select * from src").queryExecution.optimizedPlan
    println(plan)
    withUser("bob") {
      val newPlan = extension.apply(plan)
      assert(newPlan.isInstanceOf[Project])
      val project = newPlan.asInstanceOf[Project]
      val key = project.projectList.head
      assert(key.name === "key", "no affect on un masking attribute")
      val value = project.projectList.tail
      assert(value.head.name === "value", "attibute name should be unchanged")
      assert(value.head.asInstanceOf[Alias].child.sql ===
        "mask_show_last_n(`value`, 4, 'x', 'x', 'x', -1, '1')")
    }

    withUser("alice") {
      val newPlan = extension.apply(plan)
      assert(newPlan === RangerSparkMasking(plan))
    }
  }
}
Example 20
Source File: HiveTypeCoercionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.ProjectExec
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq(
    ("1", "1"), ("1.0", "CAST(1.0 AS DOUBLE)"), ("1L", "1L"),
    ("1S", "1S"), ("1Y", "1Y"), ("'1'", "'1'"))

  baseTypes.foreach { case (ni, si) =>
    baseTypes.foreach { case (nj, sj) =>
      createQueryTest(s"$ni + $nj", s"SELECT $si + $sj FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { case (i, s) =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $s else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $s end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.sparkPlan.collect {
      case e: ProjectExec => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 21
Source File: HiveTypeCoercionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.ProjectExec
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq(
    ("1", "1"), ("1.0", "CAST(1.0 AS DOUBLE)"), ("1L", "1L"),
    ("1S", "1S"), ("1Y", "1Y"), ("'1'", "'1'"))

  baseTypes.foreach { case (ni, si) =>
    baseTypes.foreach { case (nj, sj) =>
      createQueryTest(s"$ni + $nj", s"SELECT $si + $sj FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { case (i, s) =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $s else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $s end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.sparkPlan.collect {
      case e: ProjectExec => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 22
Source File: HivemallOpsSuite.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming

import org.apache.spark.ml.feature.HmLabeledPoint
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HivemallOps._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.types._
import org.apache.spark.streaming.HivemallStreamingOps._

class HivemallOpsSuite extends TestSuiteBase {

  import org.apache.spark.sql.hive.test.TestHive.implicits._
  import org.apache.spark.streaming.HivemallOpsSuite._

  test("streaming") {
    withStreamingContext(new StreamingContext(TestHive.sparkContext, Milliseconds(500))) { ssc =>
      val input = Seq(
        Seq(HmLabeledPoint(features = "1:0.6" :: "2:0.1" :: Nil)),
        Seq(HmLabeledPoint(features = "2:0.9" :: Nil)),
        Seq(HmLabeledPoint(features = "1:0.2" :: Nil)),
        Seq(HmLabeledPoint(features = "2:0.1" :: Nil)),
        Seq(HmLabeledPoint(features = "0:0.6" :: "2:0.4" :: Nil))
      )

      val stream = new TestInputStream[HmLabeledPoint](ssc, input, 2)

      // Apply predictions on input streams
      val prediction = stream.predict { testDf =>
        val testDf_exploded = testDf
          .explode_array($"features")
          .select(rowid(), extract_feature($"feature"), extract_weight($"feature"))
        val predictDf = testDf_exploded
          .join(model, testDf_exploded("feature") === model("feature"), "LEFT_OUTER")
          .select($"rowid", ($"weight" * $"value").as("value"))
          .groupby("rowid").sum("value")
          .select($"rowid", sigmoid($"SUM(value)"))
        assert(predictDf.count > 0)
        predictDf
      }

      // Dummy output stream
      prediction.foreachRDD { _ => {} }
    }
  }
}

object HivemallOpsSuite {
  implicit val sqlContext = TestHive

  val model = TestHive.createDataFrame(
    TestHive.sparkContext.parallelize(
      Row(0, 0.3f) :: Row(1, 0.1f) :: Row(2, 0.6f) :: Row(3, 0.2f) :: Nil
    ),
    StructType(
      StructField("feature", IntegerType, true) ::
      StructField("weight", FloatType, true) ::
      Nil)
  )
}
Example 23
Source File: OapQuerySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import java.util.{Locale, TimeZone}

import org.scalatest.{BeforeAndAfter, Ignore}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.internal.SQLConf

// Ignore because in separate package will encounter problem with shaded spark source.
@Ignore
class OapQuerySuite extends HiveComparisonTest with BeforeAndAfter {
  private lazy val originalTimeZone = TimeZone.getDefault
  private lazy val originalLocale = Locale.getDefault
  import org.apache.spark.sql.hive.test.TestHive._

  // Note: invoke TestHive will create a SparkContext which can't be configured by us.
  // So be careful this may affect current using SparkContext and cause strange problem.
  private lazy val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled

  override def beforeAll() {
    super.beforeAll()
    TestHive.setCacheTables(true)
    // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*)
    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
    // Add Locale setting
    Locale.setDefault(Locale.US)
    // Ensures that cross joins are enabled so that we can test them
    TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true)
    TestHive.setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true)
  }

  override def afterAll() {
    try {
      TestHive.setCacheTables(false)
      TimeZone.setDefault(originalTimeZone)
      Locale.setDefault(originalLocale)
      sql("DROP TEMPORARY FUNCTION IF EXISTS udtf_count2")
      TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
    } finally {
      super.afterAll()
    }
  }

  private def assertDupIndex(body: => Unit): Unit = {
    val e = intercept[AnalysisException] { body }
    assert(e.getMessage.toLowerCase.contains("exists"))
  }

  test("create hive table in parquet format") {
    try {
      sql("create table p_table (key int, val string) stored as parquet")
      sql("insert overwrite table p_table select * from src")
      sql("create oindex if not exists p_index on p_table(key)")
      assert(sql("select val from p_table where key = 238")
        .collect().head.getString(0) == "val_238")
    } finally {
      sql("drop oindex p_index on p_table")
      sql("drop table p_table")
    }
  }

  test("create duplicate hive table in parquet format") {
    try {
      sql("create table p_table1 (key int, val string) stored as parquet")
      sql("insert overwrite table p_table1 select * from src")
      sql("create oindex p_index on p_table1(key)")
      assertDupIndex { sql("create oindex p_index on p_table1(key)") }
    } finally {
      sql("drop oindex p_index on p_table1")
    }
  }
}
Example 24
Source File: HiveSerDeSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.setCacheTables(false)
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 25
Source File: HiveTypeCoercionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.ProjectExec
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq(
    ("1", "1"), ("1.0", "CAST(1.0 AS DOUBLE)"), ("1L", "1L"),
    ("1S", "1S"), ("1Y", "1Y"), ("'1'", "'1'"))

  baseTypes.foreach { case (ni, si) =>
    baseTypes.foreach { case (nj, sj) =>
      createQueryTest(s"$ni + $nj", s"SELECT $si + $sj FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { case (i, s) =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $s else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $s end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.sparkPlan.collect {
      case e: ProjectExec => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 26
Source File: DatasetPerformanceSuite.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf

import org.apache.spark.sql.hive.test.TestHive
import org.scalatest.FunSuite

class DatasetPerformanceSuite extends FunSuite {

  ignore("run benchmark") {
    TestHive // Init HiveContext
    val benchmark = new DatasetPerformance() {
      override val numLongs = 100
    }
    import benchmark._
    val exp = runExperiment(allBenchmarks)
    exp.waitForFinish(10000)
  }
}
Example 27
Source File: RangerSparkPlanOmitStrategyTest.scala From spark-ranger with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.plans.logical.{RangerSparkMasking, RangerSparkRowFilter}
import org.apache.spark.sql.hive.test.TestHive
import org.scalatest.FunSuite

class RangerSparkPlanOmitStrategyTest extends FunSuite {

  private val spark = TestHive.sparkSession

  test("ranger spark plan omit strategy") {
    val strategy = RangerSparkPlanOmitStrategy(spark)
    val df = spark.range(0, 5)
    val plan1 = df.queryExecution.optimizedPlan
    assert(strategy.apply(plan1) === Nil)
    val plan2 = RangerSparkRowFilter(plan1)
    assert(strategy.apply(plan2) === PlanLater(plan1) :: Nil)
    val plan3 = RangerSparkMasking(plan1)
    assert(strategy.apply(plan3) === PlanLater(plan1) :: Nil)
    val plan4 = RangerSparkMasking(plan2)
    assert(strategy.apply(plan4) === PlanLater(plan2) :: Nil)
    val plan5 = RangerSparkRowFilter(plan3)
    assert(strategy.apply(plan5) === PlanLater(plan3) :: Nil)
  }
}
Example 28
Source File: HiveSerDeSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.setCacheTables(false)
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 29
Source File: RangerSparkRowFilterExtensionTest.scala From spark-ranger with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.hive.test.TestHive
import org.scalatest.FunSuite

import org.apache.spark.sql.RangerSparkTestUtils._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, RangerSparkRowFilter}

class RangerSparkRowFilterExtensionTest extends FunSuite {

  private val spark = TestHive.sparkSession

  test("ranger spark row filter extension") {
    val extension = RangerSparkRowFilterExtension(spark)
    val plan = spark.sql("select * from src").queryExecution.optimizedPlan
    println(plan)
    withUser("bob") {
      val newPlan = extension.apply(plan)
      assert(newPlan.isInstanceOf[RangerSparkRowFilter])
      val filters = newPlan.collect { case f: Filter => f }
      assert(filters.nonEmpty, "ranger row level filters should be applied automatically")
      println(newPlan)
    }
    withUser("alice") {
      val newPlan = extension.apply(plan)
      assert(newPlan.isInstanceOf[RangerSparkRowFilter])
      val filters = newPlan.collect { case f: Filter => f }
      assert(filters.isEmpty, "alice does not have implicit filters")
      println(newPlan)
    }
  }
}
Example 30
Source File: DeltaHiveTest.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.test

import org.apache.spark.sql.delta.catalog.DeltaCatalog
import io.delta.sql.DeltaSparkSessionExtension
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{SparkContext, SparkFunSuite}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.hive.test.{TestHive, TestHiveContext}
import org.apache.spark.sql.internal.{SQLConf, StaticSQLConf}
import org.apache.spark.sql.test.SQLTestUtils

trait DeltaHiveTest extends SparkFunSuite with BeforeAndAfterAll { self: SQLTestUtils =>

  private var _session: SparkSession = _
  private var _hiveContext: TestHiveContext = _
  private var _sc: SparkContext = _

  override def beforeAll(): Unit = {
    val conf = TestHive.sparkSession.sparkContext.getConf.clone()
    TestHive.sparkSession.stop()
    conf.set(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[DeltaCatalog].getName)
    conf.set(StaticSQLConf.SPARK_SESSION_EXTENSIONS.key,
      classOf[DeltaSparkSessionExtension].getName)
    _sc = new SparkContext("local", this.getClass.getName, conf)
    _hiveContext = new TestHiveContext(_sc)
    _session = _hiveContext.sparkSession
    SparkSession.setActiveSession(_session)
    super.beforeAll()
  }

  override protected def spark: SparkSession = _session

  override def afterAll(): Unit = {
    try {
      _hiveContext.reset()
    } finally {
      _sc.stop()
    }
  }
}
Example 31
Source File: HiveSerDeSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive

class HiveSerDeSuite extends HiveComparisonTest with BeforeAndAfterAll {

  override def beforeAll(): Unit = {
    import TestHive._
    import org.apache.hadoop.hive.serde2.RegexSerDe
    super.beforeAll()
    TestHive.setCacheTables(false)
    sql(s"""CREATE TABLE IF NOT EXISTS sales (key STRING, value INT)
           |ROW FORMAT SERDE '${classOf[RegexSerDe].getCanonicalName}'
           |WITH SERDEPROPERTIES ("input.regex" = "([^ ]*)\t([^ ]*)")
         """.stripMargin)
    sql(s"LOAD DATA LOCAL INPATH '${getHiveFile("data/files/sales.txt")}' INTO TABLE sales")
  }

  // table sales is not a cache table, and will be clear after reset
  createQueryTest("Read with RegexSerDe", "SELECT * FROM sales", false)

  createQueryTest(
    "Read and write with LazySimpleSerDe (tab separated)",
    "SELECT * from serdeins")

  createQueryTest("Read with AvroSerDe", "SELECT * FROM episodes")

  createQueryTest("Read Partitioned with AvroSerDe", "SELECT * FROM episodes_part")
}
Example 32
Source File: HiveTypeCoercionSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.expressions.{Cast, EqualTo}
import org.apache.spark.sql.execution.ProjectExec
import org.apache.spark.sql.hive.test.TestHive

class HiveTypeCoercionSuite extends HiveComparisonTest {
  val baseTypes = Seq(
    ("1", "1"), ("1.0", "CAST(1.0 AS DOUBLE)"), ("1L", "1L"),
    ("1S", "1S"), ("1Y", "1Y"), ("'1'", "'1'"))

  baseTypes.foreach { case (ni, si) =>
    baseTypes.foreach { case (nj, sj) =>
      createQueryTest(s"$ni + $nj", s"SELECT $si + $sj FROM src LIMIT 1")
    }
  }

  val nullVal = "null"
  baseTypes.init.foreach { case (i, s) =>
    createQueryTest(s"case when then $i else $nullVal end ",
      s"SELECT case when true then $s else $nullVal end FROM src limit 1")
    createQueryTest(s"case when then $nullVal else $i end ",
      s"SELECT case when true then $nullVal else $s end FROM src limit 1")
  }

  test("[SPARK-2210] boolean cast on boolean value should be removed") {
    val q = "select cast(cast(key=0 as boolean) as boolean) from src"
    val project = TestHive.sql(q).queryExecution.sparkPlan.collect {
      case e: ProjectExec => e
    }.head

    // No cast expression introduced
    project.transformAllExpressions {
      case c: Cast =>
        fail(s"unexpected cast $c")
        c
    }

    // Only one equality check
    var numEquals = 0
    project.transformAllExpressions {
      case e: EqualTo =>
        numEquals += 1
        e
    }
    assert(numEquals === 1)
  }
}
Example 33
Source File: HiveDataFrameAnalyticsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.scalatest.BeforeAndAfterAll

// TODO ideally we should put the test suite into the package `sql`, as
// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
// support the `cube` or `rollup` yet.
class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
  private var testData: DataFrame = _

  override def beforeAll() {
    testData = Seq((1, 2), (2, 4)).toDF("a", "b")
    TestHive.registerDataFrameAsTable(testData, "mytable")
  }

  override def afterAll(): Unit = {
    TestHive.dropTempTable("mytable")
  }

  test("rollup") {
    checkAnswer(
      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
    )

    checkAnswer(
      testData.rollup("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
    )
  }

  test("cube") {
    checkAnswer(
      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
    )

    checkAnswer(
      testData.cube("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
    )
  }
}
Example 34
Source File: QueryPartitionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.util.Utils

class QueryPartitionSuite extends QueryTest {
  import org.apache.spark.sql.hive.test.TestHive.implicits._

  test("SPARK-5068: query data when path doesn't exist") {
    val testData = TestHive.sparkContext.parallelize(
      (1 to 10).map(i => TestData(i, i.toString))).toDF()
    testData.registerTempTable("testData")

    val tmpDir = Files.createTempDir()
    // create the table for test
    sql(s"CREATE TABLE table_with_partition(key int,value string) " +
      s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='1') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='2') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='3') " +
      "SELECT key,value FROM testData")
    sql("INSERT OVERWRITE TABLE table_with_partition partition (ds='4') " +
      "SELECT key,value FROM testData")

    // test for the exist path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
        ++ testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect)

    // delete the path of one partition
    tmpDir.listFiles
      .find { f => f.isDirectory && f.getName().startsWith("ds=") }
      .foreach { f => Utils.deleteRecursively(f) }

    // test for after delete the path
    checkAnswer(sql("select key,value from table_with_partition"),
      testData.toSchemaRDD.collect ++ testData.toSchemaRDD.collect
        ++ testData.toSchemaRDD.collect)

    sql("DROP TABLE table_with_partition")
    sql("DROP TABLE createAndInsertTest")
  }
}
Example 35
Source File: OrcTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc

import java.io.File

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql._

private[sql] trait OrcTest extends SQLTestUtils {
  protected def hiveContext = sqlContext.asInstanceOf[HiveContext]

  import sqlContext.sparkContext
  import sqlContext.implicits._

  protected def withOrcTable[T <: Product: ClassTag: TypeTag]
      (data: Seq[T], tableName: String)
      (f: => Unit): Unit = {
    withOrcDataFrame(data) { df =>
      hiveContext.registerDataFrameAsTable(df, tableName)
      withTempTable(tableName)(f)
    }
  }

  protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
      data: Seq[T], path: File): Unit = {
    data.toDF().write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
  }

  protected def makeOrcFile[T <: Product: ClassTag: TypeTag](
      df: DataFrame, path: File): Unit = {
    df.write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath)
  }
}
Example 36
Source File: HiveParquetSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.parquet.ParquetTest
import org.apache.spark.sql.{QueryTest, SQLConf}

case class Cases(lower: String, UPPER: String)

class HiveParquetSuite extends QueryTest with ParquetTest {
  val sqlContext = TestHive

  import sqlContext._

  def run(prefix: String): Unit = {
    test(s"$prefix: Case insensitive attribute names") {
      withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
        val expected = (1 to 4).map(i => Row(i.toString))
        checkAnswer(sql("SELECT upper FROM cases"), expected)
        checkAnswer(sql("SELECT LOWER FROM cases"), expected)
      }
    }

    test(s"$prefix: SELECT on Parquet table") {
      val data = (1 to 4).map(i => (i, s"val_$i"))
      withParquetTable(data, "t") {
        checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
      }
    }

    test(s"$prefix: Simple column projection + filter on Parquet table") {
      withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
        checkAnswer(
          sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
          Seq(Row(true, "val_2"), Row(true, "val_4")))
      }
    }

    test(s"$prefix: Converting Hive to Parquet Table via saveAsParquetFile") {
      withTempPath { dir =>
        sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
        read.parquet(dir.getCanonicalPath).registerTempTable("p")
        withTempTable("p") {
          checkAnswer(
            sql("SELECT * FROM src ORDER BY key"),
            sql("SELECT * from p ORDER BY key").collect().toSeq)
        }
      }
    }

    test(s"$prefix: INSERT OVERWRITE TABLE Parquet table") {
      withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
        withTempPath { file =>
          sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
          read.parquet(file.getCanonicalPath).registerTempTable("p")
          withTempTable("p") {
            // let's do three overwrites for good measure
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
          }
        }
      }
    }
  }

  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
    run("Parquet data source enabled")
  }

  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") {
    run("Parquet data source disabled")
  }
}