org.apache.spark.sql.QueryTest Scala Examples
The following examples show how to use org.apache.spark.sql.QueryTest.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: HiveParquetSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Two-column record with one lower-case and one upper-case field name. */
case class Cases(lower: String, UPPER: String)

/** Query tests that run SQL against temporary Parquet tables in the Hive test session. */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val records = (1 to 4).map { i => Cases(i.toString, i.toString) }
    withParquetTable(records, "cases") {
      val expected = (1 to 4).map(i => Row(i.toString))
      // Each query spells the column in the opposite case from its declaration in `Cases`.
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  test("SELECT on Parquet table") {
    val data = (1 to 4).map { i => (i, s"val_$i") }
    withParquetTable(data, "t") {
      val everything = sql("SELECT * FROM t")
      checkAnswer(everything, data.map(Row.fromTuple))
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val data = (1 to 4).map { i => (i % 2 == 0, i, s"val_$i") }
    withParquetTable(data, "t") {
      // Only the rows whose first column is `true` (i.e. even `i`) are expected back.
      val evenRows = Seq(Row(true, "val_2"), Row(true, "val_4"))
      checkAnswer(sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"), evenRows)
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      spark.read.parquet(location).createOrReplaceTempView("p")
      withTempView("p") {
        val fromHive = sql("SELECT * FROM src ORDER BY key")
        val fromParquet = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(fromHive, fromParquet)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    // Don't run with vectorized: currently relies on UnsafeRow.
    val data = (1 to 10).map { i => (i, s"val_$i") }
    withParquetTable(data, "t", false) {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        spark.read.parquet(location).createOrReplaceTempView("p")
        withTempView("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach { _ => sql("INSERT OVERWRITE TABLE p SELECT * FROM t") }
          val overwritten = sql("SELECT * FROM p")
          val source = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(overwritten, source)
        }
      }
    }
  }
}
Example 2
Source File: HiveDataFrameJoinSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Checks that a DataFrame self-join resolves columns referenced with different letter case. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import spark.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Plain self-join: both sides refer to the same column, spelled "key" and "Key".
    val selfJoined = df.join(df, df("key") === df("Key"))
    checkAnswer(selfJoined, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    // Self-join against a filtered copy of the same DataFrame.
    val onlyTwo = df.filter($"value" === "2")
    val filteredJoin = df.join(onlyTwo, df("key") === df("Key"))
    checkAnswer(filteredJoin, Seq(Row(2, "2", 2, "2")))
  }
}
Example 3
Source File: MetastoreRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Tests for `MetastoreRelation` copying/serialization and for CTAS into a Hive serde table. */
class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("makeCopy and toJSON should work") {
    val singleIntColumn = StructType(Seq(StructField("a", IntegerType, nullable = true)))
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = singleIntColumn)
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Seq(Row(0)))

        val barIdentifier = TableIdentifier("bar")
        val tableMetadata = spark.sessionState.catalog.getTableMetadata(barIdentifier)
        assert(tableMetadata.provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}
Example 4
Source File: HiveExplainSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.sql.QueryTest import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils }" ) checkKeywordsNotExist(sql("EXPLAIN CODEGEN SELECT 1"), "== Physical Plan ==" ) intercept[ParseException] { sql("EXPLAIN EXTENDED CODEGEN SELECT 1") } } }
Example 5
Source File: HiveOperatorQueryableSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Verifies that the output of a `desc` command can itself be registered and queried. */
class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton {
  import spark._

  test("SPARK-5324 query result of describe command") {
    hiveContext.loadTestTable("src")

    // Creates a temporary view with the output of a describe command
    sql("desc src").createOrReplaceTempView("mydesc")

    // Describing the view yields the three columns of the describe output itself.
    val describeSchema = Seq(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeSchema)

    // Selecting from the view yields the description of `src`, with or without explicit columns.
    val srcColumns = Seq(
      Row("key", "int", null),
      Row("value", "string", null))
    checkAnswer(sql("select * from mydesc"), srcColumns)
    checkAnswer(sql("select col_name, data_type, comment from mydesc"), srcColumns)
  }
}
Example 6
Source File: HivePlanTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level checks: compares optimized plans and counts Window operators in analyzed plans. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")

    // `cos(null)` is expected to optimize to the same plan as a typed null literal.
    val folded = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val reference = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(folded, reference)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.partitionBy($"grp").orderBy($"val")

    // Two aggregates over the same partitioning/ordering, differing only in frame type.
    val rowsFrameSum = sum($"val").over(window.rowsBetween(-1, 1))
    val rangeFrameSum = sum($"val").over(window.rangeBetween(-1, 1))
    val query = df.select($"id", rowsFrameSum, rangeFrameSum)

    val windowOperators = query.queryExecution.analyzed.collect { case w: logical.Window => w }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 7
Source File: PruneFileSourcePartitionsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions, TableFileCatalog}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

/** Runs the `PruneFileSourcePartitions` rule in isolation over a hand-built relation. */
class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  /** Rule executor containing a single one-shot batch with only the rule under test. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val tableFileCatalog = new TableFileCatalog(spark, tableMeta, 0)

        // The data schema is the table schema minus the partition columns.
        val partitionColumns = tableMeta.partitionColumnNames
        val dataSchema =
          StructType(tableMeta.schema.filterNot(field => partitionColumns.contains(field.name)))

        val relation = HadoopFsRelation(
          location = tableFileCatalog,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)
        val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta))

        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze
        val optimized = Optimize.execute(query)

        // Every attribute the optimized plan references must still be produced by its children.
        assert(optimized.missingInput.isEmpty)
      }
    }
  }
}
Example 8
Source File: HiveUtilsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.hadoop.hive.conf.HiveConf.ConfVars

import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.QueryTest

/** Tests for the temporary Hive configuration produced by `HiveUtils`. */
class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("newTemporaryConfiguration overwrites listener configurations") {
    // The three metastore listener settings that are expected to be blanked out.
    val listenerVars = Seq(
      ConfVars.METASTORE_PRE_EVENT_LISTENERS,
      ConfVars.METASTORE_EVENT_LISTENERS,
      ConfVars.METASTORE_END_FUNCTION_LISTENERS)

    for (useInMemoryDerby <- Seq(true, false)) {
      val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
      listenerVars.foreach { confVar =>
        assert(conf(confVar.varname) === "")
      }
    }
  }
}
Example 9
Source File: ListTablesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for a temporary view, a table in the
 * current database and a table in a separate database.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    sessionState.catalog.createTempView(
      "ListTablesSuiteTable", df.logicalPlan, overrideIfExists = true)
    sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)")
    sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB")
    sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)")
  }

  override def afterAll(): Unit = {
    try {
      // Drop everything created in beforeAll, tolerating objects that are already gone.
      sessionState.catalog.dropTable(
        TableIdentifier("ListTablesSuiteTable"), ignoreIfNotExists = true, purge = false)
      sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable")
      sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable")
      sql("DROP DATABASE IF EXISTS ListTablesSuiteDB")
    } finally {
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    Seq(tables(), sql("SHOW TABLes")).foreach { allTables =>
      // We are using default DB.
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      checkAnswer(tempView, Row("", "listtablessuitetable", true))

      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      checkAnswer(hiveTable, Row("default", "hivelisttablessuitetable", false))

      val otherDbTable = allTables.filter("tableName = 'hiveindblisttablessuitetable'")
      assert(otherDbTable.count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    listings.foreach { allTables =>
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      checkAnswer(tempView, Row("", "listtablessuitetable", true))

      val defaultDbTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      assert(defaultDbTable.count() === 0)

      val inDbTable = allTables.filter("tableName = 'hiveindblisttablessuitetable'")
      checkAnswer(inDbTable, Row("listtablessuitedb", "hiveindblisttablessuitetable", false))
    }
  }
}
Example 10
Source File: SQLBuilderTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Base class for tests that compare generated SQL text against an expected string.
 *
 * Failures are reported through `fail` with a message describing the offending expression or
 * plan. Only non-fatal throwables are converted into test failures, and the original throwable
 * is attached as the cause so its stack trace is not lost.
 */
abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {

  /**
   * Asserts that the SQL text of `e` equals `expectedSQL`.
   *
   * @param e expression whose `sql` representation is checked
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      // NonFatal still matches assertion failures but lets fatal JVM errors propagate untouched.
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin,
          cause)
    }
  }

  /**
   * Converts `plan` to SQL with `SQLBuilder`, asserts the text equals `expectedSQL`, and then
   * checks that running the generated SQL gives the same answer as the plan itself.
   *
   * @param plan logical plan to convert
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch {
      case NonFatal(e) =>
        // Keep the conversion error as the cause; the message itself only shows the plan.
        fail(
          s"""Cannot convert the following logical query plan to SQL:
             |
             |${plan.treeString}
           """.stripMargin,
          e)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin,
          cause)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  /**
   * Same as the `LogicalPlan` overload, applied to the analyzed plan of `df`.
   *
   * @param df DataFrame whose analyzed plan is converted
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
}
Example 11
Source File: HadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

/** Compares the size statistic of a Parquet read against the on-disk size of its files. */
class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      // Delete the directory before writing Parquet output to the same path.
      dir.delete()
      val location = dir.toString
      spark.range(1000).write.parquet(location)

      // ignore hidden files
      val visibleOnly = new FilenameFilter {
        override def accept(parent: File, name: String): Boolean = !name.startsWith(".")
      }
      val totalSize = dir.listFiles(visibleOnly).map(_.length()).sum

      val df = spark.read.parquet(location)
      assert(df.queryExecution.logical.statistics.sizeInBytes === BigInt(totalSize))
    }
  }
}
Example 12
Source File: PartitionedWriteSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.Utils

/**
 * Tests for writing DataFrames partitioned by a column and reading them back.
 *
 * All tests obtain their output location through `withTempPath`, so the directory is cleaned up
 * by that helper instead of by a trailing delete call that is skipped when a check fails.
 */
class PartitionedWriteSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("write many partitions") {
    withTempPath { path =>
      val location = path.getCanonicalPath
      val df = spark.range(100).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(location)

      // On read-back the partition column `id` comes after the data column, hence Row(1, id).
      checkAnswer(
        spark.read.load(location),
        (0 to 99).map(Row(1, _)))
    }
  }

  test("write many partitions with repeats") {
    withTempPath { path =>
      val location = path.getCanonicalPath
      val base = spark.range(100)
      // Every id appears twice, so every partition receives two rows.
      val df = base.union(base).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(location)

      val onePass = (0 to 99).map(Row(1, _))
      checkAnswer(
        spark.read.load(location),
        onePass ++ onePass)
    }
  }

  test("partitioned columns should appear at the end of schema") {
    withTempPath { f =>
      val path = f.getAbsolutePath
      Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path)
      assert(spark.read.parquet(path).schema.map(_.name) == Seq("j", "i"))
    }
  }
}
Example 13
Source File: OptimizeHiveMetadataOnlyQuerySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfter

import org.apache.spark.metrics.source.HiveCatalogMetrics
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.expressions.NamedExpression
import org.apache.spark.sql.catalyst.plans.logical.{Distinct, Filter, Project, SubqueryAlias}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf.OPTIMIZER_METADATA_ONLY
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/**
 * Tests metadata-only queries over a partitioned Hive table, using the
 * `METRIC_PARTITIONS_FETCHED` counter to observe how many partitions were fetched.
 */
class OptimizeHiveMetadataOnlyQuerySuite extends QueryTest with TestHiveSingleton
    with BeforeAndAfter with SQLTestUtils {

  import spark.implicits._

  override def beforeAll(): Unit = {
    super.beforeAll()
    sql("CREATE TABLE metadata_only (id bigint, data string) PARTITIONED BY (part int)")
    // Register eleven partitions: part=0 through part=10.
    for (p <- 0 to 10) {
      sql(s"ALTER TABLE metadata_only ADD PARTITION (part=$p)")
    }
  }

  override protected def afterAll(): Unit = {
    try {
      sql("DROP TABLE IF EXISTS metadata_only")
    } finally {
      super.afterAll()
    }
  }

  test("SPARK-23877: validate metadata-only query pushes filters to metastore") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the number of matching partitions
      val matching = sql("SELECT DISTINCT part FROM metadata_only WHERE part < 5").collect()
      assert(matching.length === 5)

      // verify that the partition predicate was pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount === 5)
    }
  }

  test("SPARK-23877: filter on projected expression") {
    withSQLConf(OPTIMIZER_METADATA_ONLY.key -> "true") {
      val startCount = HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount

      // verify the matching partitions
      // Plan shape: Distinct(Filter(x < 5, Project(part + 1 AS x, <child of the table alias>))).
      val projected = Project(
        Seq(($"part" + 1).as("x").expr.asInstanceOf[NamedExpression]),
        spark.table("metadata_only").logicalPlan.asInstanceOf[SubqueryAlias].child)
      val distinctPlan = Distinct(Filter(($"x" < 5).expr, projected))
      val partitions = spark.internalCreateDataFrame(
        distinctPlan.queryExecution.toRdd,
        StructType(Seq(StructField("x", IntegerType))))

      checkAnswer(partitions, Seq(1, 2, 3, 4).toDF("x"))

      // verify that the partition predicate was not pushed down to the metastore
      assert(HiveCatalogMetrics.METRIC_PARTITIONS_FETCHED.getCount - startCount == 11)
    }
  }
}
Example 14
Source File: HiveParquetSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf

/** Two-column record with one lower-case and one upper-case field name. */
case class Cases(lower: String, UPPER: String)

/** Query tests that run SQL against temporary Parquet tables in the Hive test session. */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val records = (1 to 4).map { i => Cases(i.toString, i.toString) }
    withParquetTable(records, "cases") {
      val expected = (1 to 4).map(i => Row(i.toString))
      // Each query spells the column in the opposite case from its declaration in `Cases`.
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  test("SELECT on Parquet table") {
    val data = (1 to 4).map { i => (i, s"val_$i") }
    withParquetTable(data, "t") {
      val everything = sql("SELECT * FROM t")
      checkAnswer(everything, data.map(Row.fromTuple))
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val data = (1 to 4).map { i => (i % 2 == 0, i, s"val_$i") }
    withParquetTable(data, "t") {
      // Only the rows whose first column is `true` (i.e. even `i`) are expected back.
      val evenRows = Seq(Row(true, "val_2"), Row(true, "val_4"))
      checkAnswer(sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"), evenRows)
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      spark.read.parquet(location).createOrReplaceTempView("p")
      withTempView("p") {
        val fromHive = sql("SELECT * FROM src ORDER BY key")
        val fromParquet = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(fromHive, fromParquet)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    // Don't run with vectorized: currently relies on UnsafeRow.
    val data = (1 to 10).map { i => (i, s"val_$i") }
    withParquetTable(data, "t", false) {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        spark.read.parquet(location).createOrReplaceTempView("p")
        withTempView("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach { _ => sql("INSERT OVERWRITE TABLE p SELECT * FROM t") }
          val overwritten = sql("SELECT * FROM p")
          val source = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(overwritten, source)
        }
      }
    }
  }

  test("SPARK-25206: wrong records are returned by filter pushdown " +
    "when Hive metastore schema and parquet schema are in different letter cases") {
    withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> true.toString) {
      withTempPath { path =>
        val location = path.getCanonicalPath
        // The Parquet files use a lower-case `id`; the table below declares it as `ID`.
        val data = spark.range(1, 10).toDF("id")
        data.write.parquet(location)
        withTable("SPARK_25206") {
          sql("CREATE TABLE SPARK_25206 (ID LONG) USING parquet LOCATION " +
            s"'${location}'")
          checkAnswer(sql("select id from SPARK_25206 where id > 0"), data)
        }
      }
    }
  }
}
Example 15
Source File: HiveDataFrameJoinSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Checks that a DataFrame self-join resolves columns referenced with different letter case. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import spark.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Plain self-join: both sides refer to the same column, spelled "key" and "Key".
    val selfJoined = df.join(df, df("key") === df("Key"))
    checkAnswer(selfJoined, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    // Self-join against a filtered copy of the same DataFrame.
    val onlyTwo = df.filter($"value" === "2")
    val filteredJoin = df.join(onlyTwo, df("key") === df("Key"))
    checkAnswer(filteredJoin, Seq(Row(2, "2", 2, "2")))
  }
}
Example 16
Source File: HivePlanTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level checks: compares optimized plans and counts Window operators in analyzed plans. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")

    // `cos(null)` is expected to optimize to the same plan as a typed null literal.
    val folded = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val reference = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan

    comparePlans(folded, reference)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.partitionBy($"grp").orderBy($"val")

    // Two aggregates over the same partitioning/ordering, differing only in frame type.
    val rowsFrameSum = sum($"val").over(window.rowsBetween(-1, 1))
    val rangeFrameSum = sum($"val").over(window.rangeBetween(-1, 1))
    val query = df.select($"id", rowsFrameSum, rangeFrameSum)

    val windowOperators = query.queryExecution.analyzed.collect { case w: logical.Window => w }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 17
Source File: PruneFileSourcePartitionsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.scalatest.Matchers._

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project, ResolvedHint}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.execution.joins.BroadcastHashJoinExec
import org.apache.spark.sql.functions.broadcast
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

/** Tests around the `PruneFileSourcePartitions` rule: output, statistics and broadcast hints. */
class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  /** Rule executor containing a single one-shot batch with only the rule under test. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.toURI}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        // The data schema is the table schema minus the partition columns.
        val partitionColumns = tableMeta.partitionColumnNames
        val dataSchema =
          StructType(tableMeta.schema.filterNot(field => partitionColumns.contains(field.name)))

        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)
        val logicalRelation = LogicalRelation(relation, tableMeta)

        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze
        val optimized = Optimize.execute(query)

        // Every attribute the optimized plan references must still be produced by its children.
        assert(optimized.missingInput.isEmpty)
      }
    }
  }

  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
    withTable("tbl") {
      spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
      sql("ANALYZE TABLE tbl COMPUTE STATISTICS")

      val tableStats = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tbl")).stats
      assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")

      val df = sql("SELECT * FROM tbl WHERE p = 1")

      // Analyzed plan: the relation still reports the size of the whole table.
      val analyzedSizes = df.queryExecution.analyzed.collect {
        case relation: LogicalRelation => relation.catalogTable.get.stats.get.sizeInBytes
      }
      assert(analyzedSizes.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      assert(analyzedSizes.head == tableStats.get.sizeInBytes)

      // Optimized plan: the relation's size matches its catalog stats and is below the full size.
      val relations = df.queryExecution.optimizedPlan.collect {
        case relation: LogicalRelation => relation
      }
      assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      val prunedRelation = relations.head
      val size2 = prunedRelation.stats.sizeInBytes
      assert(size2 == prunedRelation.catalogTable.get.stats.get.sizeInBytes)
      assert(size2 < tableStats.get.sizeInBytes)
    }
  }

  test("SPARK-26576 Broadcast hint not applied to partitioned table") {
    withTable("tbl") {
      // A threshold of -1 is set so that only the explicit hint can request a broadcast.
      withSQLConf(SQLConf.AUTO_BROADCASTJOIN_THRESHOLD.key -> "-1") {
        spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
        val df = spark.table("tbl")
        val qe = df.join(broadcast(df), "p").queryExecution
        qe.optimizedPlan.collect { case _: ResolvedHint => } should have size 1
        qe.sparkPlan.collect { case j: BroadcastHashJoinExec => j } should have size 1
      }
    }
  }
}
Example 18
Source File: HiveUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import java.net.URL

import org.apache.hadoop.hive.conf.HiveConf.ConfVars

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils}
import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader}

/**
 * Tests for `HiveUtils`: temporary configuration contents, metadata client creation under a
 * child-first class loader, and string rendering of a UDT value.
 */
class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("newTemporaryConfiguration overwrites listener configurations") {
    Seq(true, false).foreach { useInMemoryDerby =>
      val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
      assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "")
      assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "")
      assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "")
    }
  }

  test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") {
    val key = "spark.hadoop.foo"
    // System properties are JVM-wide: remember the previous value so it can be restored and the
    // setting does not leak into tests that run after this one.
    val previous = sys.props.put(key, "bar")
    try {
      Seq(true, false).foreach { useInMemoryDerby =>
        val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
        assert(!hiveConf.contains("spark.hadoop.foo"))
        assert(hiveConf("foo") === "bar")
      }
    } finally {
      previous match {
        case Some(value) => sys.props.put(key, value)
        case None => sys.props.remove(key)
      }
    }
  }

  test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") {
    val conf = new SparkConf
    val contextClassLoader = Thread.currentThread().getContextClassLoader
    val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader)
    try {
      Thread.currentThread().setContextClassLoader(loader)
      // The test passes if creating the client does not throw.
      HiveUtils.newClientForMetadata(
        conf,
        SparkHadoopUtil.newConfiguration(conf),
        HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true))
    } finally {
      // Always put the original context class loader back on the current thread.
      Thread.currentThread().setContextClassLoader(contextClassLoader)
    }
  }

  test("toHiveString correctly handles UDTs") {
    val point = new ExamplePoint(50.0, 50.0)
    val tpe = new ExamplePointUDT()
    assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)")
  }
}
Example 19
Source File: ListTablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for a temporary view, a table in the
 * current database and a table in a separate database.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    sessionState.catalog.createTempView(
      "ListTablesSuiteTable", df.logicalPlan, overrideIfExists = true)
    sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)")
    sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB")
    sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)")
  }

  override def afterAll(): Unit = {
    try {
      // Drop everything created in beforeAll, tolerating objects that are already gone.
      sessionState.catalog.dropTable(
        TableIdentifier("ListTablesSuiteTable"), ignoreIfNotExists = true, purge = false)
      sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable")
      sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable")
      sql("DROP DATABASE IF EXISTS ListTablesSuiteDB")
    } finally {
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    Seq(tables(), sql("SHOW TABLes")).foreach { allTables =>
      // We are using default DB.
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      checkAnswer(tempView, Row("", "listtablessuitetable", true))

      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      checkAnswer(hiveTable, Row("default", "hivelisttablessuitetable", false))

      val otherDbTable = allTables.filter("tableName = 'hiveindblisttablessuitetable'")
      assert(otherDbTable.count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    listings.foreach { allTables =>
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      checkAnswer(tempView, Row("", "listtablessuitetable", true))

      val defaultDbTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      assert(defaultDbTable.count() === 0)

      val inDbTable = allTables.filter("tableName = 'hiveindblisttablessuitetable'")
      checkAnswer(inDbTable, Row("listtablessuitedb", "hiveindblisttablessuitetable", false))
    }
  }
}
Example 20
Source File: HiveVariableSubstitutionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Covers `SET hivevar:...` handling and `${...}` substitution inside SQL text.
 */
class HiveVariableSubstitutionSuite extends QueryTest with TestHiveSingleton {

  test("SET hivevar with prefix") {
    spark.sql("SET hivevar:county=gram")
    val stored = spark.conf.getOption("county")
    assert(stored === Some("gram"))
  }

  test("SET hivevar with dotted name") {
    spark.sql("SET hivevar:eloquent.mosquito.alphabet=zip")
    val stored = spark.conf.getOption("eloquent.mosquito.alphabet")
    assert(stored === Some("zip"))
  }

  test("hivevar substitution") {
    spark.conf.set("pond", "bus")
    val substituted = spark.sql("SELECT '${hivevar:pond}'")
    checkAnswer(substituted, Seq(Row("bus")))
  }

  test("variable substitution without a prefix") {
    spark.sql("SET hivevar:flask=plaid")
    val substituted = spark.sql("SELECT '${flask}'")
    checkAnswer(substituted, Seq(Row("plaid")))
  }

  test("variable substitution precedence") {
    spark.conf.set("turn.aloof", "questionable")
    spark.sql("SET hivevar:turn.aloof=dime")
    // hivevar clobbers the conf setting
    val substituted = spark.sql("SELECT '${turn.aloof}'")
    checkAnswer(substituted, Seq(Row("dime")))
  }
}
Example 21
Source File: FileFormatWriterSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Write-path checks: the number of part files produced for a mostly-empty repartitioned
 * query, and column selection when saving a query result as a table.
 */
class FileFormatWriterSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("empty file should be skipped while write to file") {
    withTempPath { path =>
      spark.range(100).repartition(10).where("id = 50").write.parquet(path.toString)
      // Count only regular files whose name starts with neither "." nor "_".
      val partFiles = path.listFiles().filter { candidate =>
        val name = candidate.getName
        candidate.isFile && !name.startsWith(".") && !name.startsWith("_")
      }
      assert(partFiles.length === 2)
    }
  }

  test("SPARK-22252: FileFormatWriter should respect the input query schema") {
    withTable("t1", "t2", "t3", "t4") {
      val twoColumns = spark.range(1).select($"id".as("col1"), $"id".as("col2"))
      twoColumns.write.saveAsTable("t1")
      spark.sql("select COL1, COL2 from t1").write.saveAsTable("t2")
      checkAnswer(spark.table("t2"), Row(0, 0))

      // Test picking part of the columns when writing.
      val threeColumns = spark.range(1).select($"id", $"id".as("col1"), $"id".as("col2"))
      threeColumns.write.saveAsTable("t3")
      spark.sql("select COL1, COL2 from t3").write.saveAsTable("t4")
      checkAnswer(spark.table("t4"), Row(0, 0))
    }
  }
}
Example 22
Source File: HadoopFsRelationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Checks the size statistic reported for a Parquet relation and the join operator chosen
 * under different `spark.sql.sources.fileCompressionFactor` settings.
 */
class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)
      // ignore hidden files
      val visibleOnly = new FilenameFilter {
        override def accept(dir: File, name: String): Boolean =
          !(name.startsWith(".") || name.startsWith("_"))
      }
      val totalSize = dir.listFiles(visibleOnly).map(_.length()).sum
      val df = spark.read.parquet(dir.toString)
      assert(df.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize))
    }
  }

  test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") {
    import testImplicits._
    Seq(1.0, 0.5).foreach { compressionFactor =>
      withSQLConf("spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString,
        "spark.sql.autoBroadcastJoinThreshold" -> "400") {
        withTempPath { workDir =>
          // the file size is 740 bytes
          val workDirPath = workDir.getAbsolutePath
          val data1 = Seq(100, 200, 300, 400).toDF("count")
          data1.write.parquet(workDirPath + "/data1")
          val df1FromFile = spark.read.parquet(workDirPath + "/data1")
          val data2 = Seq(100, 200, 300, 400).toDF("count")
          data2.write.parquet(workDirPath + "/data2")
          val df2FromFile = spark.read.parquet(workDirPath + "/data2")

          val joinedDF = df1FromFile.join(df2FromFile, Seq("count"))
          val physicalPlan = joinedDF.queryExecution.executedPlan
          val broadcastJoins = physicalPlan.collect { case join: BroadcastHashJoinExec => join }
          val sortMergeJoins = physicalPlan.collect { case join: SortMergeJoinExec => join }

          if (compressionFactor == 0.5) {
            assert(broadcastJoins.nonEmpty)
            assert(sortMergeJoins.isEmpty)
          } else {
            // compressionFactor is 1.0
            assert(broadcastJoins.isEmpty)
            assert(sortMergeJoins.nonEmpty)
          }
        }
      }
    }
  }
}
Example 23
Source File: ParquetFileFormatSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.SparkException
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Exercises `ParquetFileFormat.readParquetFootersInParallel` over a mix of two Parquet
 * outputs and one JSON output, with and without `ignoreCorruptFiles`.
 */
class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext {

  test("read parquet footers in parallel") {
    def testReadFooters(ignoreCorruptFiles: Boolean): Unit = {
      withTempDir { dir =>
        val fs = FileSystem.get(spark.sessionState.newHadoopConf())
        val basePath = dir.getCanonicalPath

        val first = new Path(basePath, "first")
        val second = new Path(basePath, "second")
        val third = new Path(basePath, "third")

        // Two Parquet outputs, plus one JSON output written under the third path.
        spark.range(1).toDF("a").coalesce(1).write.parquet(first.toString)
        spark.range(1, 2).toDF("a").coalesce(1).write.parquet(second.toString)
        spark.range(2, 3).toDF("a").coalesce(1).write.json(third.toString)

        val fileStatuses = Seq(first, second, third).flatMap(p => fs.listStatus(p).toSeq)

        val footers = ParquetFileFormat.readParquetFootersInParallel(
          spark.sessionState.newHadoopConf(), fileStatuses, ignoreCorruptFiles)

        assert(footers.size == 2)
      }
    }

    testReadFooters(true)

    val failure = intercept[SparkException] {
      testReadFooters(false)
    }
    val exception = failure.getCause
    assert(exception.getMessage().contains("Could not read footer for file"))
  }
}
Example 24
Source File: DataSourceScanExecRedactionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Checks that plan strings are redacted according to the `spark.redaction.string.regex`
 * SparkConf entry and the SQLConf string-redaction pattern.
 */
class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext {

  override protected def sparkConf: SparkConf = super.sparkConf
    .set("spark.redaction.string.regex", "file:/[\\w_]+")

  test("treeString is redacted") {
    withTempDir { dir =>
      val basePath = dir.getCanonicalPath
      spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
      val df = spark.read.parquet(basePath)

      val scan = df.queryExecution.sparkPlan.collectFirst { case s: FileSourceScanExec => s }.get
      val rootPath = scan.relation.location.rootPaths.head
      assert(rootPath.toString.contains(dir.toURI.getPath.stripSuffix("/")))

      // Each rendering is produced lazily, immediately before it is checked.
      val renderings: Seq[() => String] = Seq(
        () => df.queryExecution.sparkPlan.treeString(verbose = true),
        () => df.queryExecution.executedPlan.treeString(verbose = true),
        () => df.queryExecution.toString,
        () => df.queryExecution.simpleString)

      renderings.foreach(render => assert(!render().contains(rootPath.getName)))

      val replacement = "*********"
      renderings.foreach(render => assert(render().contains(replacement)))
    }
  }

  /** True when any of the three string forms of `queryExecution` contains `msg`. */
  private def isIncluded(queryExecution: QueryExecution, msg: String): Boolean = {
    val forms: Seq[() => String] = Seq(
      () => queryExecution.toString,
      () => queryExecution.simpleString,
      () => queryExecution.stringWithStats)
    forms.exists(form => form().contains(msg))
  }

  test("explain is redacted using SQLConf") {
    withTempDir { dir =>
      val basePath = dir.getCanonicalPath
      spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
      val df = spark.read.parquet(basePath)
      val replacement = "*********"

      // Respect SparkConf and replace file:/
      assert(isIncluded(df.queryExecution, replacement))
      assert(isIncluded(df.queryExecution, "FileScan"))
      assert(!isIncluded(df.queryExecution, "file:/"))

      withSQLConf(SQLConf.SQL_STRING_REDACTION_PATTERN.key -> "(?i)FileScan") {
        // Respect SQLConf and replace FileScan
        assert(isIncluded(df.queryExecution, replacement))
        assert(!isIncluded(df.queryExecution, "FileScan"))
        assert(isIncluded(df.queryExecution, "file:/"))
      }
    }
  }
}
Example 25
Source File: SparkPlanSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.SparkEnv
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

/**
 * Regression checks on physical plans: executing a canonicalized plan, canonicalizing a
 * scan node after a serialization round trip, and the metadata exposed by `SparkPlanInfo`.
 */
class SparkPlanSuite extends QueryTest with SharedSQLContext {

  test("SPARK-21619 execution of a canonicalized plan should fail") {
    val plan = spark.range(10).queryExecution.executedPlan.canonicalized

    // Every execution entry point is expected to reject the canonicalized plan.
    intercept[IllegalStateException](plan.execute())
    intercept[IllegalStateException](plan.executeCollect())
    intercept[IllegalStateException](plan.executeCollectPublic())
    intercept[IllegalStateException](plan.executeToIterator())
    intercept[IllegalStateException](plan.executeBroadcast())
    intercept[IllegalStateException](plan.executeTake(1))
  }

  test("SPARK-23731 plans should be canonicalizable after being (de)serialized") {
    withTempPath { path =>
      val location = path.getAbsolutePath
      spark.range(1).write.parquet(location)
      val df = spark.read.parquet(location)

      val fileSourceScanExec =
        df.queryExecution.sparkPlan.collectFirst { case p: FileSourceScanExec => p }.get

      val serializer = SparkEnv.get.serializer.newInstance()
      val bytes = serializer.serialize(fileSourceScanExec)
      val readback = serializer.deserialize[FileSourceScanExec](bytes)

      try {
        readback.canonicalized
      } catch {
        case e: Throwable => fail("FileSourceScanExec was not canonicalizable", e)
      }
    }
  }

  test("SPARK-25357 SparkPlanInfo of FileScan contains nonEmpty metadata") {
    withTempPath { path =>
      val location = path.getAbsolutePath
      spark.range(5).write.parquet(location)
      val f = spark.read.parquet(location)
      val planInfo = SparkPlanInfo.fromSparkPlan(f.queryExecution.sparkPlan)
      assert(planInfo.metadata.nonEmpty)
    }
  }
}
Example 26
Source File: SameResultSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, Project}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.IntegerType

/**
 * Checks `sameResult` on physical and logical plans that differ only in filter order,
 * in being built twice, or in attribute-name case.
 */
class SameResultSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("FileSourceScanExec: different orders of data filters and partition filters") {
    withTempPath { path =>
      val tmpDir = path.getCanonicalPath
      val source = spark.range(10)
        .selectExpr("id as a", "id + 1 as b", "id + 2 as c", "id + 3 as d")
      source.write.partitionBy("a", "b").parquet(tmpDir)
      val df = spark.read.parquet(tmpDir)

      // partition filters: a > 1 AND b < 9
      // data filters: c > 1 AND d < 9
      val plan1 = getFileSourceScanExec(df.where("a > 1 AND b < 9 AND c > 1 AND d < 9"))
      val plan2 = getFileSourceScanExec(df.where("b < 9 AND a > 1 AND d < 9 AND c > 1"))
      assert(plan1.sameResult(plan2))
    }
  }

  /** Returns the first `FileSourceScanExec` in the physical (spark) plan of `df`. */
  private def getFileSourceScanExec(df: DataFrame): FileSourceScanExec =
    df.queryExecution.sparkPlan.collectFirst { case scan: FileSourceScanExec => scan }.get

  test("SPARK-20725: partial aggregate should behave correctly for sameResult") {
    val df1 = spark.range(10).agg(sum($"id"))
    val df2 = spark.range(10).agg(sum($"id"))
    val sumPlan1 = df1.queryExecution.executedPlan
    val sumPlan2 = df2.queryExecution.executedPlan
    assert(sumPlan1.sameResult(sumPlan2))

    val df3 = spark.range(10).agg(sumDistinct($"id"))
    val df4 = spark.range(10).agg(sumDistinct($"id"))
    val distinctPlan1 = df3.queryExecution.executedPlan
    val distinctPlan2 = df4.queryExecution.executedPlan
    assert(distinctPlan1.sameResult(distinctPlan2))
  }

  test("Canonicalized result is case-insensitive") {
    val upperA = AttributeReference("A", IntegerType)()
    val upperB = AttributeReference("B", IntegerType)()
    val planUppercase = Project(Seq(upperA), LocalRelation(upperA, upperB))

    val lowerA = AttributeReference("a", IntegerType)()
    val lowerB = AttributeReference("b", IntegerType)()
    val planLowercase = Project(Seq(lowerA), LocalRelation(lowerA, lowerB))

    assert(planUppercase.sameResult(planLowercase))
  }
}
Example 27
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.TimeLimits

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession}
import org.apache.spark.sql.functions.{array, col, count, lit}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.unsafe.Platform
import org.apache.spark.util.ResetSystemProperties

// Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit.
class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite
  with Matchers
  with BeforeAndAfterEach
  with ResetSystemProperties {

  test("Generated code on driver should not embed platform-specific constant") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched
    // settings of UseCompressedOops JVM option.
    val mainClass = WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$")
    val confSettings = Seq(
      "spark.ui.enabled=false",
      "spark.master.rest.enabled=false",
      "spark.driver.extraJavaOptions=-XX:-UseCompressedOops",
      "spark.executor.extraJavaOptions=-XX:+UseCompressedOops")
    val argsForSparkSubmit =
      Seq(
        "--class", mainClass,
        "--master", "local-cluster[1,1,1024]",
        "--driver-memory", "1g") ++
        confSettings.flatMap(setting => Seq("--conf", setting)) :+
        unusedJar.toString
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

/** Entry point launched through spark-submit by the suite above. */
object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging {

  var spark: SparkSession = _

  def main(args: Array[String]): Unit = {
    TestUtils.configTestLog4j("INFO")

    spark = SparkSession.builder().getOrCreate()

    // Make sure the test is run where the driver and the executors uses different object layouts
    val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET
    val executorArrayHeaderSize =
      spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt
    assert(driverArrayHeaderSize > executorArrayHeaderSize)

    val df = spark.range(71773)
      .select((col("id") % lit(10)).cast(IntegerType) as "v")
      .groupBy(array(col("v")))
      .agg(count(col("*")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)

    // 71773 ids modulo 10: remainders 0, 1 and 2 occur 7178 times, the other seven 7177 times.
    val expectedAnswer: Seq[Row] =
      (0 to 9).map(v => Row(Array(v), if (v < 3) 7178 else 7177)).toList
    val result = df.collect
    QueryTest.sameRows(result.toSeq, expectedAnswer).foreach(errMsg => fail(errMsg))
  }
}
Example 28
Source File: OapRuntimeSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.oap

import java.util.concurrent.{Executors, ExecutorService, TimeUnit}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.oap.SharedOapLocalClusterContext

/**
 * Checks that `OapRuntime.getOrCreate` hands the same runtime to concurrent callers and that
 * the runtime's session is the suite's `spark` session.
 */
class OapRuntimeSuite extends QueryTest with SharedOapLocalClusterContext {

  test("OapRuntime is created once") {
    val oapruntime = new Array[OapRuntime](2)
    val threadPool: ExecutorService = Executors.newFixedThreadPool(2)
    try {
      for (i <- 0 to 1) {
        threadPool.execute(new Runnable {
          override def run(): Unit = {
            oapruntime(i) = OapRuntime.getOrCreate
          }
        })
      }
      // awaitTermination only returns early once a shutdown has been requested, so request it
      // first. Without that the call waits out the whole timeout and reports false.
      threadPool.shutdown()
      assert(
        threadPool.awaitTermination(10, TimeUnit.SECONDS),
        "tasks calling OapRuntime.getOrCreate did not finish in time")
    } finally {
      // No-op after a clean termination; stops stragglers if the wait above failed.
      threadPool.shutdownNow()
    }
    // Guard against a vacuous pass where both slots are still null.
    assert(oapruntime.forall(_ != null))
    assert(oapruntime(0) == oapruntime(1))
  }

  test("get sparkSession from OapRuntime") {
    assert(OapRuntime.getOrCreate.sparkSession == spark)
  }
}
Example 29
Source File: OapBitmapWrappedFiberCacheSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.utils

import java.io.{ByteArrayOutputStream, DataOutputStream, FileOutputStream}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FSDataInputStream, Path}
import org.roaringbitmap.RoaringBitmap

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.filecache.{BitmapFiberId, FiberCache}
import org.apache.spark.sql.oap.OapRuntime
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

// Below are used to test the functionality of OapBitmapWrappedFiberCache class.
class OapBitmapWrappedFiberCacheSuite extends QueryTest with SharedOapContext {

  /** Loads `size` bytes starting at `offset` of `fin` through the fiber cache manager. */
  private def loadRbFile(fin: FSDataInputStream, offset: Long, size: Int): FiberCache =
    OapRuntime.getOrCreate.fiberCacheManager.toIndexFiberCache(fin, offset, size)

  test("test the functionality of OapBitmapWrappedFiberCache class") {
    val CHUNK_SIZE = 1 << 16
    val dataForRunChunk = (1 to 9).toSeq
    val dataForArrayChunk = Seq(1, 3, 5, 7, 9)
    val dataForBitmapChunk = (1 to 10000).filter(_ % 2 == 1)
    val dataCombination = dataForBitmapChunk ++ dataForArrayChunk ++ dataForRunChunk
    val dataArray =
      Array(dataForRunChunk, dataForArrayChunk, dataForBitmapChunk, dataCombination)

    dataArray.foreach { dataIdx =>
      val dir = Utils.createTempDir()
      try {
        val rb = new RoaringBitmap()
        dataIdx.foreach(rb.add)
        rb.runOptimize()

        // Resolve the file inside the temp directory. Plain string concatenation without a
        // separator put it next to the directory, where it was never cleaned up.
        val rbFile = new java.io.File(dir, "rb.bin").getAbsolutePath
        val rbFos = new FileOutputStream(rbFile)
        try {
          val rbBos = new ByteArrayOutputStream()
          val rbDos = new DataOutputStream(rbBos)
          rb.serialize(rbDos)
          rbDos.flush()
          rbBos.writeTo(rbFos)
        } finally {
          rbFos.close()
        }

        val rbPath = new Path(rbFile)
        val conf = new Configuration()
        val fs = rbPath.getFileSystem(conf)
        val rbFileSize = fs.getFileStatus(rbPath).getLen
        val fin = fs.open(rbPath)
        try {
          val rbFiber = BitmapFiberId(
            () => loadRbFile(fin, 0L, rbFileSize.toInt), rbPath.toString, 0, 0)
          val rbWfc = new OapBitmapWrappedFiberCache(
            OapRuntime.getOrCreate.fiberCacheManager.get(rbFiber))
          try {
            rbWfc.init
            val chunkLength = rbWfc.getTotalChunkLength
            val length = dataIdx.size / CHUNK_SIZE
            assert(chunkLength == (length + 1))
            val chunkKeys = rbWfc.getChunkKeys
            assert(chunkKeys(0).toInt == 0)
            rbWfc.setOffset(0)
            val chunk = rbWfc.getIteratorForChunk(0)
            // The binders get their own name so they no longer shadow the outer rbWfc.
            chunk match {
              case RunChunkIterator(cache) => assert(chunk == RunChunkIterator(cache))
              case ArrayChunkIterator(cache, 0) => assert(chunk == ArrayChunkIterator(cache, 0))
              case BitmapChunkIterator(cache) => assert(chunk == BitmapChunkIterator(cache))
              case _ => throw new OapException("unexpected chunk in OapBitmapWrappedFiberCache.")
            }
          } finally {
            rbWfc.release
          }
        } finally {
          fin.close()
        }
      } finally {
        // The directory now contains rb.bin, so a plain File.delete would not remove it.
        Utils.deleteRecursively(dir)
      }
    }
  }
}
Example 30
Source File: BitmapAnalyzeStatisticsSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.index

import org.apache.hadoop.fs.RawLocalFileSystem
import org.scalatest.BeforeAndAfterEach

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

/** A `SharedOapContext` whose conf sets `spark.hadoop.fs.file.impl` to `RawLocalFileSystem`. */
trait SharedOapContextWithRawLocalFileSystem extends SharedOapContext {
  oapSparkConf.set("spark.hadoop.fs.file.impl", classOf[RawLocalFileSystem].getName)
}

/**
 * Runs an equality query against an OAP table that has a BITMAP index. Each test gets a
 * fresh `oap_test` temporary view backed by a new temp directory.
 */
class BitmapAnalyzeStatisticsSuite extends QueryTest with SharedOapContextWithRawLocalFileSystem
  with BeforeAndAfterEach {
  import testImplicits._

  override def beforeEach(): Unit = {
    val path = Utils.createTempDir().getAbsolutePath
    sql(s"""CREATE TEMPORARY VIEW oap_test (a INT, b STRING)
           | USING oap
           | OPTIONS (path '$path')""".stripMargin)
  }

  override def afterEach(): Unit = {
    sqlContext.dropTempTable("oap_test")
  }

  test("Bitmap index typical equal test") {
    val data: Seq[(Int, String)] = (1 to 200).map(i => (i, s"this is test $i"))
    data.toDF("key", "value").createOrReplaceTempView("t")
    sql("insert overwrite table oap_test select * from t")
    sql("create oindex idxa on oap_test (a) USING BITMAP")

    val matched = sql(s"SELECT * FROM oap_test WHERE a = 20 OR a = 21")
    checkAnswer(matched, Seq(Row(20, "this is test 20"), Row(21, "this is test 21")))

    sql("drop oindex idxa on oap_test")
  }
}
Example 31
Source File: ClusteredFilterSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.cluster

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.oap.SharedOapLocalClusterContext
import org.apache.spark.util.Utils

/**
 * Runs range and equality filters against an indexed OAP table under the local-cluster
 * test context. Each test gets a fresh `oap_test` temporary view.
 */
class ClusteredFilterSuite
  extends QueryTest with SharedOapLocalClusterContext with BeforeAndAfterEach {

  import testImplicits._

  private var currentPath: String = _

  override def beforeEach(): Unit = {
    currentPath = Utils.createTempDir().getAbsolutePath
    sql(s"""CREATE TEMPORARY VIEW oap_test (a INT, b STRING)
           | USING oap
           | OPTIONS (path '$currentPath')""".stripMargin)
  }

  override def afterEach(): Unit = {
    sqlContext.dropTempTable("oap_test")
  }

  test("filtering") {
    val data: Seq[(Int, String)] = (1 to 300).map(i => (i, s"this is test $i"))
    data.toDF("key", "value").createOrReplaceTempView("t")
    sql("insert overwrite table oap_test select * from t")
    sql("create oindex index1 on oap_test (a)")

    checkAnswer(
      sql("SELECT * FROM oap_test WHERE a = 1"),
      Seq(Row(1, "this is test 1")))

    checkAnswer(
      sql("SELECT * FROM oap_test WHERE a > 1 AND a <= 3"),
      Seq(Row(2, "this is test 2"), Row(3, "this is test 3")))

    checkAnswer(
      sql("SELECT * FROM oap_test WHERE a <= 2"),
      Seq(Row(1, "this is test 1"), Row(2, "this is test 2")))

    checkAnswer(
      sql("SELECT * FROM oap_test WHERE a >= 300"),
      Seq(Row(300, "this is test 300")))

    sql("drop oindex index1 on oap_test")
  }
}
Example 32
Source File: DataFileSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.datasources.OapException
import org.apache.spark.sql.execution.datasources.oap.OapFileFormat
import org.apache.spark.sql.execution.datasources.parquet.SpecificParquetRecordReaderBase
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

/**
 * Checks `DataFile.apply` for the OAP, Parquet and ORC data-file classes, the constructor
 * cache count it exposes, and `equals`/`hashCode` of the instances it returns.
 */
class DataFileSuite extends QueryTest with SharedOapContext {

  // Kept as an override that does nothing. The previous body only created a temp directory
  // that no test ever used.
  override def beforeEach(): Unit = {}

  // Override afterEach because OapDataFile will open a InputStream for OapDataFileMeta
  // but no method to manual close it and we can not to check open streams.
  override def afterEach(): Unit = {}

  test("apply and cache") {
    val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString))
    val schema = new StructType()
    val config = new Configuration()

    withTempPath { dir =>
      val df = spark.createDataFrame(data)
      df.repartition(1).write.format("oap").save(dir.getAbsolutePath)
      val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0)
      val datafile = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config)
      assert(datafile.path == file)
      assert(datafile.schema == schema)
      assert(datafile.configuration == config)
    }

    withTempPath { dir =>
      val df = spark.createDataFrame(data)
      df.repartition(1).write.parquet(dir.getAbsolutePath)
      val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0)
      val datafile = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config)
      assert(datafile.path == file)
      assert(datafile.schema == schema)
      assert(datafile.configuration == config)
    }

    withTempPath { dir =>
      val df = spark.createDataFrame(data)
      df.repartition(1).write.format("orc").save(dir.getAbsolutePath)
      val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0)
      val datafile = DataFile(file, schema, OapFileFormat.ORC_DATA_FILE_CLASSNAME, config)
      assert(datafile.path == file)
      assert(datafile.schema == schema)
      assert(datafile.configuration == config)
    }

    // DataFile object is global. After OrcDataFile is added, then need to change to 3 if
    // we run the whole tests.
    assert(DataFile.cachedConstructorCount == 3)

    intercept[OapException] {
      DataFile("nofile", schema, "NotExistClass", config)
      // NOTE(review): this assertion only runs if the call above does NOT throw, in which
      // case intercept fails anyway. Its expected value (2) also disagrees with the count
      // of 3 asserted just before. Confirm the intended post-condition and move the check
      // outside the intercept block.
      assert(DataFile.cachedConstructorCount == 2)
    }
  }

  test("DataFile equals") {
    val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString))
    val schema = new StructType()
    val config = new Configuration()

    withTempPath { dir =>
      val df = spark.createDataFrame(data)
      df.repartition(1).write.parquet(dir.getAbsolutePath)
      val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0)
      val datafile1 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config)
      val datafile2 = DataFile(file, schema, OapFileFormat.PARQUET_DATA_FILE_CLASSNAME, config)
      assert(datafile1.equals(datafile2))
      assert(datafile1.hashCode() == datafile2.hashCode())
    }

    withTempPath { dir =>
      val df = spark.createDataFrame(data)
      df.repartition(1).write.format("oap").save(dir.getAbsolutePath)
      val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0)
      val datafile1 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config)
      val datafile2 = DataFile(file, schema, OapFileFormat.OAP_DATA_FILE_V1_CLASSNAME, config)
      assert(datafile1.equals(datafile2))
      assert(datafile1.hashCode() == datafile2.hashCode())
    }
  }
}
Example 33
Source File: FileSkipSuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.oap.io

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.oap.SharedOapContext
import org.apache.spark.util.Utils

/**
 * Row-count checks for simple predicates on an OAP table holding keys 1 to 300, inserted in
 * shuffled order. Each test gets a fresh `oap_test_1` temporary view.
 */
class FileSkipSuite extends QueryTest with SharedOapContext with BeforeAndAfterEach {
  import testImplicits._

  override def beforeEach(): Unit = {
    val path1 = Utils.createTempDir().getAbsolutePath
    sql(s"""CREATE TEMPORARY VIEW oap_test_1 (a INT, b STRING)
           | USING oap
           | OPTIONS (path '$path1')""".stripMargin)
  }

  override def afterEach(): Unit = {
    sqlContext.dropTempTable("oap_test_1")
  }

  /**
   * Registers keys 1 to 300 (shuffled) as temp view `t`, pairing each key with
   * `valueOf(key)`, then overwrites `oap_test_1` with the contents of `t`.
   */
  private def overwriteWithShuffledKeys(valueOf: Int => String): Unit = {
    val data: Seq[(Int, String)] =
      scala.util.Random.shuffle(1 to 300).map(i => (i, valueOf(i))).toSeq
    data.toDF("key", "value").createOrReplaceTempView("t")
    sql("insert overwrite table oap_test_1 select * from t")
  }

  /** Number of rows returned by `query`. */
  private def rowCount(query: String): Long = sql(query).count

  test("skip all file (is not null)") {
    overwriteWithShuffledKeys(_ => null)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE b is not null") == 0)
  }

  test("skip all file (equal)") {
    overwriteWithShuffledKeys(i => s"this is test $i")
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a = 1") == 1)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a = 500") == 0)
  }

  test("skip all file (lt)") {
    overwriteWithShuffledKeys(i => s"this is test $i")
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a < 1") == 0)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a < 2") == 1)
  }

  test("skip all file (lteq)") {
    overwriteWithShuffledKeys(i => s"this is test $i")
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a <= 0") == 0)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a <= 1") == 1)
  }

  test("skip all file (gt)") {
    overwriteWithShuffledKeys(i => s"this is test $i")
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a > 300") == 0)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a > 2") == 298)
  }

  test("skip all file (gteq)") {
    overwriteWithShuffledKeys(i => s"this is test $i")
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a >= 300") == 1)
    assert(rowCount("SELECT * FROM oap_test_1 WHERE a >= 500") == 0)
  }
}
Example 34
Source File: HiveParquetSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Record whose two field names differ in case; used by the attribute-name test below. */
case class Cases(lower: String, UPPER: String)

/**
 * Queries Parquet-backed temporary tables through the Hive test session: attribute-name
 * case handling, projection and filtering, and repeated INSERT OVERWRITE.
 */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val records = (1 to 4).map(i => Cases(i.toString, i.toString))
    withParquetTable(records, "cases") {
      val expected = (1 to 4).map(i => Row(i.toString))
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  test("SELECT on Parquet table") {
    val data = (1 to 4).map(i => (i, s"val_$i"))
    withParquetTable(data, "t") {
      val expected = data.map(Row.fromTuple)
      checkAnswer(sql("SELECT * FROM t"), expected)
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val data = (1 to 4).map(i => (i % 2 == 0, i, s"val_$i"))
    withParquetTable(data, "t") {
      val evenRows = sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true")
      checkAnswer(evenRows, Seq(Row(true, "val_2"), Row(true, "val_4")))
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      spark.read.parquet(location).createOrReplaceTempView("p")
      withTempView("p") {
        val expected = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(sql("SELECT * FROM src ORDER BY key"), expected)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    // Don't run with vectorized: currently relies on UnsafeRow.
    withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t", false) {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        spark.read.parquet(location).createOrReplaceTempView("p")
        withTempView("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach(_ => sql("INSERT OVERWRITE TABLE p SELECT * FROM t"))
          val expected = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(sql("SELECT * FROM p"), expected)
        }
      }
    }
  }
}
Example 35
Source File: HiveDataFrameJoinSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Self-join checks where the join condition refers to a column under two different casings. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import spark.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    val selfJoined = df.join(df, df("key") === df("Key"))
    checkAnswer(selfJoined, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    val onlyTwo = df.filter($"value" === "2")
    val filteredJoin = df.join(onlyTwo, df("key") === df("Key"))
    checkAnswer(filteredJoin, Seq(Row(2, "2", 2, "2")))
  }
}
Example 36
Source File: MetastoreRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Tests copying/serialising a `MetastoreRelation` and a CTAS statement on a Hive serde table. */
class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("makeCopy and toJSON should work") {
    val singleIntColumn = StructType(StructField("a", IntegerType, true) :: Nil)
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = singleIntColumn)
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Seq(Row(0)))

        val barMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        val provider = barMetadata.provider
        assert(provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}
Example 37
Source File: HiveOperatorQueryableSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Checks that the output of a `desc` command can be registered as a view and queried. */
class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton {
  import spark._

  test("SPARK-5324 query result of describe command") {
    hiveContext.loadTestTable("src")

    // Creates a temporary view with the output of a describe command
    sql("desc src").createOrReplaceTempView("mydesc")

    // Describing the view yields the columns of the describe output itself.
    val describeColumns = Seq(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeColumns)

    // Selecting from the view yields the columns of `src`, with or without an explicit projection.
    val srcColumns = Seq(
      Row("key", "int", null),
      Row("value", "string", null))
    Seq(
      "select * from mydesc",
      "select col_name, data_type, comment from mydesc"
    ).foreach { query =>
      checkAnswer(sql(query), srcColumns)
    }
  }
}
Example 38
Source File: HivePlanTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level checks: compares optimized plans and counts Window operators in an analyzed plan. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")

    def optimizedPlanOf(query: String) = sql(query).queryExecution.optimizedPlan

    val actual = optimizedPlanOf("SELECT cos(null) AS c FROM t")
    val expected = optimizedPlanOf("SELECT cast(null as double) AS c FROM t")
    comparePlans(actual, expected)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.partitionBy($"grp").orderBy($"val")

    // Two aggregates over the same partitioning/ordering, differing only in frame type.
    val overRows = sum($"val").over(window.rowsBetween(-1, 1))
    val overRange = sum($"val").over(window.rangeBetween(-1, 1))

    val analyzed = df.select($"id", overRows, overRange).queryExecution.analyzed
    val windowOperators = analyzed.collect { case w: logical.Window => w }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 39
Source File: PruneFileSourcePartitionsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

/** Applies only the `PruneFileSourcePartitions` rule to a hand-built plan and inspects the result. */
class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  /** Rule executor containing a single one-shot batch with the rule under test. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        val createTable =
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin
        sql(createTable)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        // The data schema is the table schema minus the partition columns.
        val partitionNames = tableMeta.partitionColumnNames
        val dataSchema = StructType(
          tableMeta.schema.filterNot(field => partitionNames.contains(field.name)))

        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)
        val logicalRelation = LogicalRelation(relation, catalogTable = Some(tableMeta))

        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze
        val pruned = Optimize.execute(query)
        assert(pruned.missingInput.isEmpty)
      }
    }
  }
}
Example 40
Source File: HiveUtilsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.hadoop.hive.conf.HiveConf.ConfVars

import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.QueryTest

/** Checks the listener entries of the configuration built by `HiveUtils.newTemporaryConfiguration`. */
class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("newTemporaryConfiguration overwrites listener configurations") {
    // Listener settings that are expected to be blank in the temporary configuration.
    val listenerVars = Seq(
      ConfVars.METASTORE_PRE_EVENT_LISTENERS,
      ConfVars.METASTORE_EVENT_LISTENERS,
      ConfVars.METASTORE_END_FUNCTION_LISTENERS)

    for (useInMemoryDerby <- Seq(true, false)) {
      val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
      listenerVars.foreach { listenerVar =>
        assert(conf(listenerVar.varname) === "")
      }
    }
  }
}
Example 41
Source File: ListTablesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Lists tables through both `tables(...)` and `SHOW TABLES`, with a temporary view, a table in
 * the default database and a table in a separate database all present.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize((1 to 10).map(n => (n, s"str$n"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    sessionState.catalog.createTempView(
      "ListTablesSuiteTable", df.logicalPlan, overrideIfExists = true)
    Seq(
      "CREATE TABLE HiveListTablesSuiteTable (key int, value string)",
      "CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB",
      "CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)"
    ).foreach(statement => sql(statement))
  }

  override def afterAll(): Unit = {
    try {
      sessionState.catalog.dropTable(
        TableIdentifier("ListTablesSuiteTable"), ignoreIfNotExists = true, purge = false)
      Seq(
        "DROP TABLE IF EXISTS HiveListTablesSuiteTable",
        "DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable",
        "DROP DATABASE IF EXISTS ListTablesSuiteDB"
      ).foreach(statement => sql(statement))
    } finally {
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    val listings = Seq(tables(), sql("SHOW TABLes"))
    listings.foreach { listing =>
      // We are using default DB.
      checkAnswer(
        listing.filter("tableName = 'listtablessuitetable'"),
        Row("", "listtablessuitetable", true))
      checkAnswer(
        listing.filter("tableName = 'hivelisttablessuitetable'"),
        Row("default", "hivelisttablessuitetable", false))
      val tablesFromOtherDb = listing.filter("tableName = 'hiveindblisttablessuitetable'")
      assert(tablesFromOtherDb.count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    listings.foreach { listing =>
      checkAnswer(
        listing.filter("tableName = 'listtablessuitetable'"),
        Row("", "listtablessuitetable", true))
      val tablesFromDefaultDb = listing.filter("tableName = 'hivelisttablessuitetable'")
      assert(tablesFromDefaultDb.count() === 0)
      checkAnswer(
        listing.filter("tableName = 'hiveindblisttablessuitetable'"),
        Row("listtablessuitedb", "hiveindblisttablessuitetable", false))
    }
  }
}
Example 42
Source File: HiveVariableSubstitutionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Tests `SET hivevar:` statements and `${...}` substitution inside SQL text. */
class HiveVariableSubstitutionSuite extends QueryTest with TestHiveSingleton {

  test("SET hivevar with prefix") {
    spark.sql("SET hivevar:county=gram")
    val county = spark.conf.getOption("county")
    assert(county === Some("gram"))
  }

  test("SET hivevar with dotted name") {
    spark.sql("SET hivevar:eloquent.mosquito.alphabet=zip")
    val alphabet = spark.conf.getOption("eloquent.mosquito.alphabet")
    assert(alphabet === Some("zip"))
  }

  test("hivevar substitution") {
    spark.conf.set("pond", "bus")
    val substituted = spark.sql("SELECT '${hivevar:pond}'")
    checkAnswer(substituted, Seq(Row("bus")))
  }

  test("variable substitution without a prefix") {
    spark.sql("SET hivevar:flask=plaid")
    val substituted = spark.sql("SELECT '${flask}'")
    checkAnswer(substituted, Seq(Row("plaid")))
  }

  test("variable substitution precedence") {
    spark.conf.set("turn.aloof", "questionable")
    spark.sql("SET hivevar:turn.aloof=dime")
    // hivevar clobbers the conf setting
    val substituted = spark.sql("SELECT '${turn.aloof}'")
    checkAnswer(substituted, Seq(Row("dime")))
  }
}
Example 43
Source File: SQLBuilderTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Base class for tests that compare generated SQL text against an expected string.
 *
 * Only non-fatal throwables are converted into test failures, so fatal JVM errors propagate
 * unchanged. The triggering throwable is attached as the cause of the reported failure.
 */
abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {

  /**
   * Asserts that `e.sql` equals `expectedSQL`.
   *
   * @param e expression whose SQL rendering is checked
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      // NonFatal instead of Throwable: do not mask errors such as OutOfMemoryError.
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin,
          cause)
    }
  }

  /**
   * Converts `plan` to SQL with [[SQLBuilder]], asserts that the text equals `expectedSQL`, and
   * then checks that running the generated SQL gives the same answer as the plan itself.
   *
   * @param plan logical plan to convert
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch {
      case NonFatal(e) =>
        // Keep the conversion error as the cause so its stack trace is not lost.
        fail(
          s"""Cannot convert the following logical query plan to SQL:
             |
             |${plan.treeString}
           """.stripMargin,
          e)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin,
          cause)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  /**
   * Same check as the `LogicalPlan` overload, applied to the analyzed plan of `df`.
   *
   * @param df DataFrame whose analyzed plan is converted
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
}
Example 44
Source File: HadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

/** Compares the reported `sizeInBytes` statistic with the summed length of the written files. */
class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      val location = dir.toString
      spark.range(1000).write.parquet(location)

      // ignore hidden files
      val visibleOnly = new FilenameFilter {
        override def accept(parent: File, name: String): Boolean = !name.startsWith(".")
      }
      val totalSize = dir.listFiles(visibleOnly).map(file => file.length()).sum

      val statistics = spark.read.parquet(location).queryExecution.logical.statistics
      assert(statistics.sizeInBytes === BigInt(totalSize))
    }
  }
}
Example 45
Source File: PartitionedWriteSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.Utils

/**
 * Writes DataFrames partitioned by a column and reads them back.
 *
 * Every test obtains its output location from `withTempPath`, so the location is managed by that
 * helper and not by manual create/delete calls that are skipped when an assertion fails.
 */
class PartitionedWriteSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("write many partitions") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val df = spark.range(100).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      // The expected rows carry the data column first and the partition value second.
      checkAnswer(
        spark.read.load(path),
        (0 to 99).map(Row(1, _)))
    }
  }

  test("write many partitions with repeats") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val base = spark.range(100)
      val df = base.union(base).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      // Every id was written twice, so every row is expected twice.
      val once = (0 to 99).map(Row(1, _))
      checkAnswer(
        spark.read.load(path),
        once ++ once)
    }
  }

  test("partitioned columns should appear at the end of schema") {
    withTempPath { f =>
      val path = f.getAbsolutePath
      Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path)
      assert(spark.read.parquet(path).schema.map(_.name) == Seq("j", "i"))
    }
  }
}
Example 46
Source File: DeltaRetentionSuiteBase.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.DeltaOperations.Truncate
import org.apache.spark.sql.delta.actions.Metadata
import org.apache.spark.sql.delta.util.FileNames
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.util.IntervalUtils
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.unsafe.types.UTF8String

/** Shared helpers for retention tests; runs with expired-log cleanup disabled by default. */
trait DeltaRetentionSuiteBase extends QueryTest with SharedSparkSession {

  /** Operation recorded with the commits made by these tests. */
  protected val testOp = Truncate()

  protected override def sparkConf: SparkConf = {
    // Disable the log cleanup because it runs asynchronously and causes test flakiness
    val base = super.sparkConf
    base.set("spark.databricks.delta.properties.defaults.enableExpiredLogCleanup", "false")
  }

  /** Parses an interval string and converts it to milliseconds via `DeltaConfigs`. */
  protected def intervalStringToMillis(str: String): Long = {
    val interval = IntervalUtils.safeStringToInterval(UTF8String.fromString(str))
    DeltaConfigs.getMilliSeconds(interval)
  }

  /** Files directly under `dir` whose name ends with `.json`. */
  protected def getDeltaFiles(dir: File): Seq[File] =
    dir.listFiles().filter(file => file.getName.endsWith(".json"))

  /** Files directly under `dir` that `FileNames.isCheckpointFile` accepts. */
  protected def getCheckpointFiles(dir: File): Seq[File] =
    dir.listFiles().filter { file =>
      val canonical = new Path(file.getCanonicalPath)
      FileNames.isCheckpointFile(canonical)
    }

  /** Supplied by concrete suites. */
  protected def getLogFiles(dir: File): Seq[File]

  /** Starts a transaction on `log` and updates its metadata with a default `Metadata()`. */
  protected def startTxnWithManualLogCleanup(log: DeltaLog): OptimisticTransaction = {
    val txn = log.startTransaction()
    // This will pick up `spark.databricks.delta.properties.defaults.enableExpiredLogCleanup` to
    // disable log cleanup.
    txn.updateMetadata(Metadata())
    txn
  }

  test("startTxnWithManualLogCleanup") {
    withTempDir { tempDir =>
      val log = DeltaLog(spark, new Path(tempDir.getCanonicalPath))
      val txn = startTxnWithManualLogCleanup(log)
      txn.commit(Nil, testOp)
      assert(!log.enableExpiredLogCleanup)
    }
  }
}
Example 47
Source File: DeltaErrorsSuite.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import scala.sys.process.Process

import org.apache.hadoop.fs.Path
import org.scalatest.GivenWhenThen

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.{SharedSparkSession, SQLTestUtils}

/**
 * Checks that every `https://` URL found in the messages produced by `DeltaErrors` answers an
 * HTTP HEAD request (issued through `curl -I`) with a 200 status line.
 */
trait DeltaErrorsSuiteBase
    extends QueryTest
    with SharedSparkSession
    with GivenWhenThen
    with SQLTestUtils {

  /** Maximum number of times a URL is requested before it is reported as broken. */
  val MAX_URL_ACCESS_RETRIES = 3
  val path = "/sample/path"

  // Map of error name to the actual error message it throws
  // When adding an error, add the name of the function throwing the error as the key and the value
  // as the error being thrown
  def errorsToTest: Map[String, Throwable] = Map(
    "useDeltaOnOtherFormatPathException" ->
      DeltaErrors.useDeltaOnOtherFormatPathException("operation", path, spark),
    "useOtherFormatOnDeltaPathException" ->
      DeltaErrors.useOtherFormatOnDeltaPathException("operation", path, path, "format", spark),
    "createExternalTableWithoutLogException" ->
      DeltaErrors.createExternalTableWithoutLogException(new Path(path), "tableName", spark),
    "createExternalTableWithoutSchemaException" ->
      DeltaErrors.createExternalTableWithoutSchemaException(new Path(path), "tableName", spark),
    "createManagedTableWithoutSchemaException" ->
      DeltaErrors.createManagedTableWithoutSchemaException("tableName", spark),
    "multipleSourceRowMatchingTargetRowInMergeException" ->
      DeltaErrors.multipleSourceRowMatchingTargetRowInMergeException(spark),
    "concurrentModificationException" -> new ConcurrentWriteException(None))

  /** Messages that are checked for URLs but are not produced by a throwable. */
  def otherMessagesToTest: Map[String, String] = Map(
    "deltaFileNotFoundHint" ->
      DeltaErrors.deltaFileNotFoundHint(
        DeltaErrors.generateDocsLink(
          sparkConf,
          DeltaErrors.faqRelativePath,
          skipValidation = true), path))

  /** All messages to scan, keyed by the name used when reporting a broken link. */
  def errorMessagesToTest: Map[String, String] =
    errorsToTest.mapValues(_.getMessage) ++ otherMessagesToTest

  /** Returns true when `response` contains an HTTP/1.1 or HTTP/2 "200" status line. */
  def checkIfValidResponse(url: String, response: String): Boolean = {
    response.contains("HTTP/1.1 200 OK") || response.contains("HTTP/2 200")
  }

  /** Extracts every maximal run of non-whitespace characters that starts with `https://`. */
  def getUrlsFromMessage(message: String): List[String] = {
    val regexToFindUrl = "https://[^\\s]+".r
    regexToFindUrl.findAllIn(message).toList
  }

  /**
   * Requests every URL found in [[errorMessagesToTest]] and fails the test when a URL does not
   * produce a valid response within [[MAX_URL_ACCESS_RETRIES]] attempts.
   *
   * A URL is retried (after a one second pause) only while its responses are invalid. The first
   * valid response ends the attempts for that URL.
   */
  def testUrls(): Unit = {
    errorMessagesToTest.foreach { case (errName, message) =>
      getUrlsFromMessage(message).foreach { url =>
        Given(s"*** Checking response for url: $url")

        // Lazily issues one request per attempt; nothing runs until the iterator is consumed.
        val responses = Iterator.range(1, MAX_URL_ACCESS_RETRIES + 1).map { attempt =>
          if (attempt > 1) Thread.sleep(1000)
          // Argument vector instead of a concatenated string: the URL is always one argument.
          Process(Seq("curl", "-I", url)).!!
        }

        var lastResponse = ""
        // `exists` stops consuming (and therefore stops requesting) at the first valid response.
        val reachable = responses.exists { response =>
          lastResponse = response
          checkIfValidResponse(url, response)
        }

        if (!reachable) {
          fail(
            s"""
               |A link to the URL: '$url' is broken in the error: $errName, accessing this URL
               |does not result in a valid response, received the following response: $lastResponse
         """.stripMargin)
        }
      }
    }
  }

  test("Validate that links to docs in DeltaErrors are correct") {
    testUrls()
  }
}

class DeltaErrorsSuite extends DeltaErrorsSuiteBase
Example 48
Source File: MergeIntoAccumulatorSuite.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import java.util.concurrent.atomic.AtomicReference

import scala.collection.JavaConverters._

import org.apache.spark.sql.delta.commands.MergeIntoCommand
import org.apache.spark.sql.delta.test.DeltaSQLCommandTest

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.status.TaskDataWrapper
import org.apache.spark.util.JsonProtocol

/** Runs a MERGE and inspects the accumulator names recorded in the Spark status store. */
class MergeIntoAccumulatorSuite
    extends QueryTest
    with SharedSparkSession
    with DeltaSQLCommandTest {

  import testImplicits._

  /** Merges a two-row temp view into a two-row Delta table written to a temp directory. */
  private def runTestMergeCommand(): Unit = {
    // Run a simple merge command
    withTempView("source") {
      withTempDir { tempDir =>
        val targetPath = tempDir.getCanonicalPath
        Seq((1, 1), (0, 3)).toDF("key", "value").createOrReplaceTempView("source")
        Seq((2, 2), (1, 4)).toDF("key", "value").write.format("delta").save(targetPath)
        val mergeStatement = s"""
          |MERGE INTO delta.`$targetPath` target
          |USING source src
          |ON src.key = target.key
          |WHEN MATCHED THEN UPDATE SET *
          |WHEN NOT MATCHED THEN INSERT *
          |""".stripMargin
        spark.sql(mergeStatement)
      }
    }
  }

  test("accumulators used by MERGE should not be tracked by Spark UI") {
    runTestMergeCommand()

    // Make sure all Spark events generated by the above command have been processed
    spark.sparkContext.listenerBus.waitUntilEmpty(30000)

    val store = spark.sparkContext.statusStore.store
    val tasks = store.view(classOf[TaskDataWrapper]).closeableIterator()
    try {
      // Collect all accumulator names tracked by Spark UI.
      val trackedNames = tasks.asScala.toVector
        .flatMap(task => task.accumulatorUpdates.map(update => update.name))
        .toSet
      // Verify accumulators used by MergeIntoCommand are not tracked.
      assert(!trackedNames.contains(MergeIntoCommand.TOUCHED_FILES_ACCUM_NAME))
    } finally {
      tasks.close()
    }
  }
}
Example 49
Source File: DatasetRefCacheSuite.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.util

import org.apache.spark.sql.{QueryTest, SparkSession}
import org.apache.spark.sql.test.SharedSparkSession

/** Checks when a `DatasetRefCache` reuses its `Dataset` and when it creates a new one. */
class DatasetRefCacheSuite extends QueryTest with SharedSparkSession {

  test("should create a new Dataset when the active session is changed") {
    // The test replaces the thread's active session; remember the current one so that it can be
    // put back afterwards, whether or not an assertion fails.
    val previousSession = SparkSession.getActiveSession
    try {
      val cache = new DatasetRefCache(() => spark.range(1, 10))
      val ref = cache.get
      // Should reuse `Dataset` when the active session is the same
      assert(ref eq cache.get)

      SparkSession.setActiveSession(spark.newSession())
      // Should create a new `Dataset` when the active session is changed
      assert(ref ne cache.get)
    } finally {
      previousSession match {
        case Some(session) => SparkSession.setActiveSession(session)
        case None => SparkSession.clearActiveSession()
      }
    }
  }
}
Example 50
Source File: EvolvabilitySuiteBase.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.{Action, FileAction, SingleAction}
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{QueryTest, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

/** Shared check that an existing Delta table directory can still be loaded and parsed. */
trait EvolvabilitySuiteBase extends QueryTest with SharedSparkSession {
  import testImplicits._

  /**
   * Loads the table at `tablePath` and checks its rows, schema, partition schema and last
   * checkpoint metadata against fixed expected values, then parses every change in the log.
   *
   * @param tablePath location of the Delta table to load
   */
  protected def testEvolvability(tablePath: String): Unit = {
    // Check we can load everything from a log checkpoint
    val deltaLog = DeltaLog.forTable(spark, new Path(tablePath))
    val path = deltaLog.dataPath.toString
    checkDatasetUnorderly(
      spark.read.format("delta").load(path).select("id", "value").as[(Int, String)],
      4 -> "d", 5 -> "e", 6 -> "f")
    assert(deltaLog.snapshot.metadata.schema === StructType.fromDDL("id INT, value STRING"))
    assert(deltaLog.snapshot.metadata.partitionSchema === StructType.fromDDL("id INT"))

    // Check we can load CheckpointMetaData
    assert(deltaLog.lastCheckpoint === Some(CheckpointMetaData(3, 6L, None)))

    // Check we can parse all `Action`s in delta files. It doesn't check correctness.
    deltaLog.getChanges(0L).toList.map(_._2.toList)
  }
}

// scalastyle:off
// NOTE(review): in this excerpt `validateData` and `main` followed the trait at top level and
// were closed by an unmatched brace. The enclosing object header appears to have been lost, so
// it is restored here to keep the file balanced; confirm the object name against the upstream
// source.
object EvolvabilitySuiteBase {

  /**
   * Asserts that the set of action classes present in the table's delta files equals the set of
   * known `Action` subclasses, with `FileAction` expanded into its own direct subclasses.
   *
   * @param spark session used to read the delta files
   * @param path location of the Delta table
   */
  def validateData(spark: SparkSession, path: String): Unit = {
    import org.apache.spark.sql.delta.util.FileNames._
    import scala.reflect.runtime.{universe => ru}
    import spark.implicits._

    val mirror = ru.runtimeMirror(this.getClass.getClassLoader)

    val tpe = ru.typeOf[Action]
    val clazz = tpe.typeSymbol.asClass
    assert(clazz.isSealed, s"${classOf[Action]} must be sealed")

    val deltaLog = DeltaLog.forTable(spark, new Path(path))
    val deltas = 0L to deltaLog.snapshot.version
    val deltaFiles = deltas.map(deltaFile(deltaLog.logPath, _)).map(_.toString)
    val actionsTypesInLog =
      spark.read.schema(Action.logSchema).json(deltaFiles: _*)
        .as[SingleAction]
        .collect()
        .map(_.unwrap.getClass.asInstanceOf[Class[_]])
        .toSet

    val allActionTypes =
      clazz.knownDirectSubclasses
        .flatMap {
          // FileAction is itself a parent type: compare against its direct subclasses instead.
          case t if t == ru.typeOf[FileAction].typeSymbol => t.asClass.knownDirectSubclasses
          case t => Set(t)
        }
        .map(t => mirror.runtimeClass(t.asClass))

    val missingTypes = allActionTypes -- actionsTypesInLog
    val unknownTypes = actionsTypesInLog -- allActionTypes
    assert(
      missingTypes.isEmpty,
      s"missing types: $missingTypes. " +
        "Please update EvolveabilitySuite.generateData to include them in the log.")
    assert(
      unknownTypes.isEmpty,
      s"unknown types: $unknownTypes. " +
        s"Please make sure they inherit ${classOf[Action]} or ${classOf[FileAction]} directly.")
  }

  /**
   * Generates a table at the path given as the first argument and validates it.
   * Refuses to run when that path already exists.
   *
   * @param args `args(0)` is the output path
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    val path = new File(args(0))
    if (path.exists()) {
      // Don't delete automatically in case the user types a wrong path.
      // scalastyle:off throwerror
      throw new AssertionError(s"${path.getCanonicalPath} exists. Please delete it and retry.")
      // scalastyle:on throwerror
    }
    // NOTE(review): `generateData` is not defined anywhere in this excerpt; presumably it belongs
    // to this object in the upstream file — confirm before relying on this entry point.
    generateData(spark, path.toString)
    validateData(spark, path.toString)
  }
}
Example 51
Source File: HiveParquetSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Two string columns: one named in lower case, the other in upper case. */
case class Cases(lower: String, UPPER: String)

/** Runs SQL against Parquet-backed temporary tables and compares the answers. */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val rows = (1 to 4).map { n => Cases(n.toString, n.toString) }
    withParquetTable(rows, "cases") {
      // Each query spells the column in the opposite case from its declaration.
      val expected = (1 to 4).map { n => Row(n.toString) }
      Seq("SELECT upper FROM cases", "SELECT LOWER FROM cases").foreach { query =>
        checkAnswer(sql(query), expected)
      }
    }
  }

  test("SELECT on Parquet table") {
    val data = (1 to 4).map { n => (n, s"val_$n") }
    withParquetTable(data, "t") {
      val expected = data.map(tuple => Row.fromTuple(tuple))
      checkAnswer(sql("SELECT * FROM t"), expected)
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val data = (1 to 4).map { n => (n % 2 == 0, n, s"val_$n") }
    withParquetTable(data, "t") {
      val projected = sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true")
      checkAnswer(projected, Seq(Row(true, "val_2"), Row(true, "val_4")))
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      spark.read.parquet(location).createOrReplaceTempView("p")
      withTempView("p") {
        val fromParquet = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(sql("SELECT * FROM src ORDER BY key"), fromParquet)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    // Don't run with vectorized: currently relies on UnsafeRow.
    val data = (1 to 10).map { n => (n, s"val_$n") }
    withParquetTable(data, "t", false) {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        spark.read.parquet(location).createOrReplaceTempView("p")
        withTempView("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach { _ => sql("INSERT OVERWRITE TABLE p SELECT * FROM t") }
          val expected = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(sql("SELECT * FROM p"), expected)
        }
      }
    }
  }
}
Example 52
Source File: HiveDataFrameJoinSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Self-join test where the join condition refers to a column using a different letter case. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import spark.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Plain self join: every row pairs with itself.
    val selfJoined = df.join(df, df("key") === df("Key"))
    checkAnswer(selfJoined, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    // Self join against a filtered copy: only the row that survives the filter matches.
    val onlyTwo = df.filter($"value" === "2")
    val filteredJoin = df.join(onlyTwo, df("key") === df("Key"))
    checkAnswer(filteredJoin, Seq(Row(2, "2", 2, "2")))
  }
}
Example 53
Source File: MetastoreRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable, CatalogTableType}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

/** Tests copying/serialising a `MetastoreRelation` and a CTAS statement on a Hive serde table. */
class MetastoreRelationSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("makeCopy and toJSON should work") {
    val singleIntColumn = StructType(StructField("a", IntegerType, true) :: Nil)
    val table = CatalogTable(
      identifier = TableIdentifier("test", Some("db")),
      tableType = CatalogTableType.VIEW,
      storage = CatalogStorageFormat.empty,
      schema = singleIntColumn)
    val relation = MetastoreRelation("db", "test")(table, null)

    // No exception should be thrown
    relation.makeCopy(Array("db", "test"))
    // No exception should be thrown
    relation.toJSON
  }

  test("SPARK-17409: Do Not Optimize Query in CTAS (Hive Serde Table) More Than Once") {
    withTable("bar") {
      withTempView("foo") {
        sql("select 0 as id").createOrReplaceTempView("foo")
        // If we optimize the query in CTAS more than once, the following saveAsTable will fail
        // with the error: `GROUP BY position 0 is not in select list (valid range is [1, 1])`
        sql("CREATE TABLE bar AS SELECT * FROM foo group by id")
        checkAnswer(spark.table("bar"), Seq(Row(0)))

        val barMetadata = spark.sessionState.catalog.getTableMetadata(TableIdentifier("bar"))
        val provider = barMetadata.provider
        assert(provider == Some("hive"), "the expected table is a Hive serde table")
      }
    }
  }
}
Example 54
Source File: HiveOperatorQueryableSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Checks that the output of a `desc` command can be registered as a view and queried. */
class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton {
  import spark._

  test("SPARK-5324 query result of describe command") {
    hiveContext.loadTestTable("src")

    // Creates a temporary view with the output of a describe command
    sql("desc src").createOrReplaceTempView("mydesc")

    // Describing the view yields the columns of the describe output itself.
    val describeColumns = Seq(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeColumns)

    // Selecting from the view yields the columns of `src`, with or without an explicit projection.
    val srcColumns = Seq(
      Row("key", "int", null),
      Row("value", "string", null))
    Seq(
      "select * from mydesc",
      "select col_name, data_type, comment from mydesc"
    ).foreach { query =>
      checkAnswer(sql(query), srcColumns)
    }
  }
}
Example 55
Source File: HivePlanTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level checks: compares optimized plans and counts Window operators in an analyzed plan. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")

    def optimizedPlanOf(query: String) = sql(query).queryExecution.optimizedPlan

    val actual = optimizedPlanOf("SELECT cos(null) AS c FROM t")
    val expected = optimizedPlanOf("SELECT cast(null as double) AS c FROM t")
    comparePlans(actual, expected)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.partitionBy($"grp").orderBy($"val")

    // Two aggregates over the same partitioning/ordering, differing only in frame type.
    val overRows = sum($"val").over(window.rowsBetween(-1, 1))
    val overRange = sum($"val").over(window.rangeBetween(-1, 1))

    val analyzed = df.select($"id", overRows, overRange).queryExecution.analyzed
    val windowOperators = analyzed.collect { case w: logical.Window => w }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 56
Source File: PruneFileSourcePartitionsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

/** Runs the PruneFileSourcePartitions rule in isolation against a partitioned external table. */
class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  /** Rule executor that applies only PruneFileSourcePartitions, once. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = List(Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions))
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.getAbsolutePath}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val fileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        // The data schema is the table schema minus the partition columns.
        val partitionNames = tableMeta.partitionColumnNames
        val nonPartitionFields = tableMeta.schema.filter(field => !partitionNames.contains(field.name))

        val fsRelation = HadoopFsRelation(
          location = fileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = StructType(nonPartitionFields),
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)
        val relationNode = LogicalRelation(fsRelation, catalogTable = Some(tableMeta))

        // Project both columns on top of a filter on the partition column, then optimize.
        val filtered = Filter('p === 1, relationNode)
        val analyzedQuery = Project(Seq('i, 'p), filtered).analyze
        val prunedPlan = Optimize.execute(analyzedQuery)
        assert(prunedPlan.missingInput.isEmpty)
      }
    }
  }
}
Example 57
Source File: HiveUtilsSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.hadoop.hive.conf.HiveConf.ConfVars

import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.QueryTest

/** Checks the listener-related entries of the configuration built by HiveUtils. */
class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("newTemporaryConfiguration overwrites listener configurations") {
    // Metastore listener settings that are expected to be blanked out, in assertion order.
    val listenerKeys = Seq(
      ConfVars.METASTORE_PRE_EVENT_LISTENERS,
      ConfVars.METASTORE_EVENT_LISTENERS,
      ConfVars.METASTORE_END_FUNCTION_LISTENERS).map(_.varname)

    for (useInMemoryDerby <- Seq(true, false)) {
      val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
      listenerKeys.foreach { key =>
        assert(conf(key) === "")
      }
    }
  }
}
Example 58
Source File: ListTablesSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for a temporary view, a Hive table in
 * the current database, and a Hive table in a separate database.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize(for (i <- 1 to 10) yield (i, s"str$i")).toDF("key", "value")

  override def beforeAll(): Unit = {
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    sessionState.catalog.createTempView(
      "ListTablesSuiteTable",
      df.logicalPlan,
      overrideIfExists = true)
    Seq(
      "CREATE TABLE HiveListTablesSuiteTable (key int, value string)",
      "CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB",
      "CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)"
    ).foreach(statement => sql(statement))
  }

  override def afterAll(): Unit = {
    try {
      sessionState.catalog.dropTable(
        TableIdentifier("ListTablesSuiteTable"),
        ignoreIfNotExists = true,
        purge = false)
      Seq(
        "DROP TABLE IF EXISTS HiveListTablesSuiteTable",
        "DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable",
        "DROP DATABASE IF EXISTS ListTablesSuiteDB"
      ).foreach(statement => sql(statement))
    } finally {
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    // We are using default DB.
    for (allTables <- Seq(tables(), sql("SHOW TABLes"))) {
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val tableInOtherDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempView, Row("", "listtablessuitetable", true))
      checkAnswer(hiveTable, Row("default", "hivelisttablessuitetable", false))
      assert(tableInOtherDb.count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    for (allTables <- listings) {
      val tempView = allTables.filter("tableName = 'listtablessuitetable'")
      val hiveTableInDefault = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val hiveTableInDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempView, Row("", "listtablessuitetable", true))
      assert(hiveTableInDefault.count() === 0)
      checkAnswer(hiveTableInDb, Row("listtablessuitedb", "hiveindblisttablessuitetable", false))
    }
  }
}
Example 59
Source File: HiveVariableSubstitutionSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Exercises `SET hivevar:` handling and `${...}` variable substitution in SQL text. */
class HiveVariableSubstitutionSuite extends QueryTest with TestHiveSingleton {

  test("SET hivevar with prefix") {
    spark.sql("SET hivevar:county=gram")
    val county = spark.conf.getOption("county")
    assert(county === Some("gram"))
  }

  test("SET hivevar with dotted name") {
    spark.sql("SET hivevar:eloquent.mosquito.alphabet=zip")
    val dotted = spark.conf.getOption("eloquent.mosquito.alphabet")
    assert(dotted === Some("zip"))
  }

  test("hivevar substitution") {
    spark.conf.set("pond", "bus")
    val substituted = spark.sql("SELECT '${hivevar:pond}'")
    checkAnswer(substituted, Seq(Row("bus")))
  }

  test("variable substitution without a prefix") {
    spark.sql("SET hivevar:flask=plaid")
    val substituted = spark.sql("SELECT '${flask}'")
    checkAnswer(substituted, Seq(Row("plaid")))
  }

  test("variable substitution precedence") {
    spark.conf.set("turn.aloof", "questionable")
    spark.sql("SET hivevar:turn.aloof=dime")
    // hivevar clobbers the conf setting
    val substituted = spark.sql("SELECT '${turn.aloof}'")
    checkAnswer(substituted, Seq(Row("dime")))
  }
}
Example 60
Source File: SQLBuilderTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst

import scala.util.control.NonFatal

import org.apache.spark.sql.{DataFrame, Dataset, QueryTest}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Base class for suites that check SQL text generated from expressions, logical plans and
 * DataFrames.
 *
 * Failures are reported through `fail(message, cause)` so the original exception (and its
 * stack trace) is preserved. Only non-fatal throwables are intercepted; fatal JVM errors
 * propagate unchanged.
 */
abstract class SQLBuilderTest extends QueryTest with TestHiveSingleton {

  /**
   * Asserts that the SQL string of `e` equals `expectedSQL`.
   *
   * @param e expression whose `sql` representation is checked
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(e: Expression, expectedSQL: String): Unit = {
    val actualSQL = e.sql
    try {
      assert(actualSQL === expectedSQL)
    } catch {
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following expression:
             |
             |${e.prettyName}
             |
             |$cause
           """.stripMargin,
          cause)
    }
  }

  /**
   * Converts `plan` to SQL with `SQLBuilder`, asserts that the text equals `expectedSQL`, and
   * then checks that running the generated SQL gives the same answer as running `plan`.
   *
   * @param plan logical plan to convert
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(plan: LogicalPlan, expectedSQL: String): Unit = {
    val generatedSQL = try new SQLBuilder(plan).toSQL catch {
      case NonFatal(e) =>
        // Keep the conversion error as the cause instead of discarding it.
        fail(
          s"""Cannot convert the following logical query plan to SQL:
             |
             |${plan.treeString}
           """.stripMargin,
          e)
    }

    try {
      assert(generatedSQL === expectedSQL)
    } catch {
      case NonFatal(cause) =>
        fail(
          s"""Wrong SQL generated for the following logical query plan:
             |
             |${plan.treeString}
             |
             |$cause
           """.stripMargin,
          cause)
    }

    checkAnswer(spark.sql(generatedSQL), Dataset.ofRows(spark, plan))
  }

  /**
   * Checks the SQL generated for the analyzed plan of `df`.
   *
   * @param df DataFrame whose analyzed logical plan is converted
   * @param expectedSQL the exact SQL text expected
   */
  protected def checkSQL(df: DataFrame, expectedSQL: String): Unit = {
    checkSQL(df.queryExecution.analyzed, expectedSQL)
  }
}
Example 61
Source File: HadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

/** Compares the reported relation size with the on-disk size of the written files. */
class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)

      // ignore hidden files
      val skipHidden = new FilenameFilter {
        override def accept(parent: File, name: String): Boolean = !name.startsWith(".")
      }
      val expectedBytes = dir.listFiles(skipHidden).foldLeft(0L)((acc, file) => acc + file.length())

      val reportedBytes =
        spark.read.parquet(dir.toString).queryExecution.logical.statistics.sizeInBytes
      assert(reportedBytes === BigInt(expectedBytes))
    }
  }
}
Example 62
Source File: PartitionedWriteSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.Utils

/**
 * Tests for writing partitioned output and reading it back.
 *
 * Every test obtains its output location through `withTempPath`, so the location is removed
 * even when an assertion fails part-way through the test.
 */
class PartitionedWriteSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("write many partitions") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val df = spark.range(100).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      // The partition column is read back after the data column, hence Row(data, id).
      checkAnswer(spark.read.load(path), (0 to 99).map(Row(1, _)))
    }
  }

  test("write many partitions with repeats") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val base = spark.range(100)
      val df = base.union(base).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      // Each id was written twice, so each expected row appears twice.
      val onePass = (0 to 99).map(Row(1, _))
      checkAnswer(spark.read.load(path), onePass ++ onePass)
    }
  }

  test("partitioned columns should appear at the end of schema") {
    withTempPath { f =>
      val path = f.getAbsolutePath
      Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path)
      assert(spark.read.parquet(path).schema.map(_.name) == Seq("j", "i"))
    }
  }
}
Example 63
Source File: HiveDataFrameAnalyticsSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.scalatest.BeforeAndAfterAll

// TODO ideally we should put the test suite into the package `sql`, as
// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
// support the `cube` or `rollup` yet.
/**
 * Compares the DataFrame `rollup` / `cube` API with the equivalent
 * `GROUP BY ... WITH ROLLUP` / `WITH CUBE` SQL over the same two-row table.
 */
class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
  private var testData: DataFrame = _

  /** Builds the fixture DataFrame and registers it as `mytable`. */
  override def beforeAll(): Unit = {
    super.beforeAll()
    testData = Seq((1, 2), (2, 4)).toDF("a", "b")
    TestHive.registerDataFrameAsTable(testData, "mytable")
  }

  /** Drops `mytable`; the parent hook runs even if the drop throws. */
  override def afterAll(): Unit = {
    try {
      TestHive.dropTempTable("mytable")
    } finally {
      super.afterAll()
    }
  }

  test("rollup") {
    checkAnswer(
      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
    )

    checkAnswer(
      testData.rollup("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
    )
  }

  test("cube") {
    checkAnswer(
      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
    )

    checkAnswer(
      testData.cube("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
    )
  }
}
Example 64
Source File: QueryPartitionSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.util.Utils

/**
 * Queries a partitioned Hive table after the directory of one of its partitions has been
 * deleted from the file system.
 */
class QueryPartitionSuite extends QueryTest {
  import org.apache.spark.sql.hive.test.TestHive.implicits._

  test("SPARK-5068: query data when path doesn't exist"){
    val testData = TestHive.sparkContext.parallelize(
      (1 to 10).map(i => TestData(i, i.toString))).toDF()
    testData.registerTempTable("testData")

    // Collected once; every partition is loaded with exactly these rows.
    val rowsPerPartition = testData.collect().toSeq

    val tmpDir = Files.createTempDir()
    try {
      // create the table for test
      sql(s"CREATE TABLE table_with_partition(key int,value string) " +
        s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
      (1 to 4).foreach { ds =>
        sql(s"INSERT OVERWRITE TABLE table_with_partition partition (ds='$ds') " +
          "SELECT key,value FROM testData")
      }

      // test for the exist path: all four partitions are visible
      checkAnswer(sql("select key,value from table_with_partition"),
        Seq.fill(4)(rowsPerPartition).flatten)

      // delete the path of one partition
      tmpDir.listFiles
        .find { f => f.isDirectory && f.getName().startsWith("ds=") }
        .foreach { f => Utils.deleteRecursively(f) }

      // test for after delete the path: only three partitions remain readable
      checkAnswer(sql("select key,value from table_with_partition"),
        Seq.fill(3)(rowsPerPartition).flatten)
    } finally {
      // Runs on failure too; IF EXISTS keeps the cleanup from masking the original error.
      sql("DROP TABLE IF EXISTS table_with_partition")
      // This test never creates createAndInsertTest, so the drop must tolerate its absence.
      sql("DROP TABLE IF EXISTS createAndInsertTest")
      Utils.deleteRecursively(tmpDir)
    }
  }
}
Example 65
Source File: HiveParquetSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.parquet.ParquetTest
import org.apache.spark.sql.{QueryTest, SQLConf}

case class Cases(lower: String, UPPER: String)

/**
 * Parquet tests run against the Hive test context. The same set of tests is registered twice,
 * once per value of `SQLConf.PARQUET_USE_DATA_SOURCE_API`, distinguished by a name prefix.
 */
class HiveParquetSuite extends QueryTest with ParquetTest {
  val sqlContext = TestHive

  import sqlContext._

  /**
   * Registers the Parquet tests, prefixing every test name with `prefix`.
   *
   * @param prefix label prepended to each registered test name
   */
  def run(prefix: String): Unit = {
    test(s"$prefix: Case insensitive attribute names") {
      withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
        val expected = (1 to 4).map(i => Row(i.toString))
        checkAnswer(sql("SELECT upper FROM cases"), expected)
        checkAnswer(sql("SELECT LOWER FROM cases"), expected)
      }
    }

    test(s"$prefix: SELECT on Parquet table") {
      val data = (1 to 4).map(i => (i, s"val_$i"))
      withParquetTable(data, "t") {
        checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
      }
    }

    test(s"$prefix: Simple column projection + filter on Parquet table") {
      withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
        checkAnswer(
          sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
          Seq(Row(true, "val_2"), Row(true, "val_4")))
      }
    }

    test(s"$prefix: Converting Hive to Parquet Table via saveAsParquetFile") {
      withTempPath { dir =>
        sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
        read.parquet(dir.getCanonicalPath).registerTempTable("p")
        withTempTable("p") {
          checkAnswer(
            sql("SELECT * FROM src ORDER BY key"),
            sql("SELECT * from p ORDER BY key").collect().toSeq)
        }
      }
    }

    test(s"$prefix: INSERT OVERWRITE TABLE Parquet table") {
      withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
        withTempPath { file =>
          sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
          read.parquet(file.getCanonicalPath).registerTempTable("p")
          withTempTable("p") {
            // let's do three overwrites for good measure
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
            checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
          }
        }
      }
    }
  }

  // NOTE(review): `test(...)` only registers a test body, and `withSQLConf` restores the
  // setting as soon as its block returns, so the flag below is presumably no longer set when
  // the registered tests actually execute. Confirm, and move `withSQLConf` inside the test
  // bodies if both modes are meant to be exercised.
  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "true") {
    run("Parquet data source enabled")
  }

  withSQLConf(SQLConf.PARQUET_USE_DATA_SOURCE_API -> "false") {
    run("Parquet data source disabled")
  }
}
Example 66
Source File: HiveDataFrameJoinSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.TestHive.implicits._

/** Self-join resolution when the join condition refers to a column with different casing. */
class HiveDataFrameJoinSuite extends QueryTest {

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Built afresh for each join; "key" and "Key" differ only in case.
    def keysMatch = df("key") === df("Key")

    val selfJoined = df.join(df, keysMatch)
    checkAnswer(selfJoined, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    val onlyValueTwo = df.filter($"value" === "2")
    checkAnswer(df.join(onlyValueTwo, keysMatch), Seq(Row(2, "2", 2, "2")))
  }
}
Example 67
Source File: UDFSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.hive.test.TestHive._

case class FunctionResult(f1: String, f2: String)

/** Registers UDFs under mixed-case names and calls them with different casing. */
class UDFSuite extends QueryTest {

  test("UDF case insensitive") {
    udf.register("random0", () => Math.random())
    udf.register("RANDOM1", () => Math.random())
    udf.register("strlenScala", (text: String, extra: Int) => text.length + extra)

    val fromRandom0 = sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0)
    assert(fromRandom0 >= 0.0)

    val fromRandom1 = sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0)
    assert(fromRandom1 >= 0.0)

    val length = sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0)
    assert(length === 5)
  }
}
Example 68
Source File: HiveExplainSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.hive.test.TestHive._

/** Checks which section headers and operator names appear in EXPLAIN output. */
class HiveExplainSuite extends QueryTest {

  // Headers of the three logical-plan sections, in the order they are checked.
  private val logicalPlanHeaders = Seq(
    "== Parsed Logical Plan ==",
    "== Analyzed Logical Plan ==",
    "== Optimized Logical Plan ==")

  // Keywords expected in the extended explain output of a CREATE TABLE ... AS SELECT.
  private val extendedCtasKeywords = logicalPlanHeaders ++ Seq(
    "== Physical Plan ==",
    "CreateTableAsSelect",
    "InsertIntoHiveTable",
    "Limit",
    "src")

  test("explain extended command") {
    val plainExplain = " explain select * from src where key=123 "

    checkExistence(sql(plainExplain), true, "== Physical Plan ==")
    checkExistence(sql(plainExplain), false, logicalPlanHeaders: _*)

    val extendedKeywords =
      logicalPlanHeaders ++ Seq("== Physical Plan ==", "Code Generation", "== RDD ==")
    checkExistence(
      sql(" explain extended select * from src where key=123 "),
      true,
      extendedKeywords: _*)
  }

  test("explain create table command") {
    checkExistence(
      sql("explain create table temp__b as select * from src limit 2"),
      true,
      "== Physical Plan ==", "InsertIntoHiveTable", "Limit", "src")

    checkExistence(
      sql("explain extended create table temp__b as select * from src limit 2"),
      true,
      extendedCtasKeywords: _*)

    val ctasWithSerde = sql(
      """
        | EXPLAIN EXTENDED CREATE TABLE temp__b
        | ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe"
        | WITH SERDEPROPERTIES("serde_p1"="p1","serde_p2"="p2")
        | STORED AS RCFile
        | TBLPROPERTIES("tbl_p1"="p11", "tbl_p2"="p22")
        | AS SELECT * FROM src LIMIT 2
      """.stripMargin)
    checkExistence(ctasWithSerde, true, extendedCtasKeywords: _*)
  }
}
Example 69
Source File: HiveOperatorQueryableSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.TestHive._

/** Verifies that the output of a DESCRIBE command can be used as a temp table. */
class HiveOperatorQueryableSuite extends QueryTest {

  test("SPARK-5324 query result of describe command") {
    loadTestTable("src")

    // register a describe command to be a temp table
    val describeSrc = sql("desc src")
    describeSrc.registerTempTable("mydesc")

    // Describing the temp table lists the three columns of the DESCRIBE output.
    val describeColumns = List(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeColumns)

    // A star projection and an explicit column list return the same two rows.
    val describedRows = List(
      Row("key", "int", null),
      Row("value", "string", null))
    for (query <- Seq(
        "select * from mydesc",
        "select col_name, data_type, comment from mydesc")) {
      checkAnswer(sql(query), describedRows)
    }
  }
}
Example 70
Source File: ListTablesSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for temporary tables registered in the
 * catalog and for Hive tables in the current database and in a separate database.
 */
class ListTablesSuite extends QueryTest with BeforeAndAfterAll {

  import org.apache.spark.sql.hive.test.TestHive.implicits._

  val df = sparkContext.parallelize(for (i <- 1 to 10) yield (i, s"str$i")).toDF("key", "value")

  override def beforeAll(): Unit = {
    // The catalog in HiveContext is a case insensitive one.
    val plan = df.logicalPlan
    catalog.registerTable(Seq("ListTablesSuiteTable"), plan)
    catalog.registerTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"), df.logicalPlan)
    Seq(
      "CREATE TABLE HiveListTablesSuiteTable (key int, value string)",
      "CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB",
      "CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)"
    ).foreach(statement => sql(statement))
  }

  override def afterAll(): Unit = {
    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    catalog.unregisterTable(Seq("ListTablesSuiteDB", "InDBListTablesSuiteTable"))
    Seq(
      "DROP TABLE IF EXISTS HiveListTablesSuiteTable",
      "DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable",
      "DROP DATABASE IF EXISTS ListTablesSuiteDB"
    ).foreach(statement => sql(statement))
  }

  test("get all tables of current database") {
    // We are using default DB.
    for (allTables <- Seq(tables(), sql("SHOW TABLes"))) {
      val tempTable = allTables.filter("tableName = 'listtablessuitetable'")
      val tempTableInDb = allTables.filter("tableName = 'indblisttablessuitetable'")
      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val hiveTableInDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempTable, Row("listtablessuitetable", true))
      assert(tempTableInDb.count() === 0)
      checkAnswer(hiveTable, Row("hivelisttablessuitetable", false))
      assert(hiveTableInDb.count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    for (allTables <- listings) {
      val tempTable = allTables.filter("tableName = 'listtablessuitetable'")
      val tempTableInDb = allTables.filter("tableName = 'indblisttablessuitetable'")
      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val hiveTableInDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempTable, Row("listtablessuitetable", true))
      checkAnswer(tempTableInDb, Row("indblisttablessuitetable", true))
      assert(hiveTable.count() === 0)
      checkAnswer(hiveTableInDb, Row("hiveindblisttablessuitetable", false))
    }
  }
}
Example 71
Source File: HiveDataFrameAnalyticsSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.hive.test.TestHive.implicits._
import org.scalatest.BeforeAndAfterAll

// TODO ideally we should put the test suite into the package `sql`, as
// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
// support the `cube` or `rollup` yet.
/**
 * Hive DataFrame analytics suite: compares the DataFrame `cube` API with the equivalent
 * `GROUP BY ... WITH CUBE` SQL over the same two-row table.
 */
class HiveDataFrameAnalyticsSuite extends QueryTest with BeforeAndAfterAll {
  private var testData: DataFrame = _

  /** Builds the fixture DataFrame and registers it as `mytable`. */
  override def beforeAll(): Unit = {
    super.beforeAll()
    testData = Seq((1, 2), (2, 4)).toDF("a", "b")
    TestHive.registerDataFrameAsTable(testData, "mytable")
  }

  /** Drops `mytable`; the parent hook runs even if the drop throws. */
  override def afterAll(): Unit = {
    try {
      TestHive.dropTempTable("mytable")
    } finally {
      super.afterAll()
    }
  }

  test("cube") {
    checkAnswer(
      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
    )

    checkAnswer(
      testData.cube("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
    )
  }
}
Example 72
Source File: QueryPartitionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.util.Utils

/**
 * Query partition suite: queries a partitioned Hive table after the directory of one of its
 * partitions has been deleted, with `SQLConf.HIVE_VERIFY_PARTITION_PATH` set to "true".
 */
class QueryPartitionSuite extends QueryTest with SQLTestUtils {

  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive
  import ctx.implicits._

  protected def _sqlContext = ctx

  // Query data when the path of a partition does not exist.
  test("SPARK-5068: query data when path doesn't exist"){
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = ctx.sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      // Collected once; every partition is loaded with exactly these rows.
      val rowsPerPartition = testData.collect().toSeq

      val tmpDir = Files.createTempDir()
      try {
        // create the table for test
        sql(s"CREATE TABLE table_with_partition(key int,value string) " +
          s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
        (1 to 4).foreach { ds =>
          sql(s"INSERT OVERWRITE TABLE table_with_partition partition (ds='$ds') " +
            "SELECT key,value FROM testData")
        }

        // test for the exist path: all four partitions are visible
        checkAnswer(sql("select key,value from table_with_partition"),
          Seq.fill(4)(rowsPerPartition).flatten)

        // delete the path of one partition
        tmpDir.listFiles
          .find { f => f.isDirectory && f.getName().startsWith("ds=") }
          .foreach { f => Utils.deleteRecursively(f) }

        // test for after delete the path: only three partitions remain readable
        checkAnswer(sql("select key,value from table_with_partition"),
          Seq.fill(3)(rowsPerPartition).flatten)
      } finally {
        // Runs on failure too; IF EXISTS keeps the cleanup from masking the original error.
        sql("DROP TABLE IF EXISTS table_with_partition")
        // This test never creates createAndInsertTest, so the drop must tolerate its absence.
        sql("DROP TABLE IF EXISTS createAndInsertTest")
        Utils.deleteRecursively(tmpDir)
      }
    }
  }
}
Example 73
Source File: HiveParquetSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.{QueryTest, Row, SQLContext}

case class Cases(lower: String, UPPER: String)

/** Parquet read, write and overwrite tests run against the Hive test context. */
class HiveParquetSuite extends QueryTest with ParquetTest {
  private val ctx = TestHive
  override def _sqlContext: SQLContext = ctx

  // Attribute names are matched case-insensitively.
  test("Case insensitive attribute names") {
    withParquetTable((1 to 4).map(i => Cases(i.toString, i.toString)), "cases") {
      val expected = (1 to 4).map(i => Row(i.toString))
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  // SELECT * over a Parquet-backed table.
  test("SELECT on Parquet table") {
    val data = (1 to 4).map(i => (i, s"val_$i"))
    withParquetTable(data, "t") {
      checkAnswer(sql("SELECT * FROM t"), data.map(Row.fromTuple))
    }
  }

  // Column projection combined with a filter on a Parquet-backed table.
  test("Simple column projection + filter on Parquet table") {
    withParquetTable((1 to 4).map(i => (i % 2 == 0, i, s"val_$i")), "t") {
      checkAnswer(
        sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"),
        Seq(Row(true, "val_2"), Row(true, "val_4")))
    }
  }

  // Writes the Hive table `src` out as Parquet and reads it back as temp table `p`.
  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      sql("SELECT * FROM src").write.parquet(dir.getCanonicalPath)
      ctx.read.parquet(dir.getCanonicalPath).registerTempTable("p")
      withTempTable("p") {
        checkAnswer(
          sql("SELECT * FROM src ORDER BY key"),
          sql("SELECT * from p ORDER BY key").collect().toSeq)
      }
    }
  }

  // INSERT OVERWRITE into a Parquet-backed temp table.
  test("INSERT OVERWRITE TABLE Parquet table") {
    withParquetTable((1 to 10).map(i => (i, s"val_$i")), "t") {
      withTempPath { file =>
        sql("SELECT * FROM t LIMIT 1").write.parquet(file.getCanonicalPath)
        ctx.read.parquet(file.getCanonicalPath).registerTempTable("p")
        withTempTable("p") {
          // let's do three overwrites for good measure
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          checkAnswer(sql("SELECT * FROM p"), sql("SELECT * FROM t").collect().toSeq)
        }
      }
    }
  }
}
Example 74
Source File: HiveDataFrameJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.TestHive.implicits._

/** Self-join resolution when the join condition refers to a column with different casing. */
class HiveDataFrameJoinSuite extends QueryTest {

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Join the frame with itself on "key" versus "Key".
    val fullSelfJoin = df.join(df, df("key") === df("Key"))
    val expectedFull = List(Row(1, "1", 1, "1"), Row(2, "2", 2, "2"))
    checkAnswer(fullSelfJoin, expectedFull)

    // Same condition, but the right side is restricted to the row whose value is "2".
    val rightSide = df.filter($"value" === "2")
    val filteredSelfJoin = df.join(rightSide, df("key") === df("Key"))
    checkAnswer(filteredSelfJoin, List(Row(2, "2", 2, "2")))
  }
}
Example 75
Source File: UDFSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.QueryTest

case class FunctionResult(f1: String, f2: String)

/** Registers UDFs under mixed-case names and calls them with different casing. */
class UDFSuite extends QueryTest {
  private lazy val ctx = org.apache.spark.sql.hive.test.TestHive

  // UDF names are resolved case-insensitively.
  test("UDF case insensitive") {
    ctx.udf.register("random0", () => Math.random())
    ctx.udf.register("RANDOM1", () => Math.random())
    ctx.udf.register("strlenScala", (text: String, extra: Int) => text.length + extra)

    val fromRandom0 = ctx.sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0)
    assert(fromRandom0 >= 0.0)

    val fromRandom1 = ctx.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0)
    assert(fromRandom1 >= 0.0)

    val length = ctx.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0)
    assert(length === 5)
  }
}
Example 76
Source File: HiveOperatorQueryableSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.TestHive._

/** Verifies that the output of a DESCRIBE command can be used as a temp table. */
class HiveOperatorQueryableSuite extends QueryTest {

  // Query result of the describe command.
  test("SPARK-5324 query result of describe command") {
    loadTestTable("src")

    // register a describe command to be a temp table
    val describeSrc = sql("desc src")
    describeSrc.registerTempTable("mydesc")

    // Describing the temp table lists the three columns of the DESCRIBE output.
    val describeColumns = List(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeColumns)

    // A star projection and an explicit column list return the same two rows.
    val describedRows = List(
      Row("key", "int", null),
      Row("value", "string", null))
    checkAnswer(sql("select * from mydesc"), describedRows)
    checkAnswer(sql("select col_name, data_type, comment from mydesc"), describedRows)
  }
}
Example 77
Source File: HivePlanTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHive

/** Plan-level checks: constant folding of a UDF call and merging of window expressions. */
class HivePlanTest extends QueryTest {
  import TestHive._
  import TestHive.implicits._

  // Constant folding of a UDF applied to a null literal.
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")

    val foldedPlan = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val expectedPlan = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan
    comparePlans(foldedPlan, expectedPlan)
  }

  // Window expressions that share the same partition-by and order-by clauses.
  test("window expressions sharing the same partition by and order by clause") {
    val emptyInput = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val sharedSpec = Window.partitionBy($"grp").orderBy($"val")

    val projected = emptyInput.select(
      $"id",
      sum($"val").over(sharedSpec.rowsBetween(-1, 1)),
      sum($"val").over(sharedSpec.rangeBetween(-1, 1)))

    val windowOperators = projected.queryExecution.analyzed.collect {
      case w: logical.Window => w
    }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 78
Source File: ListTablesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.hive.test.TestHive._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for a temporary table registered in the
 * catalog, a Hive table in the current database, and a Hive table in a separate database.
 */
class ListTablesSuite extends QueryTest with BeforeAndAfterAll {

  import org.apache.spark.sql.hive.test.TestHive.implicits._

  val df = sparkContext.parallelize(for (i <- 1 to 10) yield (i, s"str$i")).toDF("key", "value")

  override def beforeAll(): Unit = {
    // The catalog in HiveContext is a case insensitive one.
    catalog.registerTable(Seq("ListTablesSuiteTable"), df.logicalPlan)
    Seq(
      "CREATE TABLE HiveListTablesSuiteTable (key int, value string)",
      "CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB",
      "CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)"
    ).foreach(statement => sql(statement))
  }

  override def afterAll(): Unit = {
    catalog.unregisterTable(Seq("ListTablesSuiteTable"))
    Seq(
      "DROP TABLE IF EXISTS HiveListTablesSuiteTable",
      "DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable",
      "DROP DATABASE IF EXISTS ListTablesSuiteDB"
    ).foreach(statement => sql(statement))
  }

  // List all tables of the current database.
  test("get all tables of current database") {
    // We are using default DB.
    for (allTables <- Seq(tables(), sql("SHOW TABLes"))) {
      val tempTable = allTables.filter("tableName = 'listtablessuitetable'")
      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val hiveTableInDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempTable, Row("listtablessuitetable", true))
      checkAnswer(hiveTable, Row("hivelisttablessuitetable", false))
      assert(hiveTableInDb.count() === 0)
    }
  }

  // List all tables when a database name is given.
  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    for (allTables <- listings) {
      val tempTable = allTables.filter("tableName = 'listtablessuitetable'")
      val hiveTable = allTables.filter("tableName = 'hivelisttablessuitetable'")
      val hiveTableInDb = allTables.filter("tableName = 'hiveindblisttablessuitetable'")

      checkAnswer(tempTable, Row("listtablessuitetable", true))
      assert(hiveTable.count() === 0)
      checkAnswer(hiveTableInDb, Row("hiveindblisttablessuitetable", false))
    }
  }
}
Example 79
Source File: BroadcastJoinSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import scala.reflect.ClassTag

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{AccumulatorSuite, SparkConf, SparkContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{SQLConf, SQLContext, QueryTest}

// NOTE(review): this excerpt contains no enclosing class declaration and no definitions of
// `sc` / `sqlContext`, although the final `}` closes one. They were presumably dropped from
// the snippet — confirm against the complete BroadcastJoinSuite.scala.

  /**
   * Joins two two-row DataFrames with the right side wrapped in `broadcast`, asserts that the
   * executed plan contains exactly one operator of type `T`, then runs the plan. Everything
   * happens inside `AccumulatorSuite.verifyPeakExecutionMemorySet(sc, name)`.
   *
   * @param name     label passed to `verifyPeakExecutionMemorySet`
   * @param joinType join type string passed to `DataFrame.join`
   * @tparam T       operator class expected exactly once in the executed plan
   */
  private def testBroadcastJoin[T: ClassTag](name: String, joinType: String): Unit = {
    AccumulatorSuite.verifyPeakExecutionMemorySet(sc, name) {
      val df1 = sqlContext.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value")
      val df2 = sqlContext.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value")
      // Comparison at the end is for broadcast left semi join
      val joinExpression = df1("key") === df2("key") && df1("value") > df2("value")
      val df3 = df1.join(broadcast(df2), joinExpression, joinType)
      val plan = df3.queryExecution.executedPlan
      assert(plan.collect { case p: T => p }.size === 1)
      plan.executeCollect()
    }
  }

  // Unsafe broadcast hash join updates peak execution memory.
  // NOTE(review): the body of this test (and of the two below) is commented out, so the test
  // currently asserts nothing.
  test("unsafe broadcast hash join updates peak execution memory") {
    // testBroadcastJoin[BroadcastHashJoin]("unsafe broadcast hash join", "inner")
  }

  // Unsafe broadcast hash outer join updates peak execution memory.
  test("unsafe broadcast hash outer join updates peak execution memory") {
    // testBroadcastJoin[BroadcastHashOuterJoin]("unsafe broadcast hash outer join", "left_outer")
  }

  // Unsafe broadcast left semi join updates peak execution memory.
  test("unsafe broadcast left semi join updates peak execution memory") {
    //testBroadcastJoin[BroadcastLeftSemiJoinHash]("unsafe broadcast left semi join", "leftsemi")
  }
}
Example 80
Source File: ParquetCompatibilityTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.schema.MessageType

import org.apache.spark.sql.QueryTest

/** Shared helpers for suites that inspect the schema of Parquet files found under a path. */
private[sql] abstract class ParquetCompatibilityTest extends QueryTest with ParquetTest {

  /**
   * Reads the Parquet schema from the files directly under `path`, skipping entries whose
   * name starts with an underscore.
   */
  protected def readParquetSchema(path: String): MessageType = {
    readParquetSchema(path, { path => !path.getName.startsWith("_") })
  }

  /**
   * Reads the Parquet schema from the first footer among the files under `path` accepted by
   * `pathFilter`.
   *
   * @param path       directory (or file) to list
   * @param pathFilter predicate selecting which listed paths are read
   * @return the schema stored in the first footer returned by the reader
   * @throws NoSuchElementException if no footer could be read
   */
  protected def readParquetSchema(path: String, pathFilter: Path => Boolean): MessageType = {
    val fsPath = new Path(path)
    val fs = fsPath.getFileSystem(configuration)
    val parquetFiles = fs.listStatus(fsPath, new PathFilter {
      override def accept(path: Path): Boolean = pathFilter(path)
    }).toSeq

    // Explicit asJava/asScala instead of the deprecated implicit JavaConversions, so the
    // Java <-> Scala collection boundary is visible at the call site.
    val footers =
      ParquetFileReader.readAllFootersInParallel(configuration, parquetFiles.asJava, true)
    footers.asScala.headOption
      .getOrElse(throw new NoSuchElementException(s"No Parquet footer found under $path"))
      .getParquetMetadata.getFileMetaData.getSchema
  }

  /** Logs the schema read from `path` at INFO level. */
  protected def logParquetSchema(path: String): Unit = {
    logInfo(
      s"""Schema of the Parquet file written by parquet-avro:
         |${readParquetSchema(path)}
       """.stripMargin)
  }
}

/** Helpers for building Parquet compatibility test data. */
object ParquetCompatibilityTest {

  /**
   * Returns `null` when `i` is a multiple of 3, otherwise the result of evaluating `f`.
   * `f` is by-name, so it is not evaluated in the `null` case.
   */
  def makeNullable[T <: AnyRef](i: Int)(f: => T): T = {
    if (i % 3 == 0) null.asInstanceOf[T] else f
  }
}
Example 81
Source File: ExtraStrategiesSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

/** Leaf physical operator that ignores any input and emits one row holding "so fast". */
case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override protected def doExecute(): RDD[InternalRow] = {
    // The value is taken from a Literal rather than used as a raw String — presumably to get
    // Catalyst's internal string representation; confirm.
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  // No children: this operator is a leaf.
  override def children: Seq[SparkPlan] = Nil
}

/**
 * Strategy that plans a projection of exactly one expression named "a" as a
 * [[FastOperator]] and declines every other plan by returning `Nil`.
 */
object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

/** Verifies that a strategy installed through `experimental.extraStrategies` is used. */
class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    // Remember what was installed before, so the shared context is handed back unchanged
    // instead of being unconditionally reset to Nil.
    val previousStrategies = sqlContext.experimental.extraStrategies
    try {
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      // Selecting only "a" matches TestStrategy, so the FastOperator row comes back.
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      // Selecting two columns does not match, so the real data comes back.
      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      sqlContext.experimental.extraStrategies = previousStrategies
    }
  }
}
Example 82
Source File: PartitionedWriteSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.Utils

/** Round-trips DataFrames through a write partitioned by `id` and checks the rows read back. */
class PartitionedWriteSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  /**
   * Runs `body` with a temp-dir path whose directory has been deleted again, and removes
   * whatever ends up at that path afterwards — also when `body` throws, so a failing
   * assertion no longer leaks the written data.
   */
  private def withFreshPath(body: java.io.File => Unit): Unit = {
    val path = Utils.createTempDir()
    // Only the location is wanted; the directory itself is removed before writing.
    path.delete()
    try {
      body(path)
    } finally {
      Utils.deleteRecursively(path)
    }
  }

  // Writes 100 rows, each into its own `id` partition.
  test("write many partitions") {
    withFreshPath { path =>
      val df = ctx.range(100).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path.getCanonicalPath)

      checkAnswer(
        ctx.read.load(path.getCanonicalPath),
        (0 to 99).map(Row(1, _)))
    }
  }

  // Same as above, but every `id` occurs twice in the input.
  test("write many partitions with repeats") {
    withFreshPath { path =>
      val base = ctx.range(100)
      val df = base.unionAll(base).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path.getCanonicalPath)

      val oncePerId = (0 to 99).map(Row(1, _))
      checkAnswer(
        ctx.read.load(path.getCanonicalPath),
        oncePerId ++ oncePerId)
    }
  }
}
Example 83
Source File: HiveParquetSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

case class Cases(lower: String, UPPER: String)

/** Parquet-backed temporary tables queried through the Hive-enabled session. */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val records = (1 to 4).map { i => Cases(i.toString, i.toString) }
    withParquetTable(records, "cases") {
      val expected = (1 to 4).map { i => Row(i.toString) }
      // Each column is addressed with the opposite letter case from its declaration.
      Seq("SELECT upper FROM cases", "SELECT LOWER FROM cases").foreach { query =>
        checkAnswer(sql(query), expected)
      }
    }
  }

  test("SELECT on Parquet table") {
    val data = (1 to 4).map(i => (i, s"val_$i"))
    withParquetTable(data, "t") {
      val everything = sql("SELECT * FROM t")
      checkAnswer(everything, data.map(Row.fromTuple))
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val triples = (1 to 4).map { i => (i % 2 == 0, i, s"val_$i") }
    withParquetTable(triples, "t") {
      val evenOnly = sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true")
      checkAnswer(evenOnly, Seq(Row(true, "val_2"), Row(true, "val_4")))
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      spark.read.parquet(location).createOrReplaceTempView("p")
      withTempView("p") {
        val fromParquet = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(sql("SELECT * FROM src ORDER BY key"), fromParquet)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    // Don't run with vectorized: currently relies on UnsafeRow.
    val data = (1 to 10).map(i => (i, s"val_$i"))
    withParquetTable(data, "t", false) {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        spark.read.parquet(location).createOrReplaceTempView("p")
        withTempView("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach { _ =>
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          }
          val expected = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(sql("SELECT * FROM p"), expected)
        }
      }
    }
  }
}
Example 84
Source File: HiveDataFrameJoinSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Self-join of a DataFrame where the join keys differ only in letter case. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import spark.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val pairs = Seq((1, "1"), (2, "2")).toDF("key", "value")

    // Built anew for every join: "key" and "Key" are both looked up on the same frame.
    def sameKey = pairs("key") === pairs("Key")

    val plainSelfJoin = pairs.join(pairs, sameKey)
    checkAnswer(plainSelfJoin, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    val onlyValueTwo = pairs.filter($"value" === "2")
    val filteredSelfJoin = pairs.join(onlyValueTwo, sameKey)
    checkAnswer(filteredSelfJoin, Seq(Row(2, "2", 2, "2")))
  }
}
Example 85
Source File: HivePlanTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level checks for queries issued through the Hive-enabled test session. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  // The optimized plan of `cos(null)` is compared with that of a null cast to double.
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")

    def optimizedPlanOf(query: String) = sql(query).queryExecution.optimizedPlan

    comparePlans(
      optimizedPlanOf("SELECT cos(null) AS c FROM t"),
      optimizedPlanOf("SELECT cast(null as double) AS c FROM t"))
  }

  // Two window expressions built from one partitionBy/orderBy spec: a ROWS frame and a
  // RANGE frame. The analyzed plan is expected to contain exactly one Window operator.
  test("window expressions sharing the same partition by and order by clause") {
    val emptyInput = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val byGrpOrderedByVal = Window.partitionBy($"grp").orderBy($"val")

    val rowsFrameSum = sum($"val").over(byGrpOrderedByVal.rowsBetween(-1, 1))
    val rangeFrameSum = sum($"val").over(byGrpOrderedByVal.rangeBetween(-1, 1))
    val analyzed =
      emptyInput.select($"id", rowsFrameSum, rangeFrameSum).queryExecution.analyzed

    val windowOperators = analyzed.collect { case w: logical.Window => w }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 86
Source File: PruneFileSourcePartitionsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.execution.datasources.{CatalogFileIndex, HadoopFsRelation, LogicalRelation, PruneFileSourcePartitions}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.types.StructType

/** Tests for the `PruneFileSourcePartitions` rule against Hive-catalog tables. */
class PruneFileSourcePartitionsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  /** Rule executor that applies only `PruneFileSourcePartitions`, once. */
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("PruneFileSourcePartitions", Once, PruneFileSourcePartitions) :: Nil
  }

  test("PruneFileSourcePartitions should not change the output of LogicalRelation") {
    withTable("test") {
      withTempDir { dir =>
        sql(
          s"""
            |CREATE EXTERNAL TABLE test(i int)
            |PARTITIONED BY (p int)
            |STORED AS parquet
            |LOCATION '${dir.toURI}'""".stripMargin)

        val tableMeta = spark.sharedState.externalCatalog.getTable("default", "test")
        val catalogFileIndex = new CatalogFileIndex(spark, tableMeta, 0)

        // Data schema = table schema minus the partition columns.
        val dataSchema = StructType(tableMeta.schema.filterNot { f =>
          tableMeta.partitionColumnNames.contains(f.name)
        })
        val relation = HadoopFsRelation(
          location = catalogFileIndex,
          partitionSchema = tableMeta.partitionSchema,
          dataSchema = dataSchema,
          bucketSpec = None,
          fileFormat = new ParquetFileFormat(),
          options = Map.empty)(sparkSession = spark)

        val logicalRelation = LogicalRelation(relation, tableMeta)
        val query = Project(Seq('i, 'p), Filter('p === 1, logicalRelation)).analyze

        val optimized = Optimize.execute(query)
        // Every attribute referenced by the optimized plan must still be produced by it.
        assert(optimized.missingInput.isEmpty)
      }
    }
  }

  // The test name was split across two physical lines inside the string literal; it is one
  // literal again so the file compiles.
  test("SPARK-20986 Reset table's statistics after PruneFileSourcePartitions rule") {
    withTable("tbl") {
      spark.range(10).selectExpr("id", "id % 3 as p").write.partitionBy("p").saveAsTable("tbl")
      sql("ANALYZE TABLE tbl COMPUTE STATISTICS")
      val tableStats = spark.sessionState.catalog.getTableMetadata(TableIdentifier("tbl")).stats
      assert(tableStats.isDefined && tableStats.get.sizeInBytes > 0, "tableStats is lost")

      val df = sql("SELECT * FROM tbl WHERE p = 1")

      // Analyzed plan: the relation still carries the whole-table size.
      val sizes1 = df.queryExecution.analyzed.collect {
        case relation: LogicalRelation => relation.catalogTable.get.stats.get.sizeInBytes
      }
      assert(sizes1.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      assert(sizes1(0) == tableStats.get.sizeInBytes)

      // Optimized plan: the relation's size matches its catalog stats and is smaller than
      // the whole-table size.
      val relations = df.queryExecution.optimizedPlan.collect {
        case relation: LogicalRelation => relation
      }
      assert(relations.size === 1, s"Size wrong for:\n ${df.queryExecution}")
      val size2 = relations(0).stats.sizeInBytes
      assert(size2 == relations(0).catalogTable.get.stats.get.sizeInBytes)
      assert(size2 < tableStats.get.sizeInBytes)
    }
  }
}
Example 87
Source File: HiveUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import java.net.URL

import org.apache.hadoop.hive.conf.HiveConf.ConfVars

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils}
import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader}

/** Tests for the configuration and formatting helpers on `HiveUtils`. */
class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  test("newTemporaryConfiguration overwrites listener configurations") {
    Seq(true, false).foreach { useInMemoryDerby =>
      val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
      assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "")
      assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "")
      assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "")
    }
  }

  test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") {
    val key = "spark.hadoop.foo"
    // System properties are JVM-wide; remember the old value so it can be put back and the
    // setting does not leak into tests that run later.
    val previous = sys.props.put(key, "bar")
    try {
      Seq(true, false).foreach { useInMemoryDerby =>
        val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby)
        // The value is expected under "foo", without the "spark.hadoop." prefix.
        assert(!hiveConf.contains("spark.hadoop.foo"))
        assert(hiveConf("foo") === "bar")
      }
    } finally {
      previous match {
        case Some(oldValue) => sys.props.put(key, oldValue)
        case None => sys.props.remove(key)
      }
    }
  }

  test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") {
    val conf = new SparkConf
    val contextClassLoader = Thread.currentThread().getContextClassLoader
    val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader)
    try {
      Thread.currentThread().setContextClassLoader(loader)
      // Passes as long as creating the client does not throw.
      HiveUtils.newClientForMetadata(
        conf,
        SparkHadoopUtil.newConfiguration(conf),
        HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true))
    } finally {
      // Always restore the thread's original context class loader.
      Thread.currentThread().setContextClassLoader(contextClassLoader)
    }
  }

  test("toHiveString correctly handles UDTs") {
    val point = new ExamplePoint(50.0, 50.0)
    val tpe = new ExamplePointUDT()
    assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)")
  }
}
Example 88
Source File: ListTablesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.hive.test.TestHiveSingleton

/**
 * Checks the output of `tables()` and `SHOW TABLES` for three fixtures: a temp view, a Hive
 * table in the current database, and a Hive table in a separate database.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    sessionState.catalog.createTempView(
      "ListTablesSuiteTable", df.logicalPlan, overrideIfExists = true)
    Seq(
      "CREATE TABLE HiveListTablesSuiteTable (key int, value string)",
      "CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB",
      "CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)"
    ).foreach(statement => sql(statement))
  }

  override def afterAll(): Unit = {
    try {
      sessionState.catalog.dropTable(
        TableIdentifier("ListTablesSuiteTable"), ignoreIfNotExists = true, purge = false)
      Seq(
        "DROP TABLE IF EXISTS HiveListTablesSuiteTable",
        "DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable",
        "DROP DATABASE IF EXISTS ListTablesSuiteDB"
      ).foreach(statement => sql(statement))
    } finally {
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    val listings = Seq(tables(), sql("SHOW TABLes"))
    listings.foreach { listing =>
      def entryFor(table: String) = listing.filter(s"tableName = '$table'")

      // We are using default DB.
      checkAnswer(entryFor("listtablessuitetable"), Row("", "listtablessuitetable", true))
      checkAnswer(
        entryFor("hivelisttablessuitetable"),
        Row("default", "hivelisttablessuitetable", false))
      assert(entryFor("hiveindblisttablessuitetable").count() === 0)
    }
  }

  test("getting all tables with a database name") {
    val listings = Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb"))
    listings.foreach { listing =>
      def entryFor(table: String) = listing.filter(s"tableName = '$table'")

      checkAnswer(entryFor("listtablessuitetable"), Row("", "listtablessuitetable", true))
      assert(entryFor("hivelisttablessuitetable").count() === 0)
      checkAnswer(
        entryFor("hiveindblisttablessuitetable"),
        Row("listtablessuitedb", "hiveindblisttablessuitetable", false))
    }
  }
}
Example 89
Source File: HiveVariableSubstitutionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** `SET hivevar:` handling and `${...}` substitution in SQL text. */
class HiveVariableSubstitutionSuite extends QueryTest with TestHiveSingleton {

  /** Runs `SELECT '<text>'`; `text` is passed through verbatim, `${...}` markers included. */
  private def selectQuoted(text: String) = spark.sql(s"SELECT '$text'")

  test("SET hivevar with prefix") {
    spark.sql("SET hivevar:county=gram")
    val county = spark.conf.getOption("county")
    assert(county === Some("gram"))
  }

  test("SET hivevar with dotted name") {
    spark.sql("SET hivevar:eloquent.mosquito.alphabet=zip")
    val alphabet = spark.conf.getOption("eloquent.mosquito.alphabet")
    assert(alphabet === Some("zip"))
  }

  test("hivevar substitution") {
    spark.conf.set("pond", "bus")
    checkAnswer(selectQuoted("${hivevar:pond}"), Seq(Row("bus")))
  }

  test("variable substitution without a prefix") {
    spark.sql("SET hivevar:flask=plaid")
    checkAnswer(selectQuoted("${flask}"), Seq(Row("plaid")))
  }

  test("variable substitution precedence") {
    spark.conf.set("turn.aloof", "questionable")
    spark.sql("SET hivevar:turn.aloof=dime")
    // hivevar clobbers the conf setting
    checkAnswer(selectQuoted("${turn.aloof}"), Seq(Row("dime")))
  }
}
Example 90
Source File: FileFormatWriterSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{QueryTest, Row}
import org.apache.spark.sql.test.SharedSQLContext

/** Write-path checks: which part files get created and which column names are kept. */
class FileFormatWriterSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("empty file should be skipped while write to file") {
    withTempPath { path =>
      val singleRow = spark.range(100).repartition(10).where("id = 50")
      singleRow.write.parquet(path.toString)

      // Regular files only, ignoring names that start with "." or "_".
      def isPartFile(f: java.io.File): Boolean = {
        val name = f.getName
        f.isFile && !(name.startsWith(".") || name.startsWith("_"))
      }

      val partFiles = path.listFiles().filter(isPartFile)
      assert(partFiles.length === 2)
    }
  }

  test("SPARK-22252: FileFormatWriter should respect the input query schema") {
    withTable("t1", "t2", "t3", "t4") {
      val twoColumns = spark.range(1).select('id as 'col1, 'id as 'col2)
      twoColumns.write.saveAsTable("t1")
      spark.sql("select COL1, COL2 from t1").write.saveAsTable("t2")
      checkAnswer(spark.table("t2"), Row(0, 0))

      // Test picking part of the columns when writing.
      val threeColumns = spark.range(1).select('id, 'id as 'col1, 'id as 'col2)
      threeColumns.write.saveAsTable("t3")
      spark.sql("select COL1, COL2 from t3").write.saveAsTable("t4")
      checkAnswer(spark.table("t4"), Row(0, 0))
    }
  }
}
Example 91
Source File: HadoopFsRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.io.{File, FilenameFilter}

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, SortMergeJoinExec}
import org.apache.spark.sql.test.SharedSQLContext

/** Size statistics reported for file-based relations and their effect on join selection. */
class HadoopFsRelationSuite extends QueryTest with SharedSQLContext {

  test("sizeInBytes should be the total size of all files") {
    withTempDir { dir =>
      dir.delete()
      spark.range(1000).write.parquet(dir.toString)

      // ignore hidden files
      val visibleOnly = new FilenameFilter {
        override def accept(dir: File, name: String): Boolean =
          !(name.startsWith(".") || name.startsWith("_"))
      }
      val totalSize = dir.listFiles(visibleOnly).map(_.length()).sum

      val loaded = spark.read.parquet(dir.toString)
      assert(loaded.queryExecution.logical.stats.sizeInBytes === BigInt(totalSize))
    }
  }

  test("SPARK-22790: spark.sql.sources.compressionFactor takes effect") {
    import testImplicits._

    Seq(1.0, 0.5).foreach { compressionFactor =>
      withSQLConf(
          "spark.sql.sources.fileCompressionFactor" -> compressionFactor.toString,
          "spark.sql.autoBroadcastJoinThreshold" -> "400") {
        withTempPath { workDir =>
          // the file size is 740 bytes
          val workDirPath = workDir.getAbsolutePath

          // Writes the same four-row frame under `workDirPath + suffix` and reads it back.
          def roundTrip(suffix: String) = {
            Seq(100, 200, 300, 400).toDF("count").write.parquet(workDirPath + suffix)
            spark.read.parquet(workDirPath + suffix)
          }

          val df1FromFile = roundTrip("/data1")
          val df2FromFile = roundTrip("/data2")
          val joinedDF = df1FromFile.join(df2FromFile, Seq("count"))

          val executed = joinedDF.queryExecution.executedPlan
          val broadcastJoins = executed.collect { case bJoin: BroadcastHashJoinExec => bJoin }
          val sortMergeJoins = executed.collect { case smJoin: SortMergeJoinExec => smJoin }

          if (compressionFactor == 0.5) {
            assert(broadcastJoins.nonEmpty)
            assert(sortMergeJoins.isEmpty)
          } else {
            // compressionFactor is 1.0
            assert(broadcastJoins.isEmpty)
            assert(sortMergeJoins.nonEmpty)
          }
        }
      }
    }
  }
}
Example 92
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.SparkException
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext

/** Behaviour of `ParquetFileFormat.readParquetFootersInParallel` with a non-Parquet input. */
class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext {

  test("read parquet footers in parallel") {
    // Writes two Parquet outputs and one JSON output, then reads footers from all of them
    // and expects exactly two footers back.
    def testReadFooters(ignoreCorruptFiles: Boolean): Unit = withTempDir { dir =>
      val hadoopConf = sparkContext.hadoopConfiguration
      val fs = FileSystem.get(hadoopConf)
      val basePath = dir.getCanonicalPath
      val firstDir = new Path(basePath, "first")
      val secondDir = new Path(basePath, "second")
      val thirdDir = new Path(basePath, "third")

      spark.range(1).toDF("a").coalesce(1).write.parquet(firstDir.toString)
      spark.range(1, 2).toDF("a").coalesce(1).write.parquet(secondDir.toString)
      spark.range(2, 3).toDF("a").coalesce(1).write.json(thirdDir.toString)

      val fileStatuses =
        Seq(firstDir, secondDir, thirdDir).flatMap(location => fs.listStatus(location).toSeq)

      val footers = ParquetFileFormat.readParquetFootersInParallel(
        hadoopConf, fileStatuses, ignoreCorruptFiles)
      assert(footers.size == 2)
    }

    testReadFooters(true)

    val failure = intercept[java.io.IOException] {
      testReadFooters(false)
    }
    assert(failure.getMessage().contains("Could not read footer for file"))
  }
}
Example 93
Source File: DataSourceScanExecRedactionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SharedSQLContext

/** Redaction of file paths (and other patterns) in the textual forms of a query execution. */
class DataSourceScanExecRedactionSuite extends QueryTest with SharedSQLContext {

  override protected def sparkConf: SparkConf =
    super.sparkConf.set("spark.redaction.string.regex", "file:/[\\w_]+")

  test("treeString is redacted") {
    withTempDir { dir =>
      val basePath = dir.getCanonicalPath
      spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
      val df = spark.read.parquet(basePath)
      val qe = df.queryExecution

      val scan = qe.sparkPlan.collect { case s: FileSourceScanExec => s }.head
      val rootPath = scan.relation.location.rootPaths.head
      assert(rootPath.toString.contains(dir.toURI.getPath.stripSuffix("/")))

      // Re-rendered on every use: the four textual forms checked by this test.
      def renderings: Seq[String] = Seq(
        qe.sparkPlan.treeString(verbose = true),
        qe.executedPlan.treeString(verbose = true),
        qe.toString,
        qe.simpleString)

      renderings.foreach { text => assert(!text.contains(rootPath.getName)) }

      val replacement = "*********"
      renderings.foreach { text => assert(text.contains(replacement)) }
    }
  }

  /** True when `msg` occurs in any of the three textual forms of `queryExecution`. */
  private def isIncluded(queryExecution: QueryExecution, msg: String): Boolean = {
    val renderers: Seq[QueryExecution => String] =
      Seq(_.toString, _.simpleString, _.stringWithStats)
    renderers.exists(render => render(queryExecution).contains(msg))
  }

  test("explain is redacted using SQLConf") {
    withTempDir { dir =>
      val basePath = dir.getCanonicalPath
      spark.range(0, 10).toDF("a").write.parquet(new Path(basePath, "foo=1").toString)
      val df = spark.read.parquet(basePath)
      val qe = df.queryExecution
      val replacement = "*********"

      // Respect SparkConf and replace file:/
      assert(isIncluded(qe, replacement))
      assert(isIncluded(qe, "FileScan"))
      assert(!isIncluded(qe, "file:/"))

      withSQLConf(SQLConf.SQL_STRING_REDACTION_PATTERN.key -> "(?i)FileScan") {
        // Respect SQLConf and replace FileScan
        assert(isIncluded(qe, replacement))
        assert(!isIncluded(qe, "FileScan"))
        assert(isIncluded(qe, "file:/"))
      }
    }
  }
}
Example 94
Source File: SparkPlanSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSQLContext

/** Execution entry points of a canonicalized physical plan. */
class SparkPlanSuite extends QueryTest with SharedSQLContext {

  test("SPARK-21619 execution of a canonicalized plan should fail") {
    val canonical = spark.range(10).queryExecution.executedPlan.canonicalized

    // Every execution entry point is expected to throw IllegalStateException.
    val entryPoints: Seq[() => Any] = Seq(
      () => canonical.execute(),
      () => canonical.executeCollect(),
      () => canonical.executeCollectPublic(),
      () => canonical.executeToIterator(),
      () => canonical.executeBroadcast(),
      () => canonical.executeTake(1))

    entryPoints.foreach { run =>
      intercept[IllegalStateException] { run() }
    }
  }
}
Example 95
Source File: SameResultSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.{DataFrame, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext

/** `sameResult` comparisons between physical plans of equivalent queries. */
class SameResultSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("FileSourceScanExec: different orders of data filters and partition filters") {
    withTempPath { path =>
      val tmpDir = path.getCanonicalPath
      val source =
        spark.range(10).selectExpr("id as a", "id + 1 as b", "id + 2 as c", "id + 3 as d")
      source.write.partitionBy("a", "b").parquet(tmpDir)
      val df = spark.read.parquet(tmpDir)

      // partition filters: a > 1 AND b < 9
      // data filters: c > 1 AND d < 9
      val conditions = Seq(
        "a > 1 AND b < 9 AND c > 1 AND d < 9",
        "b < 9 AND a > 1 AND d < 9 AND c > 1")
      val Seq(plan1, plan2) = conditions.map(cond => getFileSourceScanExec(df.where(cond)))
      assert(plan1.sameResult(plan2))
    }
  }

  /** Returns the first `FileSourceScanExec` found in the physical plan of `df`. */
  private def getFileSourceScanExec(df: DataFrame): FileSourceScanExec = {
    df.queryExecution.sparkPlan.collect { case scan: FileSourceScanExec => scan }.head
  }

  test("SPARK-20725: partial aggregate should behave correctly for sameResult") {
    // Each aggregate is built anew for each of the two plans being compared.
    val aggregates = Seq(() => sum($"id"), () => sumDistinct($"id"))
    aggregates.foreach { newAggregate =>
      val left = spark.range(10).agg(newAggregate()).queryExecution.executedPlan
      val right = spark.range(10).agg(newAggregate()).queryExecution.executedPlan
      assert(left.sameResult(right))
    }
  }
}
Example 96
Source File: HiveDataFrameAnalyticsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{DataFrame, QueryTest, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.scalatest.BeforeAndAfterAll

// TODO ideally we should put the test suite into the package `sql`, as
// `hive` package is optional in compiling, however, `SQLContext.sql` doesn't
// support the `cube` or `rollup` yet.
/** Compares DataFrame `rollup` / `cube` / collect aggregates with their SQL counterparts. */
class HiveDataFrameAnalyticsSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext.implicits._
  import hiveContext.sql

  // Assigned in beforeAll, hence a var.
  private var testData: DataFrame = _

  // Explicit `: Unit =` replaces the deprecated procedure syntax; the parent hook is
  // invoked so mixed-in setup is not skipped.
  override def beforeAll(): Unit = {
    super.beforeAll()
    testData = Seq((1, 2), (2, 2), (3, 4)).toDF("a", "b")
    hiveContext.registerDataFrameAsTable(testData, "mytable")
  }

  override def afterAll(): Unit = {
    // try/finally so the parent hook still runs when dropping the table fails.
    try {
      hiveContext.dropTempTable("mytable")
    } finally {
      super.afterAll()
    }
  }

  test("rollup") {
    checkAnswer(
      testData.rollup($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with rollup").collect()
    )

    checkAnswer(
      testData.rollup("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with rollup").collect()
    )
  }

  test("collect functions") {
    checkAnswer(
      testData.select(collect_list($"a"), collect_list($"b")),
      Seq(Row(Seq(1, 2, 3), Seq(2, 2, 4)))
    )
    checkAnswer(
      testData.select(collect_set($"a"), collect_set($"b")),
      Seq(Row(Seq(1, 2, 3), Seq(2, 4)))
    )
  }

  test("cube") {
    checkAnswer(
      testData.cube($"a" + $"b", $"b").agg(sum($"a" - $"b")),
      sql("select a + b, b, sum(a - b) from mytable group by a + b, b with cube").collect()
    )

    checkAnswer(
      testData.cube("a", "b").agg(sum("b")),
      sql("select a, b, sum(b) from mytable group by a, b with cube").collect()
    )
  }
}
Example 97
Source File: QueryPartitionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import com.google.common.io.Files

import org.apache.spark.util.Utils
import org.apache.spark.sql.{QueryTest, _}
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.test.SQLTestUtils

/**
 * Regression test for SPARK-5068: querying a partitioned table must still work after the
 * directory of one of its partitions has been removed from the file system.
 */
class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
  import hiveContext.implicits._

  test("SPARK-5068: query data when path doesn't exist") {
    withSQLConf((SQLConf.HIVE_VERIFY_PARTITION_PATH.key, "true")) {
      val testData = sparkContext.parallelize(
        (1 to 10).map(i => TestData(i, i.toString))).toDF()
      testData.registerTempTable("testData")

      val tmpDir = Files.createTempDir()
      try {
        // create the table for test
        sql(s"CREATE TABLE table_with_partition(key int,value string) " +
          s"PARTITIONED by (ds string) location '${tmpDir.toURI.toString}' ")
        // load the same ten rows into partitions ds=1 .. ds=4
        (1 to 4).foreach { ds =>
          sql(s"INSERT OVERWRITE TABLE table_with_partition partition (ds='$ds') " +
            "SELECT key,value FROM testData")
        }

        // Collected once; every partition holds exactly these rows.
        val partitionRows = testData.toDF.collect

        // test for the exist path
        checkAnswer(sql("select key,value from table_with_partition"),
          partitionRows ++ partitionRows ++ partitionRows ++ partitionRows)

        // delete the path of one partition
        tmpDir.listFiles
          .find { f => f.isDirectory && f.getName().startsWith("ds=") }
          .foreach { f => Utils.deleteRecursively(f) }

        // test for after delete the path
        checkAnswer(sql("select key,value from table_with_partition"),
          partitionRows ++ partitionRows ++ partitionRows)
      } finally {
        // Runs even when an assertion above fails, so the shared Hive context and the
        // local file system are not left dirty for later suites.
        sql("DROP TABLE IF EXISTS table_with_partition")
        // `createAndInsertTest` is not created by this test; the drop is kept from the
        // original code and guarded with IF EXISTS so it cannot fail on a missing table.
        sql("DROP TABLE IF EXISTS createAndInsertTest")
        Utils.deleteRecursively(tmpDir)
      }
    }
  }
}
Example 98
Source File: HiveParquetSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.execution.datasources.parquet.ParquetTest
import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.{QueryTest, Row}

/** Record whose two field names differ only in letter case conventions. */
case class Cases(lower: String, UPPER: String)

/** Runs SQL through the Hive context against Parquet-backed temporary tables. */
class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton {

  test("Case insensitive attribute names") {
    val records = (1 to 4).map { i => Cases(i.toString, i.toString) }
    withParquetTable(records, "cases") {
      val expected = (1 to 4).map { i => Row(i.toString) }
      // Each column is referenced with the opposite casing from its declaration.
      checkAnswer(sql("SELECT upper FROM cases"), expected)
      checkAnswer(sql("SELECT LOWER FROM cases"), expected)
    }
  }

  test("SELECT on Parquet table") {
    val tuples = (1 to 4).map { i => (i, s"val_$i") }
    withParquetTable(tuples, "t") {
      val expectedRows = tuples.map(t => Row.fromTuple(t))
      checkAnswer(sql("SELECT * FROM t"), expectedRows)
    }
  }

  test("Simple column projection + filter on Parquet table") {
    val tuples = (1 to 4).map { i => (i % 2 == 0, i, s"val_$i") }
    withParquetTable(tuples, "t") {
      val evenRows = Seq(Row(true, "val_2"), Row(true, "val_4"))
      checkAnswer(sql("SELECT `_1`, `_3` FROM t WHERE `_1` = true"), evenRows)
    }
  }

  test("Converting Hive to Parquet Table via saveAsParquetFile") {
    withTempPath { dir =>
      val location = dir.getCanonicalPath
      sql("SELECT * FROM src").write.parquet(location)
      hiveContext.read.parquet(location).registerTempTable("p")
      withTempTable("p") {
        val fromParquet = sql("SELECT * from p ORDER BY key").collect().toSeq
        checkAnswer(sql("SELECT * FROM src ORDER BY key"), fromParquet)
      }
    }
  }

  test("INSERT OVERWRITE TABLE Parquet table") {
    val tuples = (1 to 10).map { i => (i, s"val_$i") }
    withParquetTable(tuples, "t") {
      withTempPath { file =>
        val location = file.getCanonicalPath
        sql("SELECT * FROM t LIMIT 1").write.parquet(location)
        hiveContext.read.parquet(location).registerTempTable("p")
        withTempTable("p") {
          // let's do three overwrites for good measure
          (1 to 3).foreach { _ =>
            sql("INSERT OVERWRITE TABLE p SELECT * FROM t")
          }
          val sourceRows = sql("SELECT * FROM t").collect().toSeq
          checkAnswer(sql("SELECT * FROM p"), sourceRows)
        }
      }
    }
  }
}
Example 99
Source File: HiveDataFrameSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.QueryTest

/** Checks that a table can be read through a schema-qualified name. */
class HiveDataFrameSuite extends QueryTest with TestHiveSingleton {
  test("table name with schema") {
    // regression test for SPARK-11778
    try {
      hiveContext.sql("create schema usrdb")
      hiveContext.sql("create table usrdb.test(c int)")
      hiveContext.read.table("usrdb.test")
    } finally {
      // Clean up even when a statement above throws, so a failure here does not leave
      // `usrdb` behind in the Hive context. IF EXISTS keeps the cleanup itself from
      // failing when creation never got that far.
      hiveContext.sql("drop table if exists usrdb.test")
      hiveContext.sql("drop schema if exists usrdb")
    }
  }
}
Example 100
Source File: HiveDataFrameJoinSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Self-join tests where the join condition names a column with different letter case. */
class HiveDataFrameJoinSuite extends QueryTest with TestHiveSingleton {
  import hiveContext.implicits._

  // We should move this into SQL package if we make case sensitivity configurable in SQL.
  test("join - self join auto resolve ambiguity with case insensitivity") {
    val df = Seq((1, "1"), (2, "2")).toDF("key", "value")
    // "key" and "Key" are deliberately spelled differently on the two sides.
    val sameKey = df("key") === df("Key")

    val joinedWithSelf = df.join(df, sameKey)
    checkAnswer(joinedWithSelf, Seq(Row(1, "1", 1, "1"), Row(2, "2", 2, "2")))

    val onlyTwo = df.filter($"value" === "2")
    val joinedWithFiltered = df.join(onlyTwo, sameKey)
    checkAnswer(joinedWithFiltered, Seq(Row(2, "2", 2, "2")))
  }
}
Example 101
Source File: UDFSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.hive.test.TestHiveSingleton

case class FunctionResult(f1: String, f2: String)

/** Registers Scala UDFs and calls them from SQL with names spelled in a different case. */
class UDFSuite extends QueryTest with TestHiveSingleton {

  test("UDF case insensitive") {
    val udfs = hiveContext.udf
    udfs.register("random0", () => { Math.random() })
    udfs.register("RANDOM1", () => { Math.random() })
    udfs.register("strlenScala", (s: String, n: Int) => s.length + n)

    // Each call below uses a casing that differs from the registered name.
    val random0 = hiveContext.sql("SELECT RANDOM0() FROM src LIMIT 1").head().getDouble(0)
    assert(random0 >= 0.0)

    val random1 = hiveContext.sql("SELECT RANDOm1() FROM src LIMIT 1").head().getDouble(0)
    assert(random1 >= 0.0)

    val strlen = hiveContext.sql("SELECT strlenscala('test', 1) FROM src LIMIT 1").head().getInt(0)
    assert(strlen === 5)
  }
}
Example 102
Source File: HiveExplainSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Checks which plan sections and operator names appear in EXPLAIN output. */
class HiveExplainSuite extends QueryTest with SQLTestUtils with TestHiveSingleton {

  // Section headers looked for in EXPLAIN output.
  private val logicalPlanHeaders = Seq(
    "== Parsed Logical Plan ==",
    "== Analyzed Logical Plan ==",
    "== Optimized Logical Plan ==")
  private val physicalPlanHeader = "== Physical Plan =="
  private val allPlanHeaders = logicalPlanHeaders :+ physicalPlanHeader

  test("explain extended command") {
    // Plain EXPLAIN: physical plan present, logical sections absent.
    checkExistence(sql(" explain select * from src where key=123 "), true, physicalPlanHeader)
    checkExistence(sql(" explain select * from src where key=123 "), false,
      logicalPlanHeaders: _*)
    // EXPLAIN EXTENDED: every section present.
    checkExistence(sql(" explain extended select * from src where key=123 "), true,
      allPlanHeaders: _*)
  }

  test("explain create table command") {
    val extendedCtasKeywords =
      allPlanHeaders ++ Seq("CreateTableAsSelect", "InsertIntoHiveTable", "Limit", "src")

    checkExistence(sql("explain create table temp__b as select * from src limit 2"), true,
      physicalPlanHeader, "InsertIntoHiveTable", "Limit", "src")

    checkExistence(sql("explain extended create table temp__b as select * from src limit 2"),
      true, extendedCtasKeywords: _*)

    val ctasWithSerde = sql(
      """
        | EXPLAIN EXTENDED CREATE TABLE temp__b
        | ROW FORMAT SERDE "org.apache.hadoop.hive.serde2.columnar.ColumnarSerDe"
        | WITH SERDEPROPERTIES("serde_p1"="p1","serde_p2"="p2")
        | STORED AS RCFile
        | TBLPROPERTIES("tbl_p1"="p11", "tbl_p2"="p22")
        | AS SELECT * FROM src LIMIT 2
      """.stripMargin)
    checkExistence(ctasWithSerde, true, extendedCtasKeywords: _*)
  }

  test("SPARK-6212: The EXPLAIN output of CTAS only shows the analyzed plan") {
    withTempTable("jt") {
      val jsonLines = (1 to 10).map { i => s"""{"a":$i, "b":"str$i"}""" }
      hiveContext.read.json(sparkContext.parallelize(jsonLines)).registerTempTable("jt")

      val explainRows = sql(
        s"""
           |EXPLAIN EXTENDED
           |CREATE TABLE t1
           |AS
           |SELECT * FROM jt
        """.stripMargin).collect()
      // Flatten all result rows into one string so it can be searched for keywords.
      val outputs = explainRows.map(_.mkString).mkString

      Seq(
        "== Parsed Logical Plan ==",
        "== Analyzed Logical Plan ==",
        "Subquery",
        "== Optimized Logical Plan ==",
        "== Physical Plan ==",
        "CreateTableAsSelect",
        "InsertIntoHiveTable",
        "jt"
      ).foreach { key =>
        assert(outputs.contains(key), s"$key doesn't exist in result")
      }

      val physicalSection = outputs.substring(outputs.indexOf("== Physical Plan =="))
      assert(!physicalSection.contains("Subquery"),
        "Physical Plan should not contain Subquery since it's eliminated by optimizer")
    }
  }
}
Example 103
Source File: HiveOperatorQueryableSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.hive.test.{TestHive, TestHiveSingleton}

/** Checks that the result of a DESCRIBE command can itself be queried as a table. */
class HiveOperatorQueryableSuite extends QueryTest with TestHiveSingleton {
  import hiveContext._

  test("SPARK-5324 query result of describe command") {
    hiveContext.loadTestTable("src")

    // register a describe command to be a temp table
    sql("desc src").registerTempTable("mydesc")

    // Describing the temp table yields the schema of the DESCRIBE output itself.
    val describeSchema = Seq(
      Row("col_name", "string", "name of the column"),
      Row("data_type", "string", "data type of the column"),
      Row("comment", "string", "comment of the column"))
    checkAnswer(sql("desc mydesc"), describeSchema)

    // Selecting from it yields the columns of `src`, with or without explicit projection.
    val srcColumns = Seq(
      Row("key", "int", null),
      Row("value", "string", null))
    checkAnswer(sql("select * from mydesc"), srcColumns)
    checkAnswer(sql("select col_name, data_type, comment from mydesc"), srcColumns)
  }
}
Example 104
Source File: HivePlanTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

/** Plan-level assertions on queries built through the Hive context. */
class HivePlanTest extends QueryTest with TestHiveSingleton {
  import hiveContext.sql
  import hiveContext.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")

    def optimizedPlanOf(query: String) = sql(query).queryExecution.optimizedPlan

    // Both queries are expected to optimize to the same plan.
    comparePlans(
      optimizedPlanOf("SELECT cos(null) FROM t"),
      optimizedPlanOf("SELECT cast(null as double) FROM t"))
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.partitionBy($"grp").orderBy($"val")

    val rowFrameSum = sum($"val").over(window.rowsBetween(-1, 1))
    val rangeFrameSum = sum($"val").over(window.rangeBetween(-1, 1))
    val query = df.select($"id", rowFrameSum, rangeFrameSum)

    val windowOperators = query.queryExecution.analyzed.collect {
      case w: logical.Window => w
    }
    assert(windowOperators.size === 1, "Should have only 1 Window operator.")
  }
}
Example 105
Source File: ListTablesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.sql.hive.test.TestHiveSingleton
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.Row

/**
 * Checks table listing (`tables()` and `SHOW TABLES`) for a temporary table, a Hive table
 * in the current database, and a Hive table in a separate database.
 */
class ListTablesSuite extends QueryTest with TestHiveSingleton with BeforeAndAfterAll {
  import hiveContext._
  import hiveContext.implicits._

  val df = sparkContext.parallelize((1 to 10).map(i => (i, s"str$i"))).toDF("key", "value")

  override def beforeAll(): Unit = {
    // Let the mixed-in traits run their own setup first.
    super.beforeAll()
    // The catalog in HiveContext is a case insensitive one.
    catalog.registerTable(TableIdentifier("ListTablesSuiteTable"), df.logicalPlan)
    sql("CREATE TABLE HiveListTablesSuiteTable (key int, value string)")
    sql("CREATE DATABASE IF NOT EXISTS ListTablesSuiteDB")
    sql("CREATE TABLE ListTablesSuiteDB.HiveInDBListTablesSuiteTable (key int, value string)")
  }

  override def afterAll(): Unit = {
    try {
      catalog.unregisterTable(TableIdentifier("ListTablesSuiteTable"))
      sql("DROP TABLE IF EXISTS HiveListTablesSuiteTable")
      sql("DROP TABLE IF EXISTS ListTablesSuiteDB.HiveInDBListTablesSuiteTable")
      sql("DROP DATABASE IF EXISTS ListTablesSuiteDB")
    } finally {
      // Always continue the teardown chain, even if one of the drops throws.
      super.afterAll()
    }
  }

  test("get all tables of current database") {
    Seq(tables(), sql("SHOW TABLes")).foreach { allTables =>
      // We are using default DB.
      checkAnswer(
        allTables.filter("tableName = 'listtablessuitetable'"),
        Row("listtablessuitetable", true))
      checkAnswer(
        allTables.filter("tableName = 'hivelisttablessuitetable'"),
        Row("hivelisttablessuitetable", false))
      assert(allTables.filter("tableName = 'hiveindblisttablessuitetable'").count() === 0)
    }
  }

  test("getting all tables with a database name") {
    Seq(tables("listtablessuiteDb"), sql("SHOW TABLes in listTablesSuitedb")).foreach {
      allTables =>
        checkAnswer(
          allTables.filter("tableName = 'listtablessuitetable'"),
          Row("listtablessuitetable", true))
        assert(allTables.filter("tableName = 'hivelisttablessuitetable'").count() === 0)
        checkAnswer(
          allTables.filter("tableName = 'hiveindblisttablessuitetable'"),
          Row("hiveindblisttablessuitetable", false))
    }
  }
}
Example 106
Source File: BroadcastJoinSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import scala.reflect.ClassTag

import org.scalatest.BeforeAndAfterAll

import org.apache.spark.{AccumulatorSuite, SparkConf, SparkContext}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{SQLConf, SQLContext, QueryTest}

// NOTE(review): the enclosing suite declaration (and the definition of `sqlContext`) is not
// included in this excerpt; only the helper below and the tests that use it are shown.

  /**
   * Joins two small two-row DataFrames with the right side wrapped in `broadcast(...)`,
   * asserts that the executed plan contains exactly one operator of type `T`, and then
   * runs the plan, all inside `AccumulatorSuite.verifyPeakExecutionMemorySet`.
   *
   * @tparam T physical join operator expected to appear exactly once in the executed plan
   * @param name label forwarded to `verifyPeakExecutionMemorySet`
   * @param joinType join type string passed to `DataFrame.join`
   */
  private def testBroadcastJoin[T: ClassTag](name: String, joinType: String): Unit = {
    // presumably verifies that running the body updates the peak execution memory
    // accumulator -- confirm against AccumulatorSuite
    AccumulatorSuite.verifyPeakExecutionMemorySet(sqlContext.sparkContext, name) {
      val df1 = sqlContext.createDataFrame(Seq((1, "4"), (2, "2"))).toDF("key", "value")
      val df2 = sqlContext.createDataFrame(Seq((1, "1"), (2, "2"))).toDF("key", "value")
      // Comparison at the end is for broadcast left semi join
      val joinExpression = df1("key") === df2("key") && df1("value") > df2("value")
      val df3 = df1.join(broadcast(df2), joinExpression, joinType)
      val plan = df3.queryExecution.executedPlan
      // The ClassTag context bound is what makes the `case p: T` type test checkable.
      assert(plan.collect { case p: T => p }.size === 1)
      plan.executeCollect()
    }
  }

  test("unsafe broadcast hash join updates peak execution memory") {
    testBroadcastJoin[BroadcastHashJoin]("unsafe broadcast hash join", "inner")
  }

  test("unsafe broadcast hash outer join updates peak execution memory") {
    testBroadcastJoin[BroadcastHashOuterJoin]("unsafe broadcast hash outer join", "left_outer")
  }

  test("unsafe broadcast left semi join updates peak execution memory") {
    testBroadcastJoin[BroadcastLeftSemiJoinHash]("unsafe broadcast left semi join", "leftsemi")
  }
}
Example 107
Source File: JsonParsingOptionsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json import org.apache.spark.sql.QueryTest import org.apache.spark.sql.test.SharedSQLContext 'Reynold Xin'}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowComments", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") } test("allowSingleQuotes off") { val str = """{'name': 'Reynold Xin'}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowSingleQuotes", "false").json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowSingleQuotes on") { val str = """{'name': 'Reynold Xin'}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") } test("allowUnquotedFieldNames off") { val str = """{name: 'Reynold Xin'}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowUnquotedFieldNames on") { val str = """{name: 'Reynold Xin'}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowUnquotedFieldNames", "true").json(rdd) assert(df.schema.head.name == "name") assert(df.first().getString(0) == "Reynold Xin") } test("allowNumericLeadingZeros off") { val str = """{"age": 0018}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } test("allowNumericLeadingZeros on") { val str = """{"age": 0018}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowNumericLeadingZeros", "true").json(rdd) assert(df.schema.head.name == "age") assert(df.first().getLong(0) == 18) } // The following two tests are not really working - need to look into Jackson's // 
JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS. ignore("allowNonNumericNumbers off") { val str = """{"age": NaN}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.json(rdd) assert(df.schema.head.name == "_corrupt_record") } ignore("allowNonNumericNumbers on") { val str = """{"age": NaN}""" val rdd = sqlContext.sparkContext.parallelize(Seq(str)) val df = sqlContext.read.option("allowNonNumericNumbers", "true").json(rdd) assert(df.schema.head.name == "age") assert(df.first().getDouble(0).isNaN) } }
Example 108
Source File: TextSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
import org.apache.spark.util.Utils

/** Read/write tests for the `text` data source, using the `text-suite.txt` test resource. */
class TextSuite extends QueryTest with SharedSQLContext {

  test("reading text file") {
    verifyFrame(sqlContext.read.format("text").load(testFile))
  }

  test("SQLContext.read.text() API") {
    verifyFrame(sqlContext.read.text(testFile))
  }

  test("SPARK-12562 verify write.text() can handle column name beyond `value`") {
    val df = sqlContext.read.text(testFile).withColumnRenamed("value", "adwrasdf")

    // withTempPath supplies a path that does not exist yet and removes whatever was
    // written there afterwards, including when verifyFrame fails.
    withTempPath { dir =>
      df.write.text(dir.getCanonicalPath)
      verifyFrame(sqlContext.read.text(dir.getCanonicalPath))
    }
  }

  test("error handling for invalid schema") {
    withTempPath { dir =>
      val df = sqlContext.range(2)
      intercept[AnalysisException] {
        df.write.text(dir.getCanonicalPath)
      }

      intercept[AnalysisException] {
        sqlContext.range(2).select(df("id"), df("id") + 1).write.text(dir.getCanonicalPath)
      }
    }
  }

  /** URL string of the `text-suite.txt` resource, looked up on the context class loader. */
  private def testFile: String = {
    Thread.currentThread().getContextClassLoader.getResource("text-suite.txt").toString
  }

  /** Asserts that `df` has a single string column `value` and the four expected lines. */
  private def verifyFrame(df: DataFrame): Unit = {
    // schema
    assert(df.schema == new StructType().add("value", StringType))

    // verify content
    val data = df.collect()
    assert(data(0) == Row("This is a test file for the text data source"))
    assert(data(1) == Row("1+1"))
    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
    // scalastyle:off
    assert(data(2) == Row("数据砖头"))
    // scalastyle:on
    assert(data(3) == Row("\"doh\""))
    assert(data.length == 4)
  }
}
Example 109
Source File: ParquetCompatibilityTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import scala.collection.JavaConverters.{collectionAsScalaIterableConverter, mapAsJavaMapConverter, seqAsJavaListConverter}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{Path, PathFilter}
import org.apache.parquet.hadoop.api.WriteSupport
import org.apache.parquet.hadoop.api.WriteSupport.WriteContext
import org.apache.parquet.hadoop.{ParquetFileReader, ParquetWriter}
import org.apache.parquet.io.api.RecordConsumer
import org.apache.parquet.schema.{MessageType, MessageTypeParser}

import org.apache.spark.sql.QueryTest

// NOTE(review): the enclosing declaration and `DirectWriteSupport` are not included in this
// excerpt; only `writeDirect` is shown.

  /**
   * Writes a Parquet file at `path` by passing each function in `recordWriters` to a
   * `ParquetWriter` backed by a `DirectWriteSupport`.
   *
   * @param path location of the file to write
   * @param schema Parquet message type in textual form, parsed with `MessageTypeParser`
   * @param metadata key/value pairs handed to `DirectWriteSupport`
   *                 (presumably stored as file metadata -- confirm against that class)
   * @param recordWriters one function per `parquetWriter.write` call; each receives a
   *                      `RecordConsumer`
   */
  def writeDirect(
      path: String,
      schema: String,
      metadata: Map[String, String],
      recordWriters: (RecordConsumer => Unit)*): Unit = {
    val messageType = MessageTypeParser.parseMessageType(schema)
    val writeSupport = new DirectWriteSupport(messageType, metadata)
    val parquetWriter = new ParquetWriter[RecordConsumer => Unit](new Path(path), writeSupport)
    // Close the writer even if one of the record writers throws.
    try recordWriters.foreach(parquetWriter.write) finally parquetWriter.close()
  }
}
Example 110
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

/** Leaf physical operator that ignores its input and emits a single row holding "so fast". */
case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil

  override protected def doExecute(): RDD[InternalRow] = {
    // Literal(...).value gives the internal representation of the string.
    val fastRow: InternalRow = new GenericInternalRow(Array[Any](Literal("so fast").value))
    sparkContext.parallelize(Seq(fastRow))
  }
}

/** Plans a single-column projection of a column named "a" as a [[FastOperator]]. */
object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      Seq(FastOperator(Seq(attr.toAttribute)))
    case _ =>
      // Anything else is left to the other strategies.
      Nil
  }
}

class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") {
    try {
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val slowData = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")

      // Projecting only "a" is handled by TestStrategy.
      checkAnswer(slowData.select("a"), Row("so fast"))

      // Projecting two columns does not match TestStrategy, so the real data comes back.
      checkAnswer(slowData.select("a", "b"), Row("so slow", 1))
    } finally {
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 111
Source File: PartitionedWriteSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{Row, QueryTest}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.Utils

/** Tests for writing DataFrames partitioned by a column and reading them back. */
class PartitionedWriteSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("write many partitions") {
    // withTempPath removes the output even when the assertion below fails.
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val df = sqlContext.range(100).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      checkAnswer(
        sqlContext.read.load(path),
        (0 to 99).map(Row(1, _)))
    }
  }

  test("write many partitions with repeats") {
    withTempPath { dir =>
      val path = dir.getCanonicalPath
      val base = sqlContext.range(100)
      val df = base.unionAll(base).select($"id", lit(1).as("data"))
      df.write.partitionBy("id").save(path)

      // Every id was written twice, so every row is expected twice.
      val oncePerId = (0 to 99).map(Row(1, _))
      checkAnswer(
        sqlContext.read.load(path),
        oncePerId ++ oncePerId)
    }
  }

  test("partitioned columns should appear at the end of schema") {
    withTempPath { f =>
      val path = f.getAbsolutePath
      Seq(1 -> "a").toDF("i", "j").write.partitionBy("i").parquet(path)
      assert(sqlContext.read.parquet(path).schema.map(_.name) == Seq("j", "i"))
    }
  }
}