org.apache.spark.sql.AnalysisException Scala Examples
The following examples show how to use org.apache.spark.sql.AnalysisException.
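Before the full examples, here is a minimal, self-contained sketch of the most common user-facing pattern: catching an AnalysisException raised during query analysis. Note that the AnalysisException constructor is package-private to org.apache.spark.sql, which is why the examples below that throw it directly all live inside Spark's own packages; ordinary application code normally only catches it. The session settings and query in this sketch are illustrative placeholders, not taken from any of the projects below.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object AnalysisExceptionDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("AnalysisExceptionDemo")
      .getOrCreate()

    try {
      // Referencing a table that does not exist fails at analysis time,
      // before any job runs, and surfaces as an AnalysisException.
      spark.sql("SELECT * FROM table_that_does_not_exist").collect()
    } catch {
      case ae: AnalysisException =>
        println(s"Analysis failed: ${ae.getMessage}")
    }

    spark.stop()
  }
}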
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0 | 12 votes |
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

  // NOTE: excerpt; the enclosing class definition is elided in this snippet.
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: JdbcRelationProvider.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 3
Source File: CreateHiveTableAsSelectCommand.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat = tableDesc.withNewStorage(
        inputFormat = tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat = tableDesc.storage.outputFormat
          .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
        compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = true, ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true,
            purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 4
Source File: SparkSQLDriver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.QueryExecution

private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.hiveResultString()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 5
Source File: randomExpressions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

@ExpressionDescription(
  usage = "_FUNC_(a) - Returns a random column with i.i.d. gaussian random distribution.")
case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to randn must be an integer literal.")
  })

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""",
      isNull = "false")
  }
}
Example 6
Source File: TimeWindow.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.CalendarInterval

case class TimeWindow(
    timeColumn: Expression,
    windowDuration: Long,
    slideDuration: Long,
    startTime: Long) extends UnaryExpression
  with ImplicitCastInputTypes
  with Unevaluable
  with NonSQLExpression {

  //////////////////////////
  // SQL Constructors
  //////////////////////////
  def this(
      timeColumn: Expression,
      windowDuration: Expression,
      slideDuration: Expression,
      startTime: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), TimeWindow.parseExpression(startTime))
  }

  def this(timeColumn: Expression, windowDuration: Expression, slideDuration: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), 0)
  }

  def this(timeColumn: Expression, windowDuration: Expression) = {
    this(timeColumn, windowDuration, windowDuration)
  }

  override def child: Expression = timeColumn
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = new StructType()
    .add(StructField("start", TimestampType))
    .add(StructField("end", TimestampType))

  // This expression is replaced in the analyzer.
  override lazy val resolved = false

// NOTE: excerpt; the remaining members of TimeWindow and its companion object are elided here.

case class PreciseTimestamp(child: Expression) extends UnaryExpression with ExpectsInputTypes {
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = LongType
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code +
      s"""boolean ${ev.isNull} = ${eval.isNull};
         |${ctx.javaType(dataType)} ${ev.value} = ${eval.value};
       """.stripMargin)
  }
}
Example 7
Source File: AnalysisTest.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SimpleCatalystConf(caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }
    if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
}
Example 8
Source File: ResolveInlineTablesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}

class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
}
Example 9
Source File: CartesianProductExec.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}

case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {

  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow
        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
}
Example 10
Source File: VariableSubstitutionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }
}
Example 11
Source File: ResolvedDataSourceSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {

  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
      classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
      classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 12
Source File: DDLSourceLoadSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}

class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}
Example 13
Source File: ErrorHandlers.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.web

import java.io.FileNotFoundException
import java.lang.reflect.UndeclaredThrowableException

import daf.error.InvalidRequestException
import it.gov.daf.common.web.ErrorHandler
import org.apache.spark.sql.AnalysisException
import org.ietf.jgss.GSSException
import play.api.mvc.Results

object ErrorHandlers {

  val security: ErrorHandler = {
    case _: GSSException => Results.Unauthorized
  }

  val spark: ErrorHandler = {
    case _: FileNotFoundException => Results.NotFound
    case _: AnalysisException => Results.NotFound
    case error: UndeclaredThrowableException
      if error.getUndeclaredThrowable.isInstanceOf[AnalysisException] => Results.NotFound
  }

  val api: ErrorHandler = {
    case error: InvalidRequestException => Results.BadRequest { error.getMessage }
  }
}
Example 14
Source File: rules.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extensions

import com.pingcap.tispark.statistics.StatisticsManager
import com.pingcap.tispark.utils.ReflectionUtil._
import com.pingcap.tispark.{MetaManager, TiDBRelation, TiTableReference}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation}
import org.apache.spark.sql.catalyst.catalog.TiSessionCatalog
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.{AnalysisException, _}

case class TiResolutionRule(getOrCreateTiContext: SparkSession => TiContext)(
    sparkSession: SparkSession)
  extends Rule[LogicalPlan] {
  protected lazy val meta: MetaManager = tiContext.meta
  private lazy val autoLoad = tiContext.autoLoad
  private lazy val tiCatalog = tiContext.tiCatalog
  private lazy val tiSession = tiContext.tiSession
  private lazy val sqlContext = tiContext.sqlContext
  protected val tiContext: TiContext = getOrCreateTiContext(sparkSession)

  protected val resolveTiDBRelation: TableIdentifier => LogicalPlan =
    tableIdentifier => {
      val dbName = getDatabaseFromIdentifier(tableIdentifier)
      val tableName = tableIdentifier.table
      val table = meta.getTable(dbName, tableName)
      if (table.isEmpty) {
        throw new AnalysisException(s"Table or view '$tableName' not found in database '$dbName'")
      }
      if (autoLoad) {
        StatisticsManager.loadStatisticsInfo(table.get)
      }
      val sizeInBytes = StatisticsManager.estimateTableSize(table.get)
      val tiDBRelation =
        TiDBRelation(tiSession, TiTableReference(dbName, tableName, sizeInBytes), meta)(sqlContext)
      // Use SubqueryAlias so that projects and joins can correctly resolve
      // UnresolvedAttributes in JoinConditions, Projects, Filters, etc.
      newSubqueryAlias(tableName, LogicalRelation(tiDBRelation))
    }

  override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp resolveTiDBRelations

  protected def resolveTiDBRelations: PartialFunction[LogicalPlan, LogicalPlan] = {
    case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier), _, _, _, _)
        if tiCatalog
          .catalogOf(tableIdentifier.database)
          .exists(_.isInstanceOf[TiSessionCatalog]) =>
      i.copy(table = EliminateSubqueryAliases(resolveTiDBRelation(tableIdentifier)))
    case UnresolvedRelation(tableIdentifier)
        if tiCatalog
          .catalogOf(tableIdentifier.database)
          .exists(_.isInstanceOf[TiSessionCatalog]) =>
      resolveTiDBRelation(tableIdentifier)
  }

  private def getDatabaseFromIdentifier(tableIdentifier: TableIdentifier): String =
    tableIdentifier.database.getOrElse(tiCatalog.getCurrentDatabase)
}

case class TiDDLRule(getOrCreateTiContext: SparkSession => TiContext)(sparkSession: SparkSession)
  extends Rule[LogicalPlan] {
  protected lazy val tiContext: TiContext = getOrCreateTiContext(sparkSession)

  override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
    // TODO: support other commands that may concern TiSpark catalog.
    case sd: ShowDatabasesCommand =>
      TiShowDatabasesCommand(tiContext, sd)
    case sd: SetDatabaseCommand =>
      TiSetDatabaseCommand(tiContext, sd)
    case st: ShowTablesCommand =>
      TiShowTablesCommand(tiContext, st)
    case st: ShowColumnsCommand =>
      TiShowColumnsCommand(tiContext, st)
    case dt: DescribeTableCommand =>
      TiDescribeTablesCommand(tiContext, dt)
    case dc: DescribeColumnCommand =>
      TiDescribeColumnCommand(tiContext, dc)
    case ct: CreateTableLikeCommand =>
      TiCreateTableLikeCommand(tiContext, ct)
  }
}
Example 15
Source File: CreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 16
Source File: TestHiveSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.test.{TestHiveSingleton, TestHiveSparkSession}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils

class TestHiveSuite extends TestHiveSingleton with SQLTestUtils {
  test("load test table based on case sensitivity") {
    val testHiveSparkSession = spark.asInstanceOf[TestHiveSparkSession]

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
      sql("SELECT * FROM SRC").queryExecution.analyzed
      assert(testHiveSparkSession.getLoadedTables.contains("src"))
      assert(testHiveSparkSession.getLoadedTables.size == 1)
    }
    testHiveSparkSession.reset()

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
      val err = intercept[AnalysisException] {
        sql("SELECT * FROM SRC").queryExecution.analyzed
      }
      assert(err.message.contains("Table or view not found"))
    }
    testHiveSparkSession.reset()
  }

  test("SPARK-15887: hive-site.xml should be loaded") {
    assert(hiveClient.getConf("hive.in.test", "") == "true")
  }
}
Example 17
Source File: SparkSQLDriver.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}

private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) {
        execution.hiveResultString()
      }
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 18
Source File: HiveMetastoreLazyInitializationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.util.Utils

class HiveMetastoreLazyInitializationSuite extends SparkFunSuite {

  test("lazily initialize Hive client") {
    val spark = SparkSession.builder()
      .appName("HiveMetastoreLazyInitializationSuite")
      .master("local[2]")
      .enableHiveSupport()
      .config("spark.hadoop.hive.metastore.uris", "thrift://127.0.0.1:11111")
      .getOrCreate()
    val originalLevel = org.apache.log4j.Logger.getRootLogger().getLevel
    try {
      // Avoid outputting a lot of expected warning logs
      spark.sparkContext.setLogLevel("error")

      // We should be able to run Spark jobs without Hive client.
      assert(spark.sparkContext.range(0, 1).count() === 1)

      // We should be able to use Spark SQL if no table references.
      assert(spark.sql("select 1 + 1").count() === 1)
      assert(spark.range(0, 1).count() === 1)

      // We should be able to use fs
      val path = Utils.createTempDir()
      path.delete()
      try {
        spark.range(0, 1).write.parquet(path.getAbsolutePath)
        assert(spark.read.parquet(path.getAbsolutePath).count() === 1)
      } finally {
        Utils.deleteRecursively(path)
      }

      // Make sure that we are not using the local derby metastore.
      val exceptionString = Utils.exceptionString(intercept[AnalysisException] {
        spark.sql("show tables")
      })
      for (msg <- Seq(
          "show tables",
          "Could not connect to meta store",
          "org.apache.thrift.transport.TTransportException",
          "Connection refused")) {
        exceptionString.contains(msg)
      }
    } finally {
      spark.sparkContext.setLogLevel(originalLevel.toString)
      spark.stop()
    }
  }
}
Example 19
Source File: XSQLAnalyzeTableCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.execution.command.{CommandUtils, RunnableCommand}
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLAnalyzeTableCommand(tableIdent: TableIdentifier, noscan: Boolean = true)
  extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sessionState = sparkSession.sessionState
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val catalogDB = catalog.getUsedCatalogDatabase(tableIdent.dataSource, tableIdent.database)
    if (catalogDB == None) {
      return Seq.empty[Row]
    }
    val ds = catalogDB.get.dataSourceName
    val db = catalogDB.get.name
    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db), Some(ds))
    val tableMeta = catalog.getRawTable(tableIdentWithDB)
    if (tableMeta.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
    }

    // Compute stats for the whole table
    val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
    val newRowCount =
      if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

    // Update the metastore if the above statistics of the table are different from those
    // recorded in the metastore.
    val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
    if (newStats.isDefined) {
      catalog.alterTableStats(tableIdentWithDB, newStats)
    }

    Seq.empty[Row]
  }
}
Example 20
Source File: XSQLCreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 21
Source File: SchemaUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType

  // NOTE: excerpt; the enclosing SchemaUtils object definition is elided in this snippet.
  def checkColumnNameDuplication(
      columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new AnalysisException(
        s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }
}
Example 22
Source File: HyperLogLogPlusPlus.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.HyperLogLogPlusPlusHelper
import org.apache.spark.sql.types._

// scalastyle:off

  // NOTE: excerpt; the enclosing HyperLogLogPlusPlus class definition is elided in this snippet.
  override def eval(buffer: InternalRow): Any = {
    hllppHelper.query(buffer, mutableAggBufferOffset)
  }
}

object HyperLogLogPlusPlus {
  def validateDoubleLiteral(exp: Expression): Double = exp match {
    case Literal(d: Double, DoubleType) => d
    case Literal(dec: Decimal, _) => dec.toDouble
    case _ =>
      throw new AnalysisException("The second argument should be a double literal.")
  }
}
Example 23
Source File: randomExpressions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

// scalastyle:off line.size.limit
@ExpressionDescription(
  usage = """_FUNC_([seed]) - Returns a random value with independent and identically distributed (i.i.d.) values drawn from the standard normal distribution.""",
  examples = """
    Examples:
      > SELECT _FUNC_();
       -0.3254147983080288
      > SELECT _FUNC_(0);
       1.1164209726833079
      > SELECT _FUNC_(null);
       1.1164209726833079
  """,
  note = "The function is non-deterministic in general case.")
// scalastyle:on line.size.limit
case class Randn(child: Expression) extends RDG with ExpressionWithRandomSeed {

  def this() = this(Literal(Utils.random.nextLong(), LongType))

  override def withNewSeed(seed: Long): Randn = Randn(Literal(seed, LongType))

  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = classOf[XORShiftRandom].getName
    val rngTerm = ctx.addMutableState(className, "rng")
    ctx.addPartitionInitializationStatement(
      s"$rngTerm = new $className(${seed}L + partitionIndex);")
    ev.copy(code = code"""
      final ${CodeGenerator.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""",
      isNull = FalseLiteral)
  }

  override def freshCopy(): Randn = Randn(child)
}

object Randn {
  def apply(seed: Long): Randn = Randn(Literal(seed, LongType))
}
Example 24
Source File: TimeWindow.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.CalendarInterval

case class TimeWindow(
    timeColumn: Expression,
    windowDuration: Long,
    slideDuration: Long,
    startTime: Long) extends UnaryExpression
  with ImplicitCastInputTypes
  with Unevaluable
  with NonSQLExpression {

  //////////////////////////
  // SQL Constructors
  //////////////////////////
  def this(
      timeColumn: Expression,
      windowDuration: Expression,
      slideDuration: Expression,
      startTime: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), TimeWindow.parseExpression(startTime))
  }

  def this(timeColumn: Expression, windowDuration: Expression, slideDuration: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), 0)
  }

  def this(timeColumn: Expression, windowDuration: Expression) = {
    this(timeColumn, windowDuration, windowDuration)
  }

  override def child: Expression = timeColumn
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = new StructType()
    .add(StructField("start", TimestampType))
    .add(StructField("end", TimestampType))

  // This expression is replaced in the analyzer.
  override lazy val resolved = false

// NOTE: excerpt; the remaining members of TimeWindow and its companion object are elided here.

case class PreciseTimestampConversion(
    child: Expression,
    fromType: DataType,
    toType: DataType) extends UnaryExpression with ExpectsInputTypes {
  override def inputTypes: Seq[AbstractDataType] = Seq(fromType)
  override def dataType: DataType = toType
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code +
      code"""boolean ${ev.isNull} = ${eval.isNull};
         |${CodeGenerator.javaType(dataType)} ${ev.value} = ${eval.value};
       """.stripMargin)
  }
  override def nullSafeEval(input: Any): Any = input
}
Example 25
Source File: view.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf

object EliminateView extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // The child should have the same output attributes with the View operator, so we simply
    // remove the View operator.
    case View(_, output, child) =>
      assert(output == child.output,
        s"The output of the child ${child.output.mkString("[", ",", "]")} is different from the " +
          s"view output ${output.mkString("[", ",", "]")}")
      child
  }
}
Example 26
Source File: ResolveTableValuedFunctions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}

  // NOTE: excerpt; the object definition and the beginning of the builtinFunctions map are
  // elided in this snippet.
    tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
        "numPartitions" -> IntegerType) {
      case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
        Range(start, end, step, Some(numPartitions))
    })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      // The whole resolution is somewhat difficult to understand here due to too much abstractions.
      // We should probably rewrite the following at some point. Reynold was just here to improve
      // error messages and didn't have time to do a proper rewrite.
      val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match {
        case Some(tvf) =>

          def failAnalysis(): Nothing = {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                |cannot be applied to: ($argTypes)""".stripMargin)
          }

          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                try {
                  Some(resolver(casted.map(_.eval())))
                } catch {
                  case e: AnalysisException =>
                    failAnalysis()
                }
              case _ => None
            }
          }
          resolved.headOption.getOrElse {
            failAnalysis()
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }

      // If alias names assigned, add `Project` with the aliases
      if (u.outputNames.nonEmpty) {
        val outputAttrs = resolvedFunc.output
        // Checks if the number of the aliases is equal to expected one
        if (u.outputNames.size != outputAttrs.size) {
          u.failAnalysis(s"Number of given aliases does not match number of output columns. " +
            s"Function name: ${u.functionName}; number of aliases: " +
            s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.")
        }
        val aliases = outputAttrs.zip(u.outputNames).map { case (attr, name) => Alias(attr, name)() }
        Project(aliases, resolvedFunc)
      } else {
        resolvedFunc
      }
  }
}
Example 27
Source File: StringUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import java.util.regex.{Pattern, PatternSyntaxException}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.unsafe.types.UTF8String

object StringUtils {

  def filterPattern(names: Seq[String], pattern: String): Seq[String] = {
    val funcNames = scala.collection.mutable.SortedSet.empty[String]
    pattern.trim().split("\\|").foreach { subPattern =>
      try {
        val regex = ("(?i)" + subPattern.replaceAll("\\*", ".*")).r
        funcNames ++= names.filter{ name => regex.pattern.matcher(name).matches() }
      } catch {
        case _: PatternSyntaxException =>
      }
    }
    funcNames.toSeq
  }
}
Example 28
Source File: SchemaUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType

class SchemaUtilsSuite extends SparkFunSuite {

  private def resolver(caseSensitiveAnalysis: Boolean): Resolver = {
    if (caseSensitiveAnalysis) {
      caseSensitiveResolution
    } else {
      caseInsensitiveResolution
    }
  }

  Seq((true, ("a", "a"), ("b", "b")), (false, ("a", "A"), ("b", "B"))).foreach {
      case (caseSensitive, (a0, a1), (b0, b1)) =>

    val testType = if (caseSensitive) "case-sensitive" else "case-insensitive"
    test(s"Check column name duplication in $testType cases") {
      def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = {
        val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " +
          duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ")
        val schema = StructType.fromDDL(schemaStr)
        var msg = intercept[AnalysisException] {
          SchemaUtils.checkSchemaColumnNameDuplication(
            schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
      }

      checkExceptionCases(s"$a0 INT, b INT, $a1 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, b INT, $a1 INT, $a0 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, $b0 INT, $a1 INT, $a0 INT, $b1 INT", b0 :: a0 :: Nil)
    }
  }

  test("Check no exception thrown for valid schemas") {
    def checkNoExceptionCases(schemaStr: String, caseSensitive: Boolean): Unit = {
      val schema = StructType.fromDDL(schemaStr)
      SchemaUtils.checkSchemaColumnNameDuplication(
        schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
    }

    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = true)
    checkNoExceptionCases("Aa INT, b INT, aA INT", caseSensitive = true)
    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = false)
  }
}
Example 29
Source File: ResolveLambdaVariablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types.{ArrayType, IntegerType} class ResolveLambdaVariablesSuite extends PlanTest { import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ object Analyzer extends RuleExecutor[LogicalPlan] { val batches = Batch("Resolution", FixedPoint(4), ResolveLambdaVariables(conf)) :: Nil } private val key = 'key.int private val values1 = 'values1.array(IntegerType) private val values2 = 'values2.array(ArrayType(ArrayType(IntegerType))) private val data = LocalRelation(Seq(key, values1, values2)) private val lvInt = NamedLambdaVariable("x", IntegerType, nullable = true) private val lvHiddenInt = NamedLambdaVariable("col0", IntegerType, nullable = true) private val lvArray = NamedLambdaVariable("x", ArrayType(IntegerType), nullable = true) private def plan(e: Expression): LogicalPlan = data.select(e.as("res")) private def checkExpression(e1: Expression, e2: Expression): Unit = { comparePlans(Analyzer.execute(plan(e1)), plan(e2)) } private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name)) test("resolution - no op") { checkExpression(key, key) } test("resolution - simple") { val in = ArrayTransform(values1, LambdaFunction(lv('x) + 1, lv('x) :: Nil)) val out = ArrayTransform(values1, LambdaFunction(lvInt + 1, lvInt :: Nil)) checkExpression(in, out) } test("resolution - nested") { val in = ArrayTransform(values2, LambdaFunction( ArrayTransform(lv('x), LambdaFunction(lv('x) + 1, lv('x) :: Nil)), lv('x) :: Nil)) val out = ArrayTransform(values2, LambdaFunction( ArrayTransform(lvArray, LambdaFunction(lvInt + 1, lvInt :: Nil)), lvArray :: Nil)) checkExpression(in, out) } test("resolution - hidden") { val in = ArrayTransform(values1, key) val out = ArrayTransform(values1, LambdaFunction(key, lvHiddenInt :: Nil, hidden = true)) checkExpression(in, out) } test("fail - name collisions") { val p = plan(ArrayTransform(values1, LambdaFunction(lv('x) + lv('X), lv('x) :: lv('X) :: Nil))) val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage assert(msg.contains("arguments should not have names that are semantically the same")) } test("fail - lambda arguments") { val p = plan(ArrayTransform(values1, LambdaFunction(lv('x) + lv('y) + lv('z), lv('x) :: lv('y) :: lv('z) :: Nil))) val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage assert(msg.contains("does not match the number of arguments expected")) } }
Example 30
Source File: AnalysisTest.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import java.net.URI import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf trait AnalysisTest extends PlanTest { protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true) protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false) private def makeAnalyzer(caseSensitive: Boolean): Analyzer = { val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive) val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf) catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true) catalog.createTempView("TaBlE2", TestRelations.testRelation2, overrideIfExists = true) catalog.createTempView("TaBlE3", TestRelations.testRelation3, overrideIfExists = true) new Analyzer(catalog, conf) { override val extendedResolutionRules = EliminateSubqueryAliases :: Nil } } protected def getAnalyzer(caseSensitive: Boolean) = { if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer } protected def checkAnalysis( inputPlan: LogicalPlan, expectedPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val actualPlan = analyzer.executeAndCheck(inputPlan) comparePlans(actualPlan, expectedPlan) } protected override def comparePlans( plan1: LogicalPlan, plan2: LogicalPlan, checkAnalysis: Boolean = false): Unit = { // Analysis tests may have not been fully resolved, so skip checkAnalysis. super.comparePlans(plan1, plan2, checkAnalysis) } protected def assertAnalysisSuccess( inputPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val analysisAttempt = analyzer.execute(inputPlan) try analyzer.checkAnalysis(analysisAttempt) catch { case a: AnalysisException => fail( s""" |Failed to Analyze Plan |$inputPlan | |Partial Analysis |$analysisAttempt """.stripMargin, a) } } protected def assertAnalysisError( inputPlan: LogicalPlan, expectedErrors: Seq[String], caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val e = intercept[AnalysisException] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } if (!expectedErrors.map(_.toLowerCase(Locale.ROOT)).forall( e.getMessage.toLowerCase(Locale.ROOT).contains)) { fail( s"""Exception message should contain the following substrings: | | ${expectedErrors.mkString("\n ")} | |Actual exception message: | | ${e.getMessage} """.stripMargin) } } }
Example 31
Source File: ResolveInlineTablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types.{LongType, NullType, TimestampType} class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(conf)(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables(conf).convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("convert TimeZoneAwareExpression") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType)))) val withTimeZone = ResolveTimeZone(conf).apply(table) val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone) val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType) .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long] assert(output.map(_.dataType) == Seq(TimestampType)) assert(data.size == 1) assert(data.head.getLong(0) == correct) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables(conf).convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables(conf).convert(table2) assert(converted2.schema.fields(0).nullable) } }
Example 32
Source File: ResolveSubquerySuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{InSubquery, ListQuery}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}

class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(
      InSubquery(Seq(a), ListQuery(Project(Seq(UnresolvedAttribute("a")), t2))),
      t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr))
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
}
Example 33
Source File: CheckCartesianProductsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
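The same check surfaces through the public DataFrame API: with spark.sql.crossJoin.enabled set to false (the Spark 2.x default), an unconditioned inner join is rejected at planning time. The following is a hedged sketch, not part of the original suite; the session settings and sample data are illustrative.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object CrossJoinCheckSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cross-join-check").getOrCreate()
    import spark.implicits._
    spark.conf.set("spark.sql.crossJoin.enabled", "false")
    val left = Seq((1, "a")).toDF("id", "l")
    val right = Seq((1, "b")).toDF("id", "r")
    try {
      // No join condition, so CheckCartesianProducts reports an implicit cartesian product.
      left.join(right).collect()
    } catch {
      case e: AnalysisException => println(s"Rejected: ${e.getMessage}")
    }
    spark.stop()
  }
}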
Example 34
Source File: JdbcRelationProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val resolver = sqlContext.conf.resolver val timeZoneId = sqlContext.conf.sessionLocalTimeZone val schema = JDBCRelation.getSchema(resolver, jdbcOptions) val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions) JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val options = new JdbcOptionsInWrite(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() try { val tableExists = JdbcUtils.tableExists(conn, options) if (tableExists) { mode match { case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, options.table, options) createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } case SaveMode.Append => val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '${options.table}' already exists. " + s"SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
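From the user side, the SaveMode.ErrorIfExists branch above is what a DataFrameWriter.jdbc call hits when the target table already exists. A sketch under stated assumptions: the JDBC URL, credentials, and table name below are placeholders, and a matching JDBC driver must be on the classpath.

import java.util.Properties

import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}

object JdbcErrorIfExistsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jdbc-save-modes").getOrCreate()
    import spark.implicits._
    val url = "jdbc:postgresql://localhost:5432/testdb" // placeholder connection details
    val props = new Properties()
    props.setProperty("user", "test")
    props.setProperty("password", "test")
    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    // First write creates the table.
    df.write.mode(SaveMode.Overwrite).jdbc(url, "demo_table", props)
    try {
      // The table now exists, so ErrorIfExists takes the AnalysisException branch above.
      df.write.mode(SaveMode.ErrorIfExists).jdbc(url, "demo_table", props)
    } catch {
      case e: AnalysisException => println(s"Save rejected: ${e.getMessage}")
    }
    spark.stop()
  }
}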
Example 35
Source File: DataSourceUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types._

object DataSourceUtils {

  private def verifySchema(format: FileFormat, schema: StructType, isReadPath: Boolean): Unit = {
    schema.foreach { field =>
      if (!format.supportDataType(field.dataType, isReadPath)) {
        throw new AnalysisException(
          s"$format data source does not support ${field.dataType.catalogString} data type.")
      }
    }
  }

  // SPARK-24626: Metadata files and temporary files should not be
  // counted as data files, so that they shouldn't participate in tasks like
  // location size calculation.
  private[sql] def isDataPath(path: Path): Boolean = {
    val name = path.getName
    !(name.startsWith("_") || name.startsWith("."))
  }
}
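verifySchema is what turns an unsupported column type into a user-facing error. As a hedged illustration (Spark 2.4-style behaviour; the output path is a placeholder chosen here), writing an array column to CSV fails analysis with the message pattern shown above.

import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.functions.array

object UnsupportedTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("csv-unsupported-type").getOrCreate()
    import spark.implicits._
    val df = Seq((1, 2)).toDF("a", "b").select(array($"a", $"b").as("arr"))
    try {
      // CSV cannot represent array columns, so the write is rejected before any data is produced.
      df.write.csv("/tmp/unsupported-type-demo") // placeholder output path
    } catch {
      case e: AnalysisException => println(s"Write rejected: ${e.getMessage}")
    }
    spark.stop()
  }
}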
Example 36
Source File: AnalyzeTableCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType

case class AnalyzeTableCommand(
    tableIdent: TableIdentifier,
    noscan: Boolean = true) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sessionState = sparkSession.sessionState
    val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase)
    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db))
    val tableMeta = sessionState.catalog.getTableMetadata(tableIdentWithDB)
    if (tableMeta.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
    }

    // Compute stats for the whole table
    val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
    val newRowCount =
      if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

    // Update the metastore if the above statistics of the table are different from those
    // recorded in the metastore.
    val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
    if (newStats.isDefined) {
      sessionState.catalog.alterTableStats(tableIdentWithDB, newStats)
    }

    Seq.empty[Row]
  }
}
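The view check above can be reproduced from SQL. A minimal sketch, assuming a local session with a writable spark-warehouse directory; the table and view names are chosen here for illustration only.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object AnalyzeViewSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("analyze-view").getOrCreate()
    spark.sql("CREATE TABLE analyze_demo (id INT) USING parquet")
    spark.sql("CREATE VIEW analyze_demo_view AS SELECT id FROM analyze_demo")
    try {
      // Views carry no table-level statistics, so the command above rejects them.
      spark.sql("ANALYZE TABLE analyze_demo_view COMPUTE STATISTICS")
    } catch {
      case e: AnalysisException => println(s"Rejected: ${e.getMessage}")
    }
    spark.stop()
  }
}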
Example 37
Source File: RateStreamProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import java.util.Optional import org.apache.spark.network.util.JavaUtils import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.v2._ import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, MicroBatchReader} import org.apache.spark.sql.types._ def valueAtSecond(seconds: Long, rowsPerSecond: Long, rampUpTimeSeconds: Long): Long = { // E.g., rampUpTimeSeconds = 4, rowsPerSecond = 10 // Then speedDeltaPerSecond = 2 // // seconds = 0 1 2 3 4 5 6 // speed = 0 2 4 6 8 10 10 (speedDeltaPerSecond * seconds) // end value = 0 2 6 12 20 30 40 (0 + speedDeltaPerSecond * seconds) * (seconds + 1) / 2 val speedDeltaPerSecond = rowsPerSecond / (rampUpTimeSeconds + 1) if (seconds <= rampUpTimeSeconds) { // Calculate "(0 + speedDeltaPerSecond * seconds) * (seconds + 1) / 2" in a special way to // avoid overflow if (seconds % 2 == 1) { (seconds + 1) / 2 * speedDeltaPerSecond * seconds } else { seconds / 2 * speedDeltaPerSecond * (seconds + 1) } } else { // rampUpPart is just a special case of the above formula: rampUpTimeSeconds == seconds val rampUpPart = valueAtSecond(rampUpTimeSeconds, rowsPerSecond, rampUpTimeSeconds) rampUpPart + (seconds - rampUpTimeSeconds) * rowsPerSecond } } }
Example 38
Source File: JdbcUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.types._ class JdbcUtilsSuite extends SparkFunSuite { val tableSchema = StructType(Seq( StructField("C1", StringType, false), StructField("C2", IntegerType, false))) val caseSensitive = org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution val caseInsensitive = org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution test("Parse user specified column types") { assert(JdbcUtils.getCustomSchema(tableSchema, null, caseInsensitive) === tableSchema) assert(JdbcUtils.getCustomSchema(tableSchema, "", caseInsensitive) === tableSchema) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseInsensitive) === StructType(Seq(StructField("C1", DateType, false), StructField("C2", IntegerType, false)))) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseSensitive) === StructType(Seq(StructField("C1", StringType, false), StructField("C2", IntegerType, false)))) assert( JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseInsensitive) === StructType(Seq(StructField("C1", DateType, false), StructField("C2", StringType, false)))) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseSensitive) === StructType(Seq(StructField("C1", StringType, false), StructField("C2", StringType, false)))) // Throw AnalysisException val duplicate = intercept[AnalysisException]{ JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, c1 STRING", caseInsensitive) === StructType(Seq(StructField("c1", DateType, false), StructField("c1", StringType, false))) } assert(duplicate.getMessage.contains( "Found duplicate column(s) in the customSchema option value")) // Throw ParseException val dataTypeNotSupported = intercept[ParseException]{ JdbcUtils.getCustomSchema(tableSchema, "c3 DATEE, C2 STRING", caseInsensitive) === StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false))) } assert(dataTypeNotSupported.getMessage.contains("DataType datee is not supported")) val mismatchedInput = intercept[ParseException]{ JdbcUtils.getCustomSchema(tableSchema, "c3 DATE. C2 STRING", caseInsensitive) === StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false))) } assert(mismatchedInput.getMessage.contains("mismatched input '.' expecting")) } }
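The customSchema handling exercised above is reachable through the JDBC reader option of the same name. A sketch under stated assumptions: the connection details below are placeholders and a reachable JDBC source is required before the option is applied.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object CustomSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jdbc-custom-schema").getOrCreate()
    try {
      spark.read
        .format("jdbc")
        .option("url", "jdbc:postgresql://localhost:5432/testdb") // placeholder
        .option("dbtable", "people")                              // placeholder
        .option("user", "test")
        .option("password", "test")
        .option("customSchema", "id DECIMAL(38, 0), id STRING")   // duplicate column name
        .load()
    } catch {
      case e: AnalysisException =>
        // Duplicate names in customSchema are rejected with the message asserted above.
        println(s"Rejected: ${e.getMessage}")
    }
    spark.stop()
  }
}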
Example 39
Source File: QueryExecutionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
import org.apache.spark.sql.test.SharedSQLContext

class QueryExecutionSuite extends SharedSQLContext {

  test("toString() exception/error handling") {
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
      })

    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation())

    // Nothing!
    assert(qe.toString.contains("OneRowRelation"))

    // Throw an AnalysisException - this should be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new AnalysisException("exception")
      })
    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))

    // Throw an Error - this should not be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new Error("error")
      })
    val error = intercept[Error](qe.toString)
    assert(error.getMessage.contains("error"))
  }
}
Example 40
Source File: ExecutorSideSQLConfSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.execution.debug.codegenStringSeq import org.apache.spark.sql.functions.col import org.apache.spark.sql.test.SQLTestUtils class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils { import testImplicits._ protected var spark: SparkSession = null // Create a new [[SparkSession]] running in local-cluster mode. override def beforeAll(): Unit = { super.beforeAll() spark = SparkSession.builder() .master("local-cluster[2,1,1024]") .appName("testing") .getOrCreate() } override def afterAll(): Unit = { spark.stop() spark = null } override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { pairs.foreach { case (k, v) => SQLConf.get.setConfString(k, v) } try f finally { pairs.foreach { case (k, _) => SQLConf.get.unsetConf(k) } } } test("ReadOnlySQLConf is correctly created at the executor side") { withSQLConf("spark.sql.x" -> "a") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString("spark.sql.x") == "a") }.collect() assert(checks.forall(_ == true)) } } test("case-sensitive config should work for json schema inference") { withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { withTempPath { path => val pathString = path.getCanonicalPath spark.range(10).select('id.as("ID")).write.json(pathString) spark.range(10).write.mode("append").json(pathString) assert(spark.read.json(pathString).columns.toSet == Set("id", "ID")) } } } test("SPARK-24727 CODEGEN_CACHE_MAX_ENTRIES is correctly referenced at the executor side") { withSQLConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key -> "300") { val checks = spark.range(10).mapPartitions { _ => val conf = SQLConf.get Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key) == "300") }.collect() assert(checks.forall(_ == true)) } } test("SPARK-22219: refactor to control to generate comment") { Seq(true, false).foreach { flag => withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) { val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count() .queryExecution.executedPlan) assert(res.length == 2) assert(res.forall { case (_, code) => (code.contains("* Codegend pipeline") == flag) && (code.contains("// input[") == flag) }) } } } }
Example 41
Source File: VariableSubstitutionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }
}
Example 42
Source File: ResolvedDataSourceSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.test.SharedSQLContext class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext { private def getProvidingClass(name: String): Class[_] = DataSource( sparkSession = spark, className = name, options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID) ).providingClass test("jdbc") { assert( getProvidingClass("jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) } test("json") { assert( getProvidingClass("json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) } test("parquet") { assert( getProvidingClass("parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) } test("csv") { assert( getProvidingClass("csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) assert( getProvidingClass("com.databricks.spark.csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) } test("avro: show deploy guide for loading the external avro module") { Seq("avro", "org.apache.spark.sql.avro").foreach { provider => val message = intercept[AnalysisException] { getProvidingClass(provider) }.getMessage assert(message.contains(s"Failed to find data source: $provider")) assert(message.contains("Please deploy the application as per the deployment section of")) } } test("kafka: show deploy guide for loading the external kafka module") { val message = intercept[AnalysisException] { getProvidingClass("kafka") }.getMessage assert(message.contains("Failed to find data source: kafka")) assert(message.contains("Please deploy the application as per the deployment section of")) } test("error message for unknown data sources") { val error = intercept[ClassNotFoundException] { getProvidingClass("asfdwefasdfasdf") } assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf.")) } }
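The avro and kafka lookups above correspond to what an application sees when the external module is not on the classpath. A small hedged sketch (Spark 2.4 without the spark-avro package; the input path is a placeholder):

import org.apache.spark.sql.{AnalysisException, SparkSession}

object MissingDataSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("missing-data-source").getOrCreate()
    try {
      // Fails with "Failed to find data source: avro" unless spark-avro is deployed with the app.
      spark.read.format("avro").load("/tmp/events.avro") // placeholder path
    } catch {
      case e: AnalysisException => println(s"Lookup failed: ${e.getMessage}")
    }
    spark.stop()
  }
}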
Example 43
Source File: DDLSourceLoadSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ // please note that the META-INF/services had to be modified for the test directory for this to work class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { test("data sources with the same name - internal data sources") { val e = intercept[AnalysisException] { spark.read.format("Fluet da Bomb").load() } assert(e.getMessage.contains("Multiple sources found for Fluet da Bomb")) } test("data sources with the same name - internal data source/external data source") { assert(spark.read.format("datasource").load().schema == StructType(Seq(StructField("longType", LongType, nullable = false)))) } test("data sources with the same name - external data sources") { val e = intercept[AnalysisException] { spark.read.format("Fake external source").load() } assert(e.getMessage.contains("Multiple sources found for Fake external source")) } test("load data source from format alias") { assert(spark.read.format("gathering quorum").load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))) } test("specify full classname with duplicate formats") { assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))) } } class FakeSourceOne extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceTwo extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("integerType", IntegerType, nullable = false))) } } class FakeSourceThree extends RelationProvider with DataSourceRegister { def shortName(): String = "gathering quorum" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceFour extends RelationProvider with DataSourceRegister { def shortName(): String = "datasource" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("longType", LongType, nullable = false))) } }
Example 44
Source File: AnnotationParser.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parser import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.AbstractSparkSQLParser import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{AnnotationReference, Expression, Literal} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String protected def toTableMetadata(metadata: Map[String, Expression]): Metadata = { val res = new MetadataBuilder() metadata.foreach { case (k, v:Literal) => v.dataType match { case StringType => if (k.equals("?")) { sys.error("column metadata key can not be ?") } if (k.equals("*")) { sys.error("column metadata key can not be *") } res.putString(k, v.value.asInstanceOf[UTF8String].toString) case LongType => res.putLong(k, v.value.asInstanceOf[Long]) case DoubleType => res.putDouble(k, v.value.asInstanceOf[Double]) case NullType => res.putString(k, null) case a:ArrayType => res.putString(k, v.value.toString) } case (k, v:AnnotationReference) => sys.error("column metadata can not have a reference to another column metadata") } res.build() } }
Example 45
Source File: ResolveDropCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.CaseSensitivityUtils._ import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.sources.commands.UnresolvedDropCommand import org.apache.spark.sql.sources.{DropRelation, RelationKind, Table} import scala.util.Try case class ResolveDropCommand(analyzer: Analyzer, catalog: Catalog) extends Rule[LogicalPlan] with TableDependencyCalculator { private def failAnalysis(reason: String) = throw new AnalysisException(reason) override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown { case UnresolvedDropCommand(kind, allowNotExisting, tableIdent, cascade) => val plan = resolvePlan(kind, tableIdent, allowNotExisting) val affected = plan.map { lp => val targetKind = RelationKind.kindOf(lp).getOrElse(Table) checkValidKind(kind, tableIdent, targetKind) buildDependentsMap(catalog, tableIdent) } affected.foreach(checkAllowedToDrop(cascade)) DropRunnableCommand(affected.getOrElse(Map.empty)) } private def getDropRelation(plan: LogicalPlan): Option[DropRelation] = plan.collectFirst { case d: LogicalPlan with DropRelation => d case LogicalRelation(d: DropRelation, _) => d } private def resolvePlan(kind: DropTarget, tableIdent: TableIdentifier, allowNotExisting: Boolean): Option[LogicalPlan] = { Try(catalog.lookupRelation(tableIdent)).toOption match { case Some(plan) => Some(plan) case None if allowNotExisting => None case None => failAnalysis( s"""${kind.targetName.toLowerCase} ${tableIdent.unquotedString} does not exist. To " |DROP a ${kind.targetName.toLowerCase} regardless if it exists of not, use |DROP ${kind.targetName.toUpperCase} IF EXISTS.""".stripMargin) } } private def checkAllowedToDrop(cascade: Boolean) (dependents: Map[String, Option[DropRelation]]) = { if (dependents.size > 1 && !cascade) { failAnalysis("Can not drop because more than one relation has " + s"references to the target relation: ${dependents.keys.mkString(",")}. " + s"to force drop use 'CASCADE'.") } } private def checkValidKind(kind: DropTarget, tableIdent: TableIdentifier, targetKind: RelationKind): Unit = { if (!kind.accepts(targetKind)) { failAnalysis( s"Relation '${tableIdent.unquotedString} of kind" + s"$targetKind is not a ${kind.targetName}. " + s"Please use DROP ${targetKind.name.toUpperCase()} to drop it.") } } private def buildDependentsMap(catalog: Catalog, identifier: TableIdentifier): Map[String, Option[DropRelation]] = { val tables = getTables(catalog, identifier.database) val tablesAndDependents = buildDependentsMap(tables) def aggregate(acc: Set[TableIdentifier], next: List[TableIdentifier]): Set[TableIdentifier] = next match { case Nil => acc case ident :: rest => val dependents = tablesAndDependents(ident) aggregate(acc ++ dependents, rest ++ dependents.diff(acc)) } val dependentsSet = aggregate(Set(identifier), identifier :: Nil) dependentsSet.flatMap { dependent => tables.get(dependent).map(dependent.table -> getDropRelation(_)) }.toMap } }
Example 46
Source File: ResolveAppendCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.{AppendRunnableCommand, LogicalRelation}
import org.apache.spark.sql.sources.AppendRelation
import org.apache.spark.sql.sources.commands.UnresolvedAppendCommand

case class ResolveAppendCommand(analyzer: Analyzer) extends Rule[LogicalPlan] {

  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case UnresolvedAppendCommand(table, options) =>
      val resolvedTable = analyzer.execute(table)
      resolvedTable.collectFirst {
        case LogicalRelation(appendRelation: AppendRelation, _) =>
          AppendRunnableCommand(appendRelation, options)
      }.getOrElse {
        throw new AnalysisException(s"Cannot append ${resolvedTable.treeString}")
      }
  }
}
Example 47
Source File: ResolveInferSchemaCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.sources.commands.{InferSchemaCommand, Orc, Parquet, UnresolvedInferSchemaCommand}

import scala.util.Try

case class ResolveInferSchemaCommand(sqlContext: SQLContext) extends Rule[LogicalPlan] {

  override def apply(plan: LogicalPlan): LogicalPlan = plan.transform {
    case UnresolvedInferSchemaCommand(path, explicitFileType) =>
      val fileType = explicitFileType.getOrElse(path.toLowerCase match {
        case p if p.endsWith(".orc") => Orc
        case p if p.endsWith(".parquet") => Parquet
        case invalid =>
          throw new AnalysisException(s"Could not determine file format of '$path'")
      })
      InferSchemaCommand(path, fileType)
  }
}
Example 48
Source File: ResolveTableFunctions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.tablefunctions.UnresolvedTableFunction
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

case class ResolveTableFunctions(
    analyzer: Analyzer,
    registry: TableFunctionRegistry = TableFunctionRegistry) extends Rule[LogicalPlan] {

  def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case UnresolvedTableFunction(name, arguments) =>
      val lookup = registry.lookupFunction(name)
      lookup match {
        case Some(f) =>
          val analyzed = f.analyze(analyzer, arguments)
          ResolvedTableFunction(f, analyzed)
        case None =>
          throw new AnalysisException(s"Undefined function $name")
      }
  }
}
Example 49
Source File: HiveEmulationSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.catalyst.analysis.NoSuchTableException import org.apache.spark.sql.hive.SapHiveContext import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{AnalysisException, GlobalSapSQLContext, Row, SapSQLConf} import org.apache.spark.util.DummyRelationUtils._ import org.apache.spark.util.SqlContextConfigurationUtils import org.scalatest.FunSuite class HiveEmulationSuite extends FunSuite with GlobalSapSQLContext with SqlContextConfigurationUtils { private def createTable(name: String, schema: StructType): Unit = sqlc.createDataFrame(sc.parallelize(Seq.empty[Row]), schema).registerTempTable(name) private def withHiveEmulation[A](op: => A): A = withConf(SapSQLConf.HIVE_EMULATION.key, "true")(op) test("Show schemas shows a default schema when hive emulation is on") { withHiveEmulation { val values = sqlc.sql("SHOW SCHEMAS").collect().toSet assertResult(Set(Row("default")))(values) } } test("Show schemas throws if hive emulation is off") { intercept[RuntimeException](sqlc.sql("SHOW SCHEMAS")) } test("Desc an existing table") { withHiveEmulation { createTable("foo", StructType('a.int :: 'b.int :: Nil)) val values = sqlc.sql("DESC foo").collect().toSet assertResult( Set( Row("a", "int", null), Row("b", "int", null)))(values) } } test("Desc a non-existent table throws") { withHiveEmulation { intercept[NoSuchTableException] { sqlc.sql("DESC bar").collect() } } } test("Describe an existing table") { withHiveEmulation { createTable("foo", StructType('a.int :: 'b.int :: Nil)) val values = sqlc.sql("DESCRIBE FORMATTED foo").collect().toList assertResult( List( Row(s"# col_name${" " * 12}\tdata_type${" " * 11}\tcomment${" " * 13}\t"), Row(""), Row(s"a${" " * 19}\tint${" " * 17}\tnull${" " * 16}\t"), Row(s"b${" " * 19}\tint${" " * 17}\tnull${" " * 16}\t")))(values) } } test("Retrieval of a database prefixed table") { val hc = new SapHiveContext(sc) hc.setConf(SapSQLConf.HIVE_EMULATION, true) val expected = Set(Row(0, 0), Row(0, 1), Row(1, 0), Row(1, 1)) val rdd = hc.sparkContext.parallelize(expected.toSeq) hc.createDataFrame(rdd, StructType('a.int :: 'b.int :: Nil)).registerTempTable("foo") val results = hc.sql("SELECT * FROM default.foo").collect().toSet assertResult(expected)(results) hc.setConf(SapSQLConf.HIVE_EMULATION, false) intercept[AnalysisException] { hc.sql("SELECT * FROM default.foo") } } test("USE statements should not do anything when in hive emulation mode") { withConf(SapSQLConf.HIVE_EMULATION.key, "true") { sqlc.sql("USE foo bar") } } test("Any other use command should throw an exception") { intercept[RuntimeException] { sqlc.sql("USE foo bar") } } }
Example 50
Source File: InferSchemaCommandSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import java.io.FileNotFoundException import org.apache.spark.sql.{AnalysisException, GlobalSapSQLContext, Row} import org.scalatest.FunSuite import com.sap.spark.util.TestUtils.{getFileFromClassPath, withTempDirectory} class InferSchemaCommandSuite extends FunSuite with GlobalSapSQLContext { test("Inferring of schema fails on non-existent file") { withTempDirectory { dir => val nonExistentPath = dir.path + "/non-existent" intercept[FileNotFoundException] { sqlc.sql(s"""INFER SCHEMA OF "$nonExistentPath" AS ORC""").collect() } } } // scalastyle:off magic.number test("Inferring of schema works on parquet file") { val personFile = getFileFromClassPath("/pers.parquet") val result = sqlc.sql(s"""INFER SCHEMA OF "$personFile"""").collect().toSet assertResult( Set( Row("name", 1, true, "VARCHAR(*)", null, null, null), Row("age", 2, true, "INTEGER", 32, 2, 0)))(result) } test("Inferring of schema works on orc file") { val personFile = getFileFromClassPath("/pers.orc") val result = sqlc.sql(s"""INFER SCHEMA OF "$personFile"""").collect().toSet assertResult( Set( Row("name", 1, true, "VARCHAR(*)", null, null, null), Row("age", 2, true, "INTEGER", 32, 2, 0)))(result) } // scalastyle:on magic.number test("Inferring of schema fails on invalid file") { val invalidFile = getFileFromClassPath("/simple.csv") intercept[AnalysisException] { sqlc.sql(s"""INFER SCHEMA OF "$invalidFile"""") } } }
Example 51
Source File: ResolveTableFunctionsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.tablefunctions.UnresolvedTableFunction
import org.mockito.Mockito._
import org.scalatest.FunSuite

class ResolveTableFunctionsSuite extends FunSuite {

  test("resolution of previously registered function") {
    val tf = mock(classOf[TableFunction])
    val analyzer = mock(classOf[Analyzer])
    val registry = new SimpleTableFunctionRegistry()
    val strategy = ResolveTableFunctions(analyzer, registry)
    registry.registerFunction("foo", tf)
    val unresolved = UnresolvedTableFunction("foo", Seq.empty)
    when(tf.analyze(analyzer, Seq.empty)) thenReturn Seq.empty

    val resolved = strategy.apply(unresolved)

    assert(resolved == ResolvedTableFunction(tf, Seq.empty))
    verify(tf).analyze(analyzer, Seq.empty)
  }

  test("fail on unregistered functions") {
    val analyzer = mock(classOf[Analyzer])
    val strategy = ResolveTableFunctions(analyzer)
    val unresolved = UnresolvedTableFunction("foo", Seq.empty)

    intercept[AnalysisException] {
      strategy(unresolved)
    }
  }
}
Example 52
Source File: EventHubsWriter.scala From azure-event-hubs-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.eventhubs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{ AnalysisException, SparkSession }
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types.{ BinaryType, StringType }
import org.apache.spark.util.Utils

private[eventhubs] object EventHubsWriter extends Logging {

  val BodyAttributeName = "body"
  val PartitionKeyAttributeName = "partitionKey"
  val PartitionIdAttributeName = "partition"
  val PropertiesAttributeName = "properties"

  override def toString: String = "EventHubsWriter"

  private def validateQuery(schema: Seq[Attribute], parameters: Map[String, String]): Unit = {
    schema
      .find(_.name == BodyAttributeName)
      .getOrElse(
        throw new AnalysisException(s"Required attribute '$BodyAttributeName' not found.")
      )
      .dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(
          s"$BodyAttributeName attribute type " +
            s"must be a String or BinaryType.")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      parameters: Map[String, String]
  ): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, parameters)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new EventHubsWriteTask(parameters, schema)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close()
      )
    }
  }
}
Example 53
Source File: OapQuerySuite.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import java.util.{Locale, TimeZone} import org.scalatest.{BeforeAndAfter, Ignore} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.internal.SQLConf // Ignore because in separate package will encounter problem with shaded spark source. @Ignore class OapQuerySuite extends HiveComparisonTest with BeforeAndAfter { private lazy val originalTimeZone = TimeZone.getDefault private lazy val originalLocale = Locale.getDefault import org.apache.spark.sql.hive.test.TestHive._ // Note: invoke TestHive will create a SparkContext which can't be configured by us. // So be careful this may affect current using SparkContext and cause strange problem. private lazy val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled override def beforeAll() { super.beforeAll() TestHive.setCacheTables(true) // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*) TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles")) // Add Locale setting Locale.setDefault(Locale.US) // Ensures that cross joins are enabled so that we can test them TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true) TestHive.setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true) } override def afterAll() { try { TestHive.setCacheTables(false) TimeZone.setDefault(originalTimeZone) Locale.setDefault(originalLocale) sql("DROP TEMPORARY FUNCTION IF EXISTS udtf_count2") TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled) } finally { super.afterAll() } } private def assertDupIndex(body: => Unit): Unit = { val e = intercept[AnalysisException] { body } assert(e.getMessage.toLowerCase.contains("exists")) } test("create hive table in parquet format") { try { sql("create table p_table (key int, val string) stored as parquet") sql("insert overwrite table p_table select * from src") sql("create oindex if not exists p_index on p_table(key)") assert(sql("select val from p_table where key = 238") .collect().head.getString(0) == "val_238") } finally { sql("drop oindex p_index on p_table") sql("drop table p_table") } } test("create duplicate hive table in parquet format") { try { sql("create table p_table1 (key int, val string) stored as parquet") sql("insert overwrite table p_table1 select * from src") sql("create oindex p_index on p_table1(key)") assertDupIndex { sql("create oindex p_index on p_table1(key)") } } finally { sql("drop oindex p_index on p_table1") } } }
Example 54
Source File: CreateHiveTableAsSelectCommand.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions} import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.hive.MetastoreRelation case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, ignoreIfExists: Boolean) extends RunnableCommand { private val tableIdentifier = tableDesc.identifier override def innerChildren: Seq[LogicalPlan] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withFormat = tableDesc.withNewStorage( inputFormat = tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.storage.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)), compressed = tableDesc.storage.compressed) val withSchema = if (withFormat.schema.isEmpty) { // Hive doesn't support specifying the column list for target table in CTAS // However we don't think SparkSQL should follow that. tableDesc.copy(schema = query.output.toStructType) } else { withFormat } sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false) // Get the Metastore Relation sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) { if (ignoreIfExists) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$tableIdentifier already exists.") } } else { try { sparkSession.sessionState.executePlan(InsertIntoTable( metastoreRelation, Map(), query, overwrite = OverwriteOptions(true), ifNotExists = false)).toRdd } catch { case NonFatal(e) => // drop the created table. sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
Example 55
Source File: SparkSQLDriver.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.QueryExecution private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.hiveResultString() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 56
Source File: AnalysisTest.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SimpleCatalystConf import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ trait AnalysisTest extends PlanTest { protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true) protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false) private def makeAnalyzer(caseSensitive: Boolean): Analyzer = { val conf = new SimpleCatalystConf(caseSensitive) val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf) catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true) new Analyzer(catalog, conf) { override val extendedResolutionRules = EliminateSubqueryAliases :: Nil } } protected def getAnalyzer(caseSensitive: Boolean) = { if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer } protected def checkAnalysis( inputPlan: LogicalPlan, expectedPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val actualPlan = analyzer.execute(inputPlan) analyzer.checkAnalysis(actualPlan) comparePlans(actualPlan, expectedPlan) } protected def assertAnalysisSuccess( inputPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val analysisAttempt = analyzer.execute(inputPlan) try analyzer.checkAnalysis(analysisAttempt) catch { case a: AnalysisException => fail( s""" |Failed to Analyze Plan |$inputPlan | |Partial Analysis |$analysisAttempt """.stripMargin, a) } } protected def assertAnalysisError( inputPlan: LogicalPlan, expectedErrors: Seq[String], caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val e = intercept[AnalysisException] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) { fail( s"""Exception message should contain the following substrings: | | ${expectedErrors.mkString("\n ")} | |Actual exception message: | | ${e.getMessage} """.stripMargin) } } }
Example 57
Source File: ResolveInlineTablesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.types.{LongType, NullType} class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables.convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables.convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables.convert(table2) assert(converted2.schema.fields(0).nullable) } }
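The same validations surface through SQL, where inline tables are written with a VALUES clause. A hedged sketch; the session setup is boilerplate and the query is illustrative only.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object InlineTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("inline-tables").getOrCreate()
    try {
      // two column aliases but only one value per row: rejected during analysis
      spark.sql("SELECT * FROM VALUES (1), (2) AS t(c1, c2)").collect()
    } catch {
      case ae: AnalysisException => println(s"rejected as expected: ${ae.getMessage}")
    }
    spark.stop()
  }
}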
Example 58
Source File: ResolveSubquerySuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference} import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project} class ResolveSubquerySuite extends AnalysisTest { val a = 'a.int val b = 'b.int val t1 = LocalRelation(a) val t2 = LocalRelation(b) test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") { val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1) val m = intercept[AnalysisException] { SimpleAnalyzer.ResolveSubquery(expr) }.getMessage assert(m.contains( "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses")) } }
Example 59
Source File: CartesianProductExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark._ import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int) extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) { override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = { // We will not sort the rows, so prefixComparator and recordComparator are null. val sorter = UnsafeExternalSorter.create( context.taskMemoryManager(), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, context, null, null, 1024, SparkEnv.get.memoryManager.pageSizeBytes, SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold", UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD), false) val partition = split.asInstanceOf[CartesianPartition] for (y <- rdd2.iterator(partition.s2, context)) { sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false) } // Create an iterator from sorter and wrapper it as Iterator[UnsafeRow] def createIter(): Iterator[UnsafeRow] = { val iter = sorter.getIterator val unsafeRow = new UnsafeRow(numFieldsOfRight) new Iterator[UnsafeRow] { override def hasNext: Boolean = { iter.hasNext } override def next(): UnsafeRow = { iter.loadNext() unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength) unsafeRow } } } val resultIter = for (x <- rdd1.iterator(partition.s1, context); y <- createIter()) yield (x, y) CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]]( resultIter, sorter.cleanupResources()) } } case class CartesianProductExec( left: SparkPlan, right: SparkPlan, condition: Option[Expression]) extends BinaryExecNode { override def output: Seq[Attribute] = left.output ++ right.output override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]] val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]] val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size) pair.mapPartitionsWithIndexInternal { (index, iter) => val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema) val filtered = if (condition.isDefined) { val boundCondition = newPredicate(condition.get, left.output ++ right.output) boundCondition.initialize(index) val joined = new JoinedRow iter.filter { r => boundCondition.eval(joined(r._1, r._2)) } } else { iter } filtered.map { r => numOutputRows += 1 joiner.join(r._1, r._2) } } } }
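CartesianProductExec only comes into play for joins without keys. In Spark 2.x an implicit cartesian product is normally rejected with an AnalysisException unless the cross join is requested explicitly (or spark.sql.crossJoin.enabled is set). A small sketch with placeholder data:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object CrossJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("cross-join").getOrCreate()
    import spark.implicits._
    val left = Seq(1, 2).toDF("a")
    val right = Seq("x", "y").toDF("b")
    try {
      left.join(right).collect() // no join condition: cartesian product detected
    } catch {
      case ae: AnalysisException => println(s"implicit cartesian product rejected: ${ae.getMessage}")
    }
    // the explicit form is accepted; for large, non-broadcastable inputs it is
    // planned as CartesianProductExec
    left.crossJoin(right).show()
    spark.stop()
  }
}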
Example 60
Source File: JdbcRelationProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
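The SaveMode branches above can be exercised through the DataFrame writer API. A hedged sketch in which the JDBC URL is a placeholder and a matching driver (here H2) is assumed to be on the classpath:

import java.util.Properties

import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}

object JdbcSaveModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("jdbc-save").getOrCreate()
    import spark.implicits._
    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")
    // placeholder URL; DB_CLOSE_DELAY keeps the in-memory DB alive between connections
    val url = "jdbc:h2:mem:demo;DB_CLOSE_DELAY=-1"
    val props = new Properties()

    df.write.mode(SaveMode.Overwrite).jdbc(url, "people", props) // creates, or drops/truncates and reloads, the table
    df.write.mode(SaveMode.Append).jdbc(url, "people", props)    // appends rows to the existing table
    try {
      df.write.mode(SaveMode.ErrorIfExists).jdbc(url, "people", props)
    } catch {
      case ae: AnalysisException => println(s"as thrown by createRelation above: ${ae.getMessage}")
    }
    spark.stop()
  }
}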
Example 61
Source File: QueryExecutionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} import org.apache.spark.sql.test.SharedSQLContext class QueryExecutionSuite extends SharedSQLContext { test("toString() exception/error handling") { val badRule = new SparkStrategy { var mode: String = "" override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match { case "exception" => throw new AnalysisException(mode) case "error" => throw new Error(mode) case _ => Nil } } spark.experimental.extraStrategies = badRule :: Nil def qe: QueryExecution = new QueryExecution(spark, OneRowRelation) // Nothing! badRule.mode = "" assert(qe.toString.contains("OneRowRelation")) // Throw an AnalysisException - this should be captured. badRule.mode = "exception" assert(qe.toString.contains("org.apache.spark.sql.AnalysisException")) // Throw an Error - this should not be captured. badRule.mode = "error" val error = intercept[Error](qe.toString) assert(error.getMessage.contains("error")) } }
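For reference, QueryExecution.toString on a healthy query simply renders the plans. A minimal sketch:

import org.apache.spark.sql.SparkSession

object QueryExecutionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("query-execution").getOrCreate()
    val df = spark.range(5).selectExpr("id * 2 AS doubled")
    // prints the parsed, analyzed, optimized and physical plans
    println(df.queryExecution.toString)
    spark.stop()
  }
}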
Example 62
Source File: VariableSubstitutionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException class VariableSubstitutionSuite extends SparkFunSuite { private lazy val conf = new SQLConf private lazy val sub = new VariableSubstitution(conf) test("system property") { System.setProperty("varSubSuite.var", "abcd") assert(sub.substitute("${system:varSubSuite.var}") == "abcd") } test("environmental variables") { assert(sub.substitute("${env:SPARK_TESTING}") == "1") } test("Spark configuration variable") { conf.setConfString("some-random-string-abcd", "1234abcd") assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") } test("multiple substitutes") { val q = "select ${bar} ${foo} ${doo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "2") conf.setConfString("doo", "3") assert(sub.substitute(q) == "select 1 2 3 this is great") } test("test nested substitutes") { val q = "select ${bar} ${foo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "${bar}") assert(sub.substitute(q) == "select 1 1 this is great") } }
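The same substitution is reachable from plain SQL as long as spark.sql.variable.substitute is enabled (it is by default). A hedged sketch with a made-up variable name:

import org.apache.spark.sql.SparkSession

object VariableSubstitutionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("var-sub").getOrCreate()
    spark.conf.set("myapp.limit", "3")
    // ${myapp.limit} is replaced with 3 before the statement is parsed
    spark.sql("SELECT * FROM range(10) LIMIT ${myapp.limit}").show()
    spark.stop()
  }
}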
Example 63
Source File: ResolvedDataSourceSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.DataSource class ResolvedDataSourceSuite extends SparkFunSuite { private def getProvidingClass(name: String): Class[_] = DataSource(sparkSession = null, className = name).providingClass test("jdbc") { assert( getProvidingClass("jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) } test("json") { assert( getProvidingClass("json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) } test("parquet") { assert( getProvidingClass("parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) } test("csv") { assert( getProvidingClass("csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) assert( getProvidingClass("com.databricks.spark.csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) } test("error message for unknown data sources") { val error1 = intercept[AnalysisException] { getProvidingClass("avro") } assert(error1.getMessage.contains("Failed to find data source: avro.")) val error2 = intercept[AnalysisException] { getProvidingClass("com.databricks.spark.avro") } assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro.")) val error3 = intercept[ClassNotFoundException] { getProvidingClass("asfdwefasdfasdf") } assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf.")) } }
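The lookup failures checked above surface directly from the reader API. A short sketch; the path is a placeholder and is never read, because the failure happens while resolving the data source:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object UnknownFormatSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("unknown-format").getOrCreate()
    try {
      spark.read.format("avro").load("/tmp/ignored")
    } catch {
      case ae: AnalysisException => println(ae.getMessage) // Failed to find data source: avro.
    }
    spark.stop()
  }
}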
Example 64
Source File: DDLSourceLoadSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{StringType, StructField, StructType} // please note that the META-INF/services had to be modified for the test directory for this to work class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { test("data sources with the same name") { intercept[RuntimeException] { spark.read.format("Fluet da Bomb").load() } } test("load data source from format alias") { spark.read.format("gathering quorum").load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("specify full classname with duplicate formats") { spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("should fail to load ORC without Hive Support") { val e = intercept[AnalysisException] { spark.read.format("orc").load() } assert(e.message.contains("The ORC data source must be used with Hive support enabled")) } } class FakeSourceOne extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceTwo extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceThree extends RelationProvider with DataSourceRegister { def shortName(): String = "gathering quorum" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } }
Example 65
Source File: CarbonMetastoreTypes.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util import scala.util.parsing.combinator.RegexParsers import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.types._ object CarbonMetastoreTypes extends RegexParsers { protected lazy val primitiveType: Parser[DataType] = "string" ^^^ StringType | "varchar" ^^^ StringType | "float" ^^^ FloatType | "int" ^^^ IntegerType | "tinyint" ^^^ ShortType | "short" ^^^ ShortType | "double" ^^^ DoubleType | "long" ^^^ LongType | "binary" ^^^ BinaryType | "boolean" ^^^ BooleanType | fixedDecimalType | "decimal" ^^^ "decimal" ^^^ DecimalType(10, 0) | "varchar\\((\\d+)\\)".r ^^^ StringType | "date" ^^^ DateType | "timestamp" ^^^ TimestampType protected lazy val fixedDecimalType: Parser[DataType] = "decimal" ~> "(" ~> "^[1-9]\\d*".r ~ ("," ~> "^[0-9]\\d*".r <~ ")") ^^ { case precision ~ scale => DecimalType(precision.toInt, scale.toInt) } protected lazy val arrayType: Parser[DataType] = "array" ~> "<" ~> dataType <~ ">" ^^ { case tpe => ArrayType(tpe) } protected lazy val mapType: Parser[DataType] = "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ { case t1 ~ _ ~ t2 => MapType(t1, t2) } protected lazy val structField: Parser[StructField] = "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ { case name ~ _ ~ tpe => StructField(name, tpe, nullable = true) } protected lazy val structType: Parser[DataType] = "struct" ~> "<" ~> repsep(structField, ",") <~ ">" ^^ { case fields => StructType(fields) } protected lazy val dataType: Parser[DataType] = arrayType | mapType | structType | primitiveType def toDataType(metastoreType: String): DataType = { parseAll(dataType, metastoreType) match { case Success(result, _) => result case _: NoSuccess => throw new AnalysisException(s"Unsupported dataType: $metastoreType") } } def toMetastoreType(dt: DataType): String = { dt match { case ArrayType(elementType, _) => s"array<${ toMetastoreType(elementType) }>" case StructType(fields) => s"struct<${ fields.map(f => s"${ f.name }:${ toMetastoreType(f.dataType) }") .mkString(",") }>" case MapType(keyType, valueType, _) => s"map<${ toMetastoreType(keyType) }, ${ toMetastoreType(valueType) }>" case StringType => "string" case FloatType => "float" case IntegerType => "int" case ShortType => "tinyint" case DoubleType => "double" case LongType => "bigint" case BinaryType => "binary" case BooleanType => "boolean" case DecimalType() => "decimal" case TimestampType => "timestamp" case DateType => "date" } } }
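A small usage sketch for the parser above, assuming the Carbon integration jar is on the classpath; the type string is illustrative:

import org.apache.spark.sql.util.CarbonMetastoreTypes

object MetastoreTypesSketch {
  def main(args: Array[String]): Unit = {
    val dt = CarbonMetastoreTypes.toDataType("struct<name:string,scores:array<int>>")
    println(dt) // StructType with a string field and an array<int> field
    println(CarbonMetastoreTypes.toMetastoreType(dt)) // struct<name:string,scores:array<int>>
    // an unrecognised string is reported as an AnalysisException:
    // CarbonMetastoreTypes.toDataType("not_a_type")
  }
}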
Example 66
Source File: AlterTableRevertTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.carbondata.restructure import java.io.File import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.test.TestQueryExecutor import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.metadata.CarbonMetadata import org.apache.carbondata.spark.exception.ProcessMetaDataException class AlterTableRevertTestCase extends QueryTest with BeforeAndAfterAll { override def beforeAll() { sql("drop table if exists reverttest") sql( "CREATE TABLE reverttest(intField int,stringField string,timestampField timestamp," + "decimalField decimal(6,2)) STORED AS carbondata") sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/restructure/data4.csv' INTO TABLE reverttest " + s"options('FILEHEADER'='intField,stringField,timestampField,decimalField')") } test("test to revert new added columns on failure") { intercept[ProcessMetaDataException] { hiveClient.runSqlHive("set hive.security.authorization.enabled=true") sql( "Alter table reverttest add columns(newField string) TBLPROPERTIES" + "('DEFAULT.VALUE.newField'='def')") hiveClient.runSqlHive("set hive.security.authorization.enabled=false") intercept[AnalysisException] { sql("select newField from reverttest") } } } test("test to revert table name on failure") { val exception = intercept[ProcessMetaDataException] { new File(TestQueryExecutor.warehouse + "/reverttest_fail").mkdir() sql("alter table reverttest rename to reverttest_fail") new File(TestQueryExecutor.warehouse + "/reverttest_fail").delete() } val result = sql("select * from reverttest").count() assert(result.equals(1L)) sql("drop table if exists reverttest_fail") } test("test to revert drop columns on failure") { intercept[ProcessMetaDataException] { hiveClient.runSqlHive("set hive.security.authorization.enabled=true") sql("Alter table reverttest drop columns(decimalField)") hiveClient.runSqlHive("set hive.security.authorization.enabled=false") } assert(sql("select decimalField from reverttest").count().equals(1L)) } test("test to revert changed datatype on failure") { intercept[ProcessMetaDataException] { hiveClient.runSqlHive("set hive.security.authorization.enabled=true") sql("Alter table reverttest change intField intfield bigint") hiveClient.runSqlHive("set hive.security.authorization.enabled=false") } assert( sql("select intfield from reverttest").schema.fields.apply(0).dataType.simpleString == "int") } test("test to check if dictionary files are deleted for new column if query fails") { intercept[ProcessMetaDataException] { hiveClient.runSqlHive("set hive.security.authorization.enabled=true") sql( "Alter table reverttest add columns(newField string) TBLPROPERTIES" + "('DEFAULT.VALUE.newField'='def')") hiveClient.runSqlHive("set hive.security.authorization.enabled=false") intercept[AnalysisException] { sql("select newField from reverttest") } val carbonTable = CarbonMetadata.getInstance.getCarbonTable("default", "reverttest") assert(new File(carbonTable.getMetadataPath).listFiles().length < 6) } } override def afterAll() { hiveClient.runSqlHive("set hive.security.authorization.enabled=false") sql("drop table if exists reverttest") sql("drop table if exists reverttest_fail") } }
Example 67
Source File: CarbonTableSchemaCommonSuite.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.test.util.QueryTest import org.junit.Assert import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.spark.exception.ProcessMetaDataException class CarbonTableSchemaCommonSuite extends QueryTest with BeforeAndAfterAll { test("Creating table: Duplicate dimensions found with name, it should throw AnalysisException") { sql("DROP TABLE IF EXISTS carbon_table") try { sql( s""" | CREATE TABLE carbon_table( | BB INT, bb char(10) | ) | STORED AS carbondata """.stripMargin) Assert.assertTrue(false) } catch { case _: AnalysisException => Assert.assertTrue(true) case _: Exception => Assert.assertTrue(false) } finally { sql("DROP TABLE IF EXISTS carbon_table") } } test("Altering table: Duplicate column found with name, it should throw RuntimeException") { sql("DROP TABLE IF EXISTS carbon_table") sql( s""" | CREATE TABLE if not exists carbon_table( | BB INT, cc char(10) | ) | STORED AS carbondata """.stripMargin) val ex = intercept[ProcessMetaDataException] { sql( s""" | alter TABLE carbon_table add columns( | bb char(10) ) """.stripMargin) } sql("DROP TABLE IF EXISTS carbon_table") } }
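The duplicate-column check is not Carbon-specific; vanilla Spark rejects duplicate column names during analysis as well. A hedged sketch against a plain data source table:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object DuplicateColumnSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("dup-columns").getOrCreate()
    try {
      spark.sql("CREATE TABLE dup_cols(bb INT, bb STRING) USING parquet")
    } catch {
      case ae: AnalysisException => println(s"rejected: ${ae.getMessage}") // duplicate column(s)
    }
    spark.stop()
  }
}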
Example 68
Source File: MVExceptionTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.view.rewrite import org.apache.carbondata.common.exceptions.sql.{MalformedCarbonCommandException, MalformedMVCommandException} import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll class MVExceptionTestCase extends QueryTest with BeforeAndAfterAll { override def beforeAll: Unit = { drop() sql("create table main_table (name string,age int,height int) STORED AS carbondata") } test("test mv no base table") { val ex = intercept[AnalysisException] { sql("create materialized view main_table_mv as select sum(age),name from main_table_error group by name") } assert(ex.getMessage().contains("Table or view not found: main_table_error")) } test("test mv reduplicate mv table") { val ex = intercept[MalformedMVCommandException] { sql("create materialized view main_table_mv1 as select sum(age),name from main_table group by name") sql("create materialized view main_table_mv1 as select sum(age),name from main_table group by name") } assertResult("Materialized view with name default.main_table_mv1 already exists")(ex.getMessage) } test("test mv creation with limit in query") { val ex = intercept[MalformedCarbonCommandException] { sql("create materialized view maintable_mv2 as select sum(age),name from main_table group by name limit 10") } assertResult("Materialized view does not support the query with limit")(ex.getMessage) } def drop(): Unit = { sql("drop table IF EXISTS main_table") sql("drop table if exists main_table_error") sql("drop materialized view if exists main_table_mv") sql("drop materialized view if exists main_table_mv1") } override def afterAll(): Unit = { drop() } }
Example 69
Source File: TestDataWithDicExcludeAndInclude.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.dataload import org.apache.spark.sql.AnalysisException import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class TestLoadDataWithDictionaryExcludeAndInclude extends QueryTest with BeforeAndAfterAll { var filePath: String = _ var pwd: String = _ def buildTestData() = { filePath = s"$resourcesPath/emptyDimensionData.csv" } def dropTable() = { sql("DROP TABLE IF EXISTS exclude_include_t3") sql("DROP TABLE IF EXISTS exclude_include_hive_t3") } def buildTable() = { try { sql( """ CREATE TABLE exclude_include_hive_t3 (ID Int, date Timestamp, country String, name String, phonetype String, serialname String, salary Int) row format delimited fields terminated by ',' """) sql( """ CREATE TABLE exclude_include_t3 (ID Int, date Timestamp, country String, name String, phonetype String, serialname String, salary Int) STORED AS carbondata """) } catch { case ex: Throwable => LOGGER.error(ex.getMessage + "\r\n" + ex.getStackTraceString) } } def loadTable() = { try { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd") sql( s""" LOAD DATA LOCAL INPATH '$filePath' into table exclude_include_t3 """) sql( s""" LOAD DATA LOCAL INPATH '$resourcesPath/emptyDimensionDataHive.csv' into table exclude_include_hive_t3 """) } catch { case ex: Throwable => LOGGER.error(ex.getMessage + "\r\n" + ex.getStackTraceString) } } override def beforeAll { dropTable buildTestData buildTable loadTable } test("test load data with dictionary exclude & include and with empty dimension") { checkAnswer( sql("select ID from exclude_include_t3"), sql("select ID from exclude_include_hive_t3") ) } override def afterAll { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) dropTable } }
Example 70
Source File: TestCreateTableIfNotExists.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.createTable import java.util.concurrent.{Callable, Executors, ExecutorService, Future, TimeUnit} import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.AnalysisException import org.scalatest.BeforeAndAfterAll class TestCreateTableIfNotExists extends QueryTest with BeforeAndAfterAll { override def beforeAll { sql("use default") sql("drop table if exists test") sql("drop table if exists sourceTable") sql("drop table if exists targetTable") } test("test create table if not exists") { sql("create table test(a int, b string) STORED AS carbondata") try { // table creation should be successful sql("create table if not exists test(a int, b string) STORED AS carbondata") assert(true) } catch { case ex: Exception => assert(false) } } test("test create table if not exist concurrently") { val executorService: ExecutorService = Executors.newFixedThreadPool(10) var futures: List[Future[_]] = List() for (i <- 0 until (3)) { futures = futures :+ runAsync() } executorService.shutdown() executorService.awaitTermination(30L, TimeUnit.SECONDS) futures.foreach { future => assertResult("PASS")(future.get.toString) } def runAsync(): Future[String] = { executorService.submit(new Callable[String] { override def call() = { // Create table var result = "PASS" try { sql("create table IF NOT EXISTS TestIfExists(name string) STORED AS carbondata") } catch { case exception: Exception => result = exception.getMessage exception.printStackTrace() } result } }) } } test("test create table without column specified") { val exception = intercept[AnalysisException] { sql("create table TableWithoutColumn STORED AS carbondata tblproperties('sort_columns'='')") } assert(exception.getMessage.contains("Unable to infer the schema")) } override def afterAll { sql("use default") sql("drop table if exists test") sql("drop table if exists sourceTable") sql("drop table if exists targetTable") sql("drop table if exists TestIfExists") } }
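The IF NOT EXISTS behaviour itself is plain Spark SQL. A minimal sketch with a non-Carbon table; the table name is a placeholder:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object IfNotExistsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("if-not-exists").getOrCreate()
    spark.sql("CREATE TABLE demo_t(a INT, b STRING) USING parquet")
    // with IF NOT EXISTS the duplicate create is a no-op ...
    spark.sql("CREATE TABLE IF NOT EXISTS demo_t(a INT, b STRING) USING parquet")
    try {
      // ... without it, it is an analysis error
      spark.sql("CREATE TABLE demo_t(a INT, b STRING) USING parquet")
    } catch {
      case ae: AnalysisException => println(ae.getMessage) // Table ... already exists
    }
    spark.sql("DROP TABLE demo_t")
    spark.stop()
  }
}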
Example 71
Source File: TestCarbonCli.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll class TestCarbonCli extends QueryTest with BeforeAndAfterAll{ override protected def beforeAll(): Unit = { sql("drop table if exists OneRowTable") sql("create table OneRowTable(col1 string, col2 string, col3 int, col4 double) STORED AS carbondata") sql("insert into OneRowTable select '0.1', 'a.b', 1, 1.2") } test("CarbonCli table summary") { checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -a')"), true, "## Summary") checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -v')"), true, "## version Details") checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -s')"), true, "## Schema") checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -t')"), true, "## Table Properties") checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -m')"), true, "## Segment") } test("CarbonCli column details") { checkExistence( sql("carboncli for table OneRowTable options('-cmd summary -c col1')"), true, "## Column Statistics for 'col1'") } test("CarbonCli benchmark") { checkExistence( sql("carboncli for table OneRowTable options('-cmd benchmark -c col1')"), true, "## Benchmark") } test("CarbonCli invalid cmd"){ assert(intercept[AnalysisException] { sql("carboncli for table OneRowTable").show() }.getMessage().contains("mismatched input 'carboncli'")) assert(intercept[Exception] { sql("carboncli for table OneRowTable options('')") }.getMessage().contains("Missing required option: cmd")) checkExistence(sql("carboncli for table OneRowTable options('-cmd test')"), true, "command test is not supported") } override protected def afterAll(): Unit = { sql("drop table if exists OneRowTable") } }
Example 72
Source File: HiveDeltaDDLSuite.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.test.DeltaHiveTest

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.test.TestHiveSingleton

abstract class HiveDeltaDDLSuiteBase extends DeltaDDLTestBase {
  override protected def verifyDescribeTable(tblName: String): Unit = {
    val res = sql(s"DESCRIBE TABLE $tblName").collect()
    assert(res.takeRight(2).map(_.getString(1)) === Seq("name", "dept"))
  }

  override protected def verifyNullabilityFailure(exception: AnalysisException): Unit = {
    // wrap the check in assert, otherwise the verification silently passes
    assert(exception.getMessage.contains("not supported for changing column"))
  }
}

class HiveDeltaDDLSuite extends HiveDeltaDDLSuiteBase with DeltaHiveTest
Example 73
Source File: PlanUtil.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.hive.sparklinedata.SPLSessionState import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SQLContext} import org.json4s._ import org.json4s.jackson.JsonMethods._ import org.sparklinedata.druid.metadata.DruidRelationInfo import org.sparklinedata.druid.{DruidQuery, DruidRelation, QuerySpec, Utils} object PlanUtil { import Utils._ def druidRelationInfo(tableName: String)(implicit sqlContext: SQLContext): Option[DruidRelationInfo] = { sqlContext.table(tableName).logicalPlan.collectFirst { case LogicalRelation(DruidRelation(drInfo, _), _, _) => drInfo } } def dataFrame(drInfo: DruidRelationInfo, dq: DruidQuery)( implicit sqlContext: SQLContext): DataFrame = { val dR = DruidRelation(drInfo, Some(dq))(sqlContext) val lP = LogicalRelation(dR, None) Dataset.ofRows(sqlContext.sparkSession, lP) } @throws(classOf[AnalysisException]) def logicalPlan(dsName: String, dqStr: String, usingHist: Boolean)( implicit sqlContext: SQLContext): LogicalPlan = { val drInfo = druidRelationInfo(dsName) if (!drInfo.isDefined) { throw new AnalysisException(s"Cannot execute a DruidQuery on $dsName") } val dq = new DruidQuery(parse(dqStr).extract[QuerySpec], drInfo.get.options.useSmile(sqlContext), usingHist, drInfo.get.options.numSegmentsPerHistoricalQuery(sqlContext)) val dR = DruidRelation(drInfo.get, Some(dq))(sqlContext) LogicalRelation(dR, None) } def maxCardinalityIsOne(lp: LogicalPlan): Boolean = { var isone = false val aggs = lp.collect {case ag: Aggregate if ag.groupingExpressions.isEmpty => ag} if (aggs.nonEmpty) { isone = !isCardinalityAugmented(lp, aggs.asInstanceOf[Seq[LogicalPlan]]) } isone } }
Example 74
Source File: CreateHiveTableAsSelectCommand.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions} import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.hive.MetastoreRelation case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, ignoreIfExists: Boolean) extends RunnableCommand { private val tableIdentifier = tableDesc.identifier override def innerChildren: Seq[LogicalPlan] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withFormat = tableDesc.withNewStorage( inputFormat = tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.storage.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)), compressed = tableDesc.storage.compressed) val withSchema = if (withFormat.schema.isEmpty) { // Hive doesn't support specifying the column list for target table in CTAS // However we don't think SparkSQL should follow that. tableDesc.copy(schema = query.output.toStructType) } else { withFormat } sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false) // Get the Metastore Relation sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) { if (ignoreIfExists) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$tableIdentifier already exists.") } } else { try { sparkSession.sessionState.executePlan(InsertIntoTable( metastoreRelation, Map(), query, overwrite = OverwriteOptions(true), ifNotExists = false)).toRdd } catch { case NonFatal(e) => // drop the created table. sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
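The "$tableIdentifier already exists" branch above can be reached from SQL with a second CTAS on the same name. A hedged sketch that assumes Hive support is available on the classpath; the table name is a placeholder:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object CtasSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[1]").appName("ctas").enableHiveSupport().getOrCreate()
    spark.sql("CREATE TABLE ctas_t AS SELECT 1 AS id")
    try {
      spark.sql("CREATE TABLE ctas_t AS SELECT 2 AS id") // hits the tableExists check
    } catch {
      case ae: AnalysisException => println(ae.getMessage) // ... already exists
    }
    spark.sql("DROP TABLE ctas_t")
    spark.stop()
  }
}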
Example 75
Source File: SparkSQLDriver.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{Arrays, ArrayList => JArrayList, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext, SparkSession} import org.apache.spark.sql.execution.QueryExecution private[hive] class SparkSQLDriver(val sparkSession: SparkSession = SparkSQLEnv.sparkSession) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { sparkSession.sparkContext.setJobDescription(command) val execution = sparkSession.sessionState.executePlan(sparkSession.sql(command).logicalPlan) hiveResponse = execution.hiveResultString() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 76
Source File: AnalysisTest.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.SimpleCatalystConf import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ trait AnalysisTest extends PlanTest { protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true) protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false) private def makeAnalyzer(caseSensitive: Boolean): Analyzer = { val conf = new SimpleCatalystConf(caseSensitive) val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf) catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true) new Analyzer(catalog, conf) { override val extendedResolutionRules = EliminateSubqueryAliases :: Nil } } protected def getAnalyzer(caseSensitive: Boolean) = { if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer } protected def checkAnalysis( inputPlan: LogicalPlan, expectedPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val actualPlan = analyzer.execute(inputPlan) analyzer.checkAnalysis(actualPlan) comparePlans(actualPlan, expectedPlan) } protected def assertAnalysisSuccess( inputPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val analysisAttempt = analyzer.execute(inputPlan) try analyzer.checkAnalysis(analysisAttempt) catch { case a: AnalysisException => fail( s""" |Failed to Analyze Plan |$inputPlan | |Partial Analysis |$analysisAttempt """.stripMargin, a) } } protected def assertAnalysisError( inputPlan: LogicalPlan, expectedErrors: Seq[String], caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val e = intercept[AnalysisException] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) { fail( s"""Exception message should contain the following substrings: | | ${expectedErrors.mkString("\n ")} | |Actual exception message: | | ${e.getMessage} """.stripMargin) } } }
Example 77
Source File: ResolveInlineTablesSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.types.{LongType, NullType} class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables.convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables.convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables.convert(table2) assert(converted2.schema.fields(0).nullable) } }
Example 78
Source File: ResolveSubquerySuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference} import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project} class ResolveSubquerySuite extends AnalysisTest { val a = 'a.int val b = 'b.int val t1 = LocalRelation(a) val t2 = LocalRelation(b) test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") { val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1) val m = intercept[AnalysisException] { SimpleAnalyzer.ResolveSubquery(expr) }.getMessage assert(m.contains( "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses")) } }
Example 79
Source File: JdbcRelationProvider.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
Example 80
Source File: QueryExecutionSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} import org.apache.spark.sql.test.SharedSQLContext class QueryExecutionSuite extends SharedSQLContext { test("toString() exception/error handling") { val badRule = new SparkStrategy { var mode: String = "" override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match { case "exception" => throw new AnalysisException(mode) case "error" => throw new Error(mode) case _ => Nil } } spark.experimental.extraStrategies = badRule :: Nil def qe: QueryExecution = new QueryExecution(spark, OneRowRelation) // Nothing! badRule.mode = "" assert(qe.toString.contains("OneRowRelation")) // Throw an AnalysisException - this should be captured. badRule.mode = "exception" assert(qe.toString.contains("org.apache.spark.sql.AnalysisException")) // Throw an Error - this should not be captured. badRule.mode = "error" val error = intercept[Error](qe.toString) assert(error.getMessage.contains("error")) } }
Example 81
Source File: VariableSubstitutionSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException class VariableSubstitutionSuite extends SparkFunSuite { private lazy val conf = new SQLConf private lazy val sub = new VariableSubstitution(conf) test("system property") { System.setProperty("varSubSuite.var", "abcd") assert(sub.substitute("${system:varSubSuite.var}") == "abcd") } test("environmental variables") { assert(sub.substitute("${env:SPARK_TESTING}") == "1") } test("Spark configuration variable") { conf.setConfString("some-random-string-abcd", "1234abcd") assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") } test("multiple substitutes") { val q = "select ${bar} ${foo} ${doo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "2") conf.setConfString("doo", "3") assert(sub.substitute(q) == "select 1 2 3 this is great") } test("test nested substitutes") { val q = "select ${bar} ${foo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "${bar}") assert(sub.substitute(q) == "select 1 1 this is great") } }
Example 82
Source File: ResolvedDataSourceSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.execution.datasources.DataSource class ResolvedDataSourceSuite extends SparkFunSuite { private def getProvidingClass(name: String): Class[_] = DataSource(sparkSession = null, className = name).providingClass test("jdbc") { assert( getProvidingClass("jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) assert( getProvidingClass("org.apache.spark.sql.jdbc") === classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider]) } test("json") { assert( getProvidingClass("json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) assert( getProvidingClass("org.apache.spark.sql.json") === classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat]) } test("parquet") { assert( getProvidingClass("parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) assert( getProvidingClass("org.apache.spark.sql.parquet") === classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat]) } test("csv") { assert( getProvidingClass("csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) assert( getProvidingClass("com.databricks.spark.csv") === classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat]) } test("error message for unknown data sources") { val error1 = intercept[AnalysisException] { getProvidingClass("avro") } assert(error1.getMessage.contains("Failed to find data source: avro.")) val error2 = intercept[AnalysisException] { getProvidingClass("com.databricks.spark.avro") } assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro.")) val error3 = intercept[ClassNotFoundException] { getProvidingClass("asfdwefasdfasdf") } assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf.")) } }
Example 83
Source File: DDLSourceLoadSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{StringType, StructField, StructType} // please note that the META-INF/services had to be modified for the test directory for this to work class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { test("data sources with the same name") { intercept[RuntimeException] { spark.read.format("Fluet da Bomb").load() } } test("load data source from format alias") { spark.read.format("gathering quorum").load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("specify full classname with duplicate formats") { spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("should fail to load ORC without Hive Support") { val e = intercept[AnalysisException] { spark.read.format("orc").load() } assert(e.message.contains("The ORC data source must be used with Hive support enabled")) } } class FakeSourceOne extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceTwo extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceThree extends RelationProvider with DataSourceRegister { def shortName(): String = "gathering quorum" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } }
Example 84
Source File: OrcFileOperator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 85
Source File: CreateTableAsSelect.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.Experimental import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes} private[hive] case class CreateTableAsSelect( tableDesc: HiveTable, query: LogicalPlan, allowExisting: Boolean) extends RunnableCommand { def database: String = tableDesc.database def tableName: String = tableDesc.name override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withSchema = tableDesc.copy( schema = query.output.map(c => HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)), inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName()))) hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. if (hiveContext.catalog.tableExists(Seq(database, tableName))) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$database.$tableName already exists.") } } else { hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd } Seq.empty[Row] } override def argString: String = { s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString } }
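A spark-shell-style sketch of the user-facing behaviour behind allowExisting: re-running a CTAS against an existing table fails analysis. It assumes an active SparkSession named spark; the table name is a placeholder.

import org.apache.spark.sql.AnalysisException

spark.sql("CREATE TABLE ctas_demo USING parquet AS SELECT 1 AS id")      // placeholder table name
try {
  spark.sql("CREATE TABLE ctas_demo USING parquet AS SELECT 2 AS id")    // the table already exists
} catch {
  case e: AnalysisException => println(e.message)                        // "... already exists"
}
spark.sql("DROP TABLE ctas_demo")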
Example 86
Source File: AbstractSparkSQLDriver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import scala.collection.JavaConversions._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} private[hive] abstract class AbstractSparkSQLDriver( val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.size == 0) { new Schema(new FieldSchema("Response code", "string", "") :: Nil, null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } new Schema(fieldSchemas, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.stringResult() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } } def runWrapper(command: String): CommandProcessorResponseWrapper = try { val result = run(command) new CommandProcessorResponseWrapper(result, null) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponseWrapper(new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null), ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponseWrapper(new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null), cause) } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } } private[hive] case class CommandProcessorResponseWrapper( rc : CommandProcessorResponse, cause : Throwable)
Example 87
Source File: SqlSchemaInferrer.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.inference import ai.deepsense.commons.spark.sql.UserDefinedFunctions import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{AnalysisException, Row} import ai.deepsense.sparkutils.SQL case class SqlInferenceWarning(sqlExpression: String, warningText: String) extends InferenceWarning(s"Schema for SQL formula '$sqlExpression' cannot be inferred ($warningText).") class SqlSchemaInferrer { def inferSchema(sqlExpression: String, inputSchemas: (String, StructType)*) : (StructType, InferenceWarnings) = { try { val localSpark = SQL.createEmptySparkSQLSession() inputSchemas.foreach { case (dataFrameId, schema) => val emptyData = localSpark.sparkContext.parallelize(Seq(Row.empty)) val emptyDf = localSpark.createDataFrame(emptyData, schema) SQL.registerTempTable(emptyDf, dataFrameId) } val resultSchema = localSpark.sql(sqlExpression).schema val warnings = if (!namesUnique(inputSchemas)) { InferenceWarnings(SqlInferenceWarning(sqlExpression, "DataFrame ids must be unique.")) } else if (resultSchema.isEmpty) { InferenceWarnings(SqlInferenceWarning(sqlExpression, "Expression must be non-empty.")) } else { InferenceWarnings.empty } (resultSchema, warnings) } catch { case e @ (_: AnalysisException | _: IllegalArgumentException) => (StructType(Seq.empty), InferenceWarnings(SqlInferenceWarning(sqlExpression, s"Invalid Spark SQL expression: ${e.getMessage}"))) } } private def namesUnique(inputSchemas: Seq[(String, StructType)]): Boolean = { val names = inputSchemas.map { case (name, _) => name} names.size == names.toSet.size } }
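A minimal sketch of the same inference idea outside the Seahorse classes: register an empty DataFrame under the expected id and ask Spark for the result schema only, catching AnalysisException for invalid expressions. It assumes an active SparkSession named spark; the view name and columns are placeholders.

import org.apache.spark.sql.{AnalysisException, Row}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val schema = StructType(Seq(StructField("x", IntegerType), StructField("y", StringType)))
val empty = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
empty.createOrReplaceTempView("df1")

try {
  println(spark.sql("SELECT x, upper(y) AS y FROM df1").schema.treeString)   // schema inferred without reading data
  spark.sql("SELECT missing_column FROM df1")                                // analysis fails before execution
} catch {
  case e: AnalysisException => println(s"cannot infer schema: ${e.getMessage}")
}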
Example 88
Source File: CSVInOutTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.csv import com.salesforce.op.test.TestSparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{AnalysisException, DataFrame} import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CSVInOutTest extends FlatSpec with TestSparkContext { private val csvReader = new CSVInOut(CSVOptions(header = true)) private val csvFile = s"$testDataDir/PassengerDataAllWithHeader.csv" Spec[CSVInOut] should "throw error for bad file paths with DataFrame" in { val error = intercept[AnalysisException](csvReader.readDataFrame("/bad/file/path/read/dataframe")) error.getMessage should endWith ("Path does not exist: file:/bad/file/path/read/dataframe;") } it should "throw error for bad file paths with RDD" in { val error = intercept[AnalysisException](csvReader.readRDD("/bad/file/path/read/rdd")) error.getMessage should endWith ("Path does not exist: file:/bad/file/path/read/rdd;") } it should "read a CSV file to DataFrame" in { val res = csvReader.readDataFrame(csvFile) res shouldBe a[DataFrame] res.count shouldBe 891 } it should "read a CSV file to RDD" in { val res = csvReader.readRDD(csvFile) res shouldBe a[RDD[_]] res.count shouldBe 891 } }
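The missing-path failure exercised above can be handled the same way outside tests; a hedged sketch assuming an active SparkSession named spark and a placeholder path:

import org.apache.spark.sql.{AnalysisException, DataFrame}

def readCsvIfPresent(path: String): Option[DataFrame] =
  try {
    Some(spark.read.option("header", "true").csv(path))
  } catch {
    case e: AnalysisException if e.getMessage.contains("Path does not exist") => None
  }

println(readCsvIfPresent("/bad/file/path/read/dataframe"))   // None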
Example 89
Source File: SparkSQLDriver.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, List => JList} import scala.collection.JavaConversions._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.Logging import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes} private[hive] class SparkSQLDriver( val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: context.QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.size == 0) { new Schema(new FieldSchema("Response code", "string", "") :: Nil, null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "") } new Schema(fieldSchemas, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.stringResult() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 90
Source File: randomExpressions.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian(); """ } }
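On the Spark line shown here, the constructor above means the seed must be an integer literal, so passing a column rejects the query at analysis time; a hedged SQL-level sketch, assuming an active SparkSession named spark:

import org.apache.spark.sql.AnalysisException

spark.sql("SELECT randn(42) AS g FROM range(3)").show()      // an integer literal seed is accepted
try {
  spark.sql("SELECT randn(id) AS g FROM range(3)").show()    // a column reference is not an integer literal
} catch {
  case e: AnalysisException => println(e.message)
}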
Example 91
Source File: KafkaWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.types.{BinaryType, StringType} import org.apache.spark.util.Utils private[kafka010] object KafkaWriter extends Logging { val TOPIC_ATTRIBUTE_NAME: String = "topic" val KEY_ATTRIBUTE_NAME: String = "key" val VALUE_ATTRIBUTE_NAME: String = "value" override def toString: String = "KafkaWriter" def validateQuery( schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.") } else { Literal(topic.get, StringType) } ).dataType match { case StringType => // good case _ => throw new AnalysisException(s"Topic type must be a String") } schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found") ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } } def write( sparkSession: SparkSession, queryExecution: QueryExecution, kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output validateQuery(schema, kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close()) } } }
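How validateQuery surfaces to users of the Kafka sink, as a hedged sketch: it assumes an active SparkSession named spark, the spark-sql-kafka-0-10 package on the classpath, and placeholder broker and topic names.

import org.apache.spark.sql.AnalysisException
import spark.implicits._

val records = Seq(("k1", "v1"), ("k2", "v2")).toDF("key", "value")   // no 'topic' column
try {
  records.write
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092")             // placeholder broker
    .save()                                                          // neither a 'topic' column nor a topic option
} catch {
  case e: AnalysisException => println(e.message)
}

records.write
  .format("kafka")
  .option("kafka.bootstrap.servers", "localhost:9092")
  .option("topic", "events")                                         // placeholder topic; needs a reachable broker
  .save()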
Example 92
Source File: CreateHiveTableAsSelectCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.DataWritingCommand case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, outputColumns: Seq[Attribute], mode: SaveMode) extends DataWritingCommand { private val tableIdentifier = tableDesc.identifier override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { val catalog = sparkSession.sessionState.catalog if (catalog.tableExists(tableIdentifier)) { assert(mode != SaveMode.Overwrite, s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite") if (mode == SaveMode.ErrorIfExists) { throw new AnalysisException(s"$tableIdentifier already exists.") } if (mode == SaveMode.Ignore) { // Since the table already exists and the save mode is Ignore, we will just return. return Seq.empty } InsertIntoHiveTable( tableDesc, Map.empty, query, overwrite = false, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } else { // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. assert(tableDesc.schema.isEmpty) catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false) try { // Read back the metadata of the table which was created just now. val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier) // For CTAS, there is no static partition values to insert. val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap InsertIntoHiveTable( createdTableMeta, partition, query, overwrite = true, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } catch { case NonFatal(e) => // drop the created table. catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
Example 93
Source File: TestHiveSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.test.{TestHiveSingleton, TestHiveSparkSession} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SQLTestUtils class TestHiveSuite extends TestHiveSingleton with SQLTestUtils { test("load test table based on case sensitivity") { val testHiveSparkSession = spark.asInstanceOf[TestHiveSparkSession] withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") { sql("SELECT * FROM SRC").queryExecution.analyzed assert(testHiveSparkSession.getLoadedTables.contains("src")) assert(testHiveSparkSession.getLoadedTables.size == 1) } testHiveSparkSession.reset() withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") { val err = intercept[AnalysisException] { sql("SELECT * FROM SRC").queryExecution.analyzed } assert(err.message.contains("Table or view not found")) } testHiveSparkSession.reset() } test("SPARK-15887: hive-site.xml should be loaded") { assert(hiveClient.getConf("hive.in.test", "") == "true") } }
Example 94
Source File: SparkSQLDriver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 95
Source File: HiveMetastoreLazyInitializationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkFunSuite import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.util.Utils class HiveMetastoreLazyInitializationSuite extends SparkFunSuite { test("lazily initialize Hive client") { val spark = SparkSession.builder() .appName("HiveMetastoreLazyInitializationSuite") .master("local[2]") .enableHiveSupport() .config("spark.hadoop.hive.metastore.uris", "thrift://127.0.0.1:11111") .getOrCreate() val originalLevel = org.apache.log4j.Logger.getRootLogger().getLevel try { // Avoid outputting a lot of expected warning logs spark.sparkContext.setLogLevel("error") // We should be able to run Spark jobs without Hive client. assert(spark.sparkContext.range(0, 1).count() === 1) // Make sure that we are not using the local derby metastore. val exceptionString = Utils.exceptionString(intercept[AnalysisException] { spark.sql("show tables") }) for (msg <- Seq( "show tables", "Could not connect to meta store", "org.apache.thrift.transport.TTransportException", "Connection refused")) { exceptionString.contains(msg) } } finally { spark.sparkContext.setLogLevel(originalLevel.toString) spark.stop() } } }
Example 96
Source File: SchemaUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.types.StructType def checkColumnNameDuplication( columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) if (names.distinct.length != names.length) { val duplicateColumns = names.groupBy(identity).collect { case (x, ys) if ys.length > 1 => s"`$x`" } throw new AnalysisException( s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") } } }
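SchemaUtils is internal to Spark, but the same duplicate-name check fires on ordinary write paths; a hedged sketch of how it typically surfaces, assuming an active SparkSession named spark, the default case-insensitive analysis, and a placeholder output path:

import org.apache.spark.sql.AnalysisException
import spark.implicits._

val df = Seq((1, 2)).toDF("id", "ID")                  // duplicate names under case-insensitive analysis
try {
  df.write.parquet("/tmp/duplicate-columns-sketch")    // placeholder path
} catch {
  case e: AnalysisException => println(e.message)      // expected to mention "Found duplicate column(s)"
}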
Example 97
Source File: StringUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import java.util.regex.{Pattern, PatternSyntaxException} import org.apache.spark.sql.AnalysisException import org.apache.spark.unsafe.types.UTF8String object StringUtils { def filterPattern(names: Seq[String], pattern: String): Seq[String] = { val funcNames = scala.collection.mutable.SortedSet.empty[String] pattern.trim().split("\\|").foreach { subPattern => try { val regex = ("(?i)" + subPattern.replaceAll("\\*", ".*")).r funcNames ++= names.filter{ name => regex.pattern.matcher(name).matches() } } catch { case _: PatternSyntaxException => } } funcNames.toSeq } }
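filterPattern backs the LIKE clause of SHOW commands, so the same pattern syntax ('*' as a wildcard, '|' between alternatives, matched case-insensitively) can be exercised from SQL; a small sketch assuming an active SparkSession named spark:

spark.sql("SHOW FUNCTIONS LIKE 'max*|min*'").show(truncate = false)
spark.sql("SHOW TABLES LIKE 'tmp*|stag*'").show(truncate = false)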
Example 98
Source File: SchemaUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.util import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis._ import org.apache.spark.sql.types.StructType class SchemaUtilsSuite extends SparkFunSuite { private def resolver(caseSensitiveAnalysis: Boolean): Resolver = { if (caseSensitiveAnalysis) { caseSensitiveResolution } else { caseInsensitiveResolution } } Seq((true, ("a", "a"), ("b", "b")), (false, ("a", "A"), ("b", "B"))).foreach { case (caseSensitive, (a0, a1), (b0, b1)) => val testType = if (caseSensitive) "case-sensitive" else "case-insensitive" test(s"Check column name duplication in $testType cases") { def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = { val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " + duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ") val schema = StructType.fromDDL(schemaStr) var msg = intercept[AnalysisException] { SchemaUtils.checkSchemaColumnNameDuplication( schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) }.getMessage assert(msg.contains(expectedErrorMsg)) msg = intercept[AnalysisException] { SchemaUtils.checkColumnNameDuplication( schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive)) }.getMessage assert(msg.contains(expectedErrorMsg)) msg = intercept[AnalysisException] { SchemaUtils.checkColumnNameDuplication( schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) }.getMessage assert(msg.contains(expectedErrorMsg)) } checkExceptionCases(s"$a0 INT, b INT, $a1 INT", a0 :: Nil) checkExceptionCases(s"$a0 INT, b INT, $a1 INT, $a0 INT", a0 :: Nil) checkExceptionCases(s"$a0 INT, $b0 INT, $a1 INT, $a0 INT, $b1 INT", b0 :: a0 :: Nil) } } test("Check no exception thrown for valid schemas") { def checkNoExceptionCases(schemaStr: String, caseSensitive: Boolean): Unit = { val schema = StructType.fromDDL(schemaStr) SchemaUtils.checkSchemaColumnNameDuplication( schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) SchemaUtils.checkColumnNameDuplication( schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive)) SchemaUtils.checkColumnNameDuplication( schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive) } checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = true) checkNoExceptionCases("Aa INT, b INT, aA INT", caseSensitive = true) checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = false) } }
Example 99
Source File: AnalysisTest.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import java.net.URI import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog} import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.internal.SQLConf trait AnalysisTest extends PlanTest { protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true) protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false) private def makeAnalyzer(caseSensitive: Boolean): Analyzer = { val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive) val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf) catalog.createDatabase( CatalogDatabase("default", "", new URI("loc"), Map.empty), ignoreIfExists = false) catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true) catalog.createTempView("TaBlE2", TestRelations.testRelation2, overrideIfExists = true) catalog.createTempView("TaBlE3", TestRelations.testRelation3, overrideIfExists = true) new Analyzer(catalog, conf) { override val extendedResolutionRules = EliminateSubqueryAliases :: Nil } } protected def getAnalyzer(caseSensitive: Boolean) = { if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer } protected def checkAnalysis( inputPlan: LogicalPlan, expectedPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val actualPlan = analyzer.executeAndCheck(inputPlan) comparePlans(actualPlan, expectedPlan) } protected override def comparePlans( plan1: LogicalPlan, plan2: LogicalPlan, checkAnalysis: Boolean = false): Unit = { // Analysis tests may have not been fully resolved, so skip checkAnalysis. super.comparePlans(plan1, plan2, checkAnalysis) } protected def assertAnalysisSuccess( inputPlan: LogicalPlan, caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val analysisAttempt = analyzer.execute(inputPlan) try analyzer.checkAnalysis(analysisAttempt) catch { case a: AnalysisException => fail( s""" |Failed to Analyze Plan |$inputPlan | |Partial Analysis |$analysisAttempt """.stripMargin, a) } } protected def assertAnalysisError( inputPlan: LogicalPlan, expectedErrors: Seq[String], caseSensitive: Boolean = true): Unit = { val analyzer = getAnalyzer(caseSensitive) val e = intercept[AnalysisException] { analyzer.checkAnalysis(analyzer.execute(inputPlan)) } if (!expectedErrors.map(_.toLowerCase(Locale.ROOT)).forall( e.getMessage.toLowerCase(Locale.ROOT).contains)) { fail( s"""Exception message should contain the following substrings: | | ${expectedErrors.mkString("\n ")} | |Actual exception message: | | ${e.getMessage} """.stripMargin) } } }
Example 100
Source File: ResolveInlineTablesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types.{LongType, NullType, TimestampType} class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(conf)(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables(conf).convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("convert TimeZoneAwareExpression") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType)))) val withTimeZone = ResolveTimeZone(conf).apply(table) val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone) val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType) .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long] assert(output.map(_.dataType) == Seq(TimestampType)) assert(data.size == 1) assert(data.head.getLong(0) == correct) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables(conf).convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables(conf).convert(table2) assert(converted2.schema.fields(0).nullable) } }
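The validations above correspond to inline tables written with VALUES; a hedged SQL-level sketch, assuming an active SparkSession named spark:

import org.apache.spark.sql.AnalysisException

spark.sql("SELECT * FROM VALUES (1, 'a'), (2, 'b') AS t(id, name)").show()   // a well-formed inline table

try {
  spark.sql("SELECT * FROM VALUES (1, 'a'), (2) AS t(id, name)").show()      // rows with different widths
} catch {
  case e: AnalysisException => println(e.message)
}

try {
  spark.sql("SELECT * FROM VALUES (rand()) AS t(x)").show()                  // nondeterministic values are rejected
} catch {
  case e: AnalysisException => println(e.message)
}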
Example 101
Source File: ResolveSubquerySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference} import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project} class ResolveSubquerySuite extends AnalysisTest { val a = 'a.int val b = 'b.int val t1 = LocalRelation(a) val t2 = LocalRelation(b) test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") { val expr = Filter(In(a, Seq(ListQuery(Project(Seq(UnresolvedAttribute("a")), t2)))), t1) val m = intercept[AnalysisException] { SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr)) }.getMessage assert(m.contains( "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses")) } }
Example 102
Source File: CheckCartesianProductsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.scalatest.Matchers._ import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED class CheckCartesianProductsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil } val testRelation1 = LocalRelation('a.int, 'b.int) val testRelation2 = LocalRelation('c.int, 'd.int) val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter) val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists)) test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") { withSQLConf(CROSS_JOINS_ENABLED.key -> "true") { noException should be thrownBy { for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) { performCartesianProductCheck(joinType) } } } } test("CheckCartesianProducts throws an exception for join types that require a join condition") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { val thrownException = the [AnalysisException] thrownBy { performCartesianProductCheck(joinType) } assert(thrownException.message.contains("Detected implicit cartesian product")) } } } test("CheckCartesianProducts doesn't throw an exception if a join condition is present") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType, Some('a === 'd)) } } } } test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") { withSQLConf(CROSS_JOINS_ENABLED.key -> "false") { for (joinType <- joinTypesWithoutRequiredCondition) { noException should be thrownBy { performCartesianProductCheck(joinType) } } } } private def performCartesianProductCheck( joinType: JoinType, condition: Option[Expression] = None): Unit = { val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze val optimizedPlan = Optimize.execute(analyzedPlan) comparePlans(analyzedPlan, optimizedPlan) } }
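The same check is visible from the DataFrame API; a hedged sketch assuming an active SparkSession named spark (the exception is raised when the optimizer runs, i.e. at the action):

import org.apache.spark.sql.AnalysisException
import spark.implicits._

val left = Seq(1, 2).toDF("a")
val right = Seq(3, 4).toDF("d")

spark.conf.set("spark.sql.crossJoin.enabled", "false")
try {
  left.join(right).count()                              // no join condition -> implicit cartesian product
} catch {
  case e: AnalysisException => println(e.message)       // "Detected implicit cartesian product ..."
}

left.crossJoin(right).count()                           // an explicit cross join is always allowed
left.join(right, left("a") === right("d")).count()      // a join condition also satisfies the check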
Example 103
Source File: JdbcRelationProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { import JDBCOptions._ val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn.isEmpty) { assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " + s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty") null } else { assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty, s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " + s"'$JDBC_NUM_PARTITIONS' are also required") JDBCPartitioningInfo( partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val options = new JDBCOptions(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() try { val tableExists = JdbcUtils.tableExists(conn, options) if (tableExists) { mode match { case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, options.table) createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } case SaveMode.Append => val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
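The ErrorIfExists branch above is what a second write against an existing table runs into; a hedged sketch with placeholder URL, table, and credentials, assuming an active SparkSession named spark, a reachable database, and the JDBC driver on the classpath:

import org.apache.spark.sql.{AnalysisException, SaveMode}
import spark.implicits._

val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

def writeOnce(mode: SaveMode): Unit =
  df.write.format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/testdb")   // placeholder URL
    .option("dbtable", "public.people")                         // placeholder table
    .option("user", "test").option("password", "test")          // placeholder credentials
    .mode(mode)
    .save()

writeOnce(SaveMode.Overwrite)                                    // creates or replaces the table
try writeOnce(SaveMode.ErrorIfExists) catch {                    // the table now exists
  case e: AnalysisException => println(e.message)
}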
Example 104
Source File: AnalyzeTableCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.sql.{AnalysisException, Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.CatalogTableType case class AnalyzeTableCommand( tableIdent: TableIdentifier, noscan: Boolean = true) extends RunnableCommand { override def run(sparkSession: SparkSession): Seq[Row] = { val sessionState = sparkSession.sessionState val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase) val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db)) val tableMeta = sessionState.catalog.getTableMetadata(tableIdentWithDB) if (tableMeta.tableType == CatalogTableType.VIEW) { throw new AnalysisException("ANALYZE TABLE is not supported on views.") } // Compute stats for the whole table val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta) val newRowCount = if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count())) // Update the metastore if the above statistics of the table are different from those // recorded in the metastore. val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount) if (newStats.isDefined) { sessionState.catalog.alterTableStats(tableIdentWithDB, newStats) } Seq.empty[Row] } }
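The SQL entry points for this command, sketched under the assumption of an active SparkSession named spark and placeholder table and view names:

import org.apache.spark.sql.AnalysisException

spark.range(10).write.saveAsTable("stats_demo")                        // placeholder table name
spark.sql("ANALYZE TABLE stats_demo COMPUTE STATISTICS NOSCAN")        // total size only, i.e. noscan = true above
spark.sql("ANALYZE TABLE stats_demo COMPUTE STATISTICS")               // additionally scans for the row count

spark.sql("CREATE VIEW stats_demo_view AS SELECT * FROM stats_demo")
try {
  spark.sql("ANALYZE TABLE stats_demo_view COMPUTE STATISTICS")        // views are rejected, as in the command above
} catch {
  case e: AnalysisException => println(e.message)
}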
Example 105
Source File: JdbcUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.parser.ParseException import org.apache.spark.sql.types._ class JdbcUtilsSuite extends SparkFunSuite { val tableSchema = StructType(Seq( StructField("C1", StringType, false), StructField("C2", IntegerType, false))) val caseSensitive = org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution val caseInsensitive = org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution test("Parse user specified column types") { assert(JdbcUtils.getCustomSchema(tableSchema, null, caseInsensitive) === tableSchema) assert(JdbcUtils.getCustomSchema(tableSchema, "", caseInsensitive) === tableSchema) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseInsensitive) === StructType(Seq(StructField("C1", DateType, false), StructField("C2", IntegerType, false)))) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseSensitive) === StructType(Seq(StructField("C1", StringType, false), StructField("C2", IntegerType, false)))) assert( JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseInsensitive) === StructType(Seq(StructField("C1", DateType, false), StructField("C2", StringType, false)))) assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseSensitive) === StructType(Seq(StructField("C1", StringType, false), StructField("C2", StringType, false)))) // Throw AnalysisException val duplicate = intercept[AnalysisException]{ JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, c1 STRING", caseInsensitive) === StructType(Seq(StructField("c1", DateType, false), StructField("c1", StringType, false))) } assert(duplicate.getMessage.contains( "Found duplicate column(s) in the customSchema option value")) // Throw ParseException val dataTypeNotSupported = intercept[ParseException]{ JdbcUtils.getCustomSchema(tableSchema, "c3 DATEE, C2 STRING", caseInsensitive) === StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false))) } assert(dataTypeNotSupported.getMessage.contains("DataType datee is not supported")) val mismatchedInput = intercept[ParseException]{ JdbcUtils.getCustomSchema(tableSchema, "c3 DATE. C2 STRING", caseInsensitive) === StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false))) } assert(mismatchedInput.getMessage.contains("mismatched input '.' expecting")) } }
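The customSchema option parsed above overrides the Catalyst types derived from the JDBC metadata for the listed columns; a hedged read sketch with placeholder connection details, assuming an active SparkSession named spark (duplicate names in the option value are rejected with an AnalysisException, as the test shows):

val people = spark.read.format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/testdb")   // placeholder URL
  .option("dbtable", "public.people")                         // placeholder table
  .option("customSchema", "id DECIMAL(38, 0), name STRING")   // other columns keep their derived types
  .load()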
Example 106
Source File: QueryExecutionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation} import org.apache.spark.sql.test.SharedSQLContext class QueryExecutionSuite extends SharedSQLContext { test("toString() exception/error handling") { spark.experimental.extraStrategies = Seq( new SparkStrategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil }) def qe: QueryExecution = new QueryExecution(spark, OneRowRelation()) // Nothing! assert(qe.toString.contains("OneRowRelation")) // Throw an AnalysisException - this should be captured. spark.experimental.extraStrategies = Seq( new SparkStrategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = throw new AnalysisException("exception") }) assert(qe.toString.contains("org.apache.spark.sql.AnalysisException")) // Throw an Error - this should not be captured. spark.experimental.extraStrategies = Seq( new SparkStrategy { override def apply(plan: LogicalPlan): Seq[SparkPlan] = throw new Error("error") }) val error = intercept[Error](qe.toString) assert(error.getMessage.contains("error")) } }
Example 107
Source File: VariableSubstitutionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.internal import org.apache.spark.SparkFunSuite import org.apache.spark.sql.AnalysisException class VariableSubstitutionSuite extends SparkFunSuite { private lazy val conf = new SQLConf private lazy val sub = new VariableSubstitution(conf) test("system property") { System.setProperty("varSubSuite.var", "abcd") assert(sub.substitute("${system:varSubSuite.var}") == "abcd") } test("environmental variables") { assert(sub.substitute("${env:SPARK_TESTING}") == "1") } test("Spark configuration variable") { conf.setConfString("some-random-string-abcd", "1234abcd") assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd") assert(sub.substitute("${some-random-string-abcd}") == "1234abcd") } test("multiple substitutes") { val q = "select ${bar} ${foo} ${doo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "2") conf.setConfString("doo", "3") assert(sub.substitute(q) == "select 1 2 3 this is great") } test("test nested substitutes") { val q = "select ${bar} ${foo} this is great" conf.setConfString("bar", "1") conf.setConfString("foo", "${bar}") assert(sub.substitute(q) == "select 1 1 this is great") } }
Example 108
Source File: TextSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.text import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row} import org.apache.spark.util.Utils class TextSuite extends QueryTest with SharedSQLContext { test("reading text file") { verifyFrame(sqlContext.read.format("text").load(testFile)) } test("SQLContext.read.text() API") { verifyFrame(sqlContext.read.text(testFile)) } test("SPARK-12562 verify write.text() can handle column name beyond `value`") { val df = sqlContext.read.text(testFile).withColumnRenamed("value", "adwrasdf") val tempFile = Utils.createTempDir() tempFile.delete() df.write.text(tempFile.getCanonicalPath) verifyFrame(sqlContext.read.text(tempFile.getCanonicalPath)) Utils.deleteRecursively(tempFile) } test("error handling for invalid schema") { val tempFile = Utils.createTempDir() tempFile.delete() val df = sqlContext.range(2) intercept[AnalysisException] { df.write.text(tempFile.getCanonicalPath) } intercept[AnalysisException] { sqlContext.range(2).select(df("id"), df("id") + 1).write.text(tempFile.getCanonicalPath) } } private def testFile: String = { Thread.currentThread().getContextClassLoader.getResource("text-suite.txt").toString } private def verifyFrame(df: DataFrame): Unit = { // schema assert(df.schema == new StructType().add("value", StringType)) // verify content val data = df.collect() assert(data(0) == Row("This is a test file for the text data source")) assert(data(1) == Row("1+1")) // non ascii characters are not allowed in the code, so we disable the scalastyle here. // scalastyle:off assert(data(2) == Row("数据砖头")) // scalastyle:on assert(data(3) == Row("\"doh\"")) assert(data.length == 4) } }
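The invalid-schema cases above come from the text sink accepting only a single string column; a hedged sketch assuming an active SparkSession named spark and a placeholder output path:

import org.apache.spark.sql.AnalysisException

val out = "/tmp/text-sink-sketch"                        // placeholder output path
try {
  spark.range(2).write.text(out)                         // a bigint column is rejected by the text sink
} catch {
  case e: AnalysisException => println(e.getMessage)
}
spark.range(2).selectExpr("CAST(id AS STRING)").write.mode("overwrite").text(out)   // casting to string works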
Example 109
Source File: SqlSchemaInferrer.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.inference import io.deepsense.commons.spark.sql.UserDefinedFunctions import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{AnalysisException, Row} import io.deepsense.sparkutils.SQL case class SqlInferenceWarning(sqlExpression: String, warningText: String) extends InferenceWarning(s"Schema for SQL formula '$sqlExpression' cannot be inferred ($warningText).") class SqlSchemaInferrer { def inferSchema(sqlExpression: String, inputSchemas: (String, StructType)*) : (StructType, InferenceWarnings) = { try { val localSpark = SQL.createEmptySparkSQLSession() UserDefinedFunctions.registerFunctions(localSpark.udfRegistration) inputSchemas.foreach { case (dataFrameId, schema) => val emptyData = localSpark.sparkContext.parallelize(Seq(Row.empty)) val emptyDf = localSpark.createDataFrame(emptyData, schema) SQL.registerTempTable(emptyDf, dataFrameId) } val resultSchema = localSpark.sql(sqlExpression).schema val warnings = if (!namesUnique(inputSchemas)) { InferenceWarnings(SqlInferenceWarning(sqlExpression, "DataFrame ids must be unique.")) } else if (resultSchema.isEmpty) { InferenceWarnings(SqlInferenceWarning(sqlExpression, "Expression must be non-empty.")) } else { InferenceWarnings.empty } (resultSchema, warnings) } catch { case e @ (_: AnalysisException | _: IllegalArgumentException) => (StructType(Seq.empty), InferenceWarnings(SqlInferenceWarning(sqlExpression, s"Invalid Spark SQL expression: ${e.getMessage}"))) } } private def namesUnique(inputSchemas: Seq[(String, StructType)]): Boolean = { val names = inputSchemas.map { case (name, _) => name} names.size == names.toSet.size } }
Example 110
Source File: ExpectsGenotypeFieldsSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql.util import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.functions._ import io.projectglow.Glow import io.projectglow.functions._ import io.projectglow.sql.GlowBaseTest class ExpectsGenotypeFieldsSuite extends GlowBaseTest { lazy val gatkTestVcf = s"$testDataHome/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf" lazy val sess = spark // This is how we originally detected an issue where ExpectsGenotypeFields succeeds during // resolution but fails during physical planning. // PR: https://github.com/projectglow/glow/pull/224 test("use genotype_states after splitting multiallelics") { val df = spark.read.format("vcf").load(gatkTestVcf) val split = Glow.transform("split_multiallelics", df) split.select(genotype_states(col("genotypes"))).collect() } test("use genotype_states after array_zip") { import sess.implicits._ val df = spark .createDataFrame(Seq((Seq("a"), Seq(Seq(1, 1))))) .withColumnRenamed("_1", "sampleId") .withColumnRenamed("_2", "calls") val zipped = df.select(arrays_zip(col("sampleId"), col("calls")).as("genotypes")) val states = zipped.select(genotype_states(col("genotypes"))) assert(states.as[Seq[Int]].head == Seq(2)) } test("type check") { val df = spark.createDataFrame(Seq(Tuple1("a"))).withColumnRenamed("_1", "sampleId") val withGenotypes = df.select(array(struct("sampleId")).as("genotypes")) val ex = intercept[AnalysisException](withGenotypes.select(genotype_states(col("genotypes")))) assert(ex.message.contains("Genotype struct was missing required fields: (name: calls")) } }
Example 111
Source File: SqlExtensionProviderSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Literal, UnaryExpression} import org.apache.spark.sql.types.{DataType, IntegerType} import io.projectglow.GlowSuite class SqlExtensionProviderSuite extends GlowSuite { override def beforeAll(): Unit = { super.beforeAll() SqlExtensionProvider.registerFunctions( spark.sessionState.conf, spark.sessionState.functionRegistry, "test-functions.yml") } private lazy val sess = spark test("one arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("one_arg_test(id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("one_arg_test()").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("one_arg_test(id, id)").collect() } } test("two arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("two_arg_test(id, id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("two_arg_test(id)").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("two_arg_test(id, id, id)").collect() } } test("var args function") { import sess.implicits._ assert(spark.range(1).selectExpr("var_args_test(id, id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("var_args_test(id, id, id, id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("var_args_test(id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("var_args_test()").collect() } } test("can call optional arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("optional_arg_test(id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("optional_arg_test(id, id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("optional_arg_test()").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("optional_arg_test(id, id, id)").collect() } } } trait TestExpr extends Expression with CodegenFallback { override def dataType: DataType = IntegerType override def nullable: Boolean = true override def eval(input: InternalRow): Any = 1 } case class OneArgExpr(child: Expression) extends UnaryExpression with TestExpr case class TwoArgExpr(left: Expression, right: Expression) extends BinaryExpression with TestExpr case class VarArgsExpr(arg: Expression, varArgs: Seq[Expression]) extends TestExpr { override def children: Seq[Expression] = arg +: varArgs } case class OptionalArgExpr(required: Expression, optional: Expression) extends TestExpr { def this(required: Expression) = this(required, Literal(1)) override def children: Seq[Expression] = Seq(required, optional) }
Example 112
Source File: StringColumnConstraintTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import de.frosner.ddq.core.Check import de.frosner.ddq.testutils.{SparkContexts, TestData} import org.apache.spark.sql.AnalysisException import org.scalatest.{FlatSpec, Matchers} class StringColumnConstraintTest extends FlatSpec with Matchers with SparkContexts { "A StringColumnConstraint" should "succeed if all rows satisfy the given condition" in { val constraintString = "column > 0" val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString) val constraint = check.constraints.head val result = StringColumnConstraintResult( constraint = StringColumnConstraint(constraintString), data = Some(StringColumnConstraintResultData(failedRows = 0L)), status = ConstraintSuccess ) check.run().constraintResults shouldBe Map(constraint -> result) } it should "fail if there are rows that do not satisfy the given condition" in { val constraintString = "column > 1" val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString) val constraint = check.constraints.head val result = StringColumnConstraintResult( constraint = StringColumnConstraint(constraintString), data = Some(StringColumnConstraintResultData(failedRows = 1L)), status = ConstraintFailure ) check.run().constraintResults shouldBe Map(constraint -> result) } it should "error if the column does not exist" in { val constraintString = "notExisting > 0" val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString) val constraint = check.constraints.head val result = check.run().constraintResults(constraint) result match { case StringColumnConstraintResult( StringColumnConstraint("notExisting > 0"), None, constraintError: ConstraintError ) => { val analysisException = constraintError.throwable.asInstanceOf[AnalysisException] analysisException.message shouldBe "cannot resolve '`notExisting`' given input columns: [column]" } } } "A StringColumnConstraintResult" should "have the correct success message" in { val constraint = StringColumnConstraint("column > 0") val result = StringColumnConstraintResult( constraint = constraint, data = Some(StringColumnConstraintResultData(failedRows = 0L)), status = ConstraintSuccess ) result.message shouldBe "Constraint column > 0 is satisfied." } it should "have the correct failure message (one row)" in { val constraint = StringColumnConstraint("column > 0") val result = StringColumnConstraintResult( constraint = constraint, data = Some(StringColumnConstraintResultData(failedRows = 1L)), status = ConstraintFailure ) result.message shouldBe "1 row did not satisfy constraint column > 0." } it should "have the correct failure message (multiple rows)" in { val constraint = StringColumnConstraint("column > 0") val result = StringColumnConstraintResult( constraint = constraint, data = Some(StringColumnConstraintResultData(failedRows = 2L)), status = ConstraintFailure ) result.message shouldBe "2 rows did not satisfy constraint column > 0."
} it should "have the correct error message" in { val constraint = StringColumnConstraint("column > 0") val result = StringColumnConstraintResult( constraint = constraint, data = None, status = ConstraintError(new IllegalArgumentException("error")) ) result.message shouldBe "Checking constraint column > 0 failed: java.lang.IllegalArgumentException: error" } it should "throw an exception if it is created with an illegal combination of fields" in { intercept[IllegalConstraintResultException] { StringColumnConstraintResult( constraint = StringColumnConstraint("column > 0"), status = ConstraintFailure, data = None ) } } }
Example 113
Source File: CustomConstraintTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.AnalysisException
import org.scalatest.{FlatSpec, Matchers}

class CustomConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "A CustomConstraint" should "succeed if the function returns a success message" in {
    val constraintName = "name"
    val successMsg = "success"
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => Right(successMsg)
    })
    val constraint = check.constraints.head
    val result = CustomConstraintResult(
      constraint = constraint.asInstanceOf[CustomConstraint],
      message = s"Custom constraint '$constraintName' succeeded: $successMsg",
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if the function returns a failure message" in {
    val constraintName = "name"
    val failureMsg = "failure"
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => Left(failureMsg)
    })
    val constraint = check.constraints.head
    val result = CustomConstraintResult(
      constraint = constraint.asInstanceOf[CustomConstraint],
      message = s"Custom constraint '$constraintName' failed: $failureMsg",
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "error if the function throws an exception" in {
    val constraintName = "name"
    val exception = new Exception()
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => throw exception
    })
    val constraint = check.constraints.head
    val result = check.run().constraintResults(constraint)
    result match {
      case CustomConstraintResult(
        customConstraint: CustomConstraint,
        "Custom constraint 'name' errored: java.lang.Exception",
        constraintError: ConstraintError
      ) => {
        constraintError.throwable shouldBe exception
      }
    }
  }

}
Example 114
Source File: AlwaysNullConstraintTest.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.AnalysisException
import org.scalatest.{FlatSpec, Matchers}

class AlwaysNullConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "An AlwaysNullConstraint" should "succeed if the column is always null" in {
    val column = "column"
    val check = Check(TestData.makeNullableStringDf(spark, List(null, null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = AlwaysNullConstraintResult(
      constraint = AlwaysNullConstraint(column),
      data = Some(AlwaysNullConstraintResultData(nonNullRows = 0L)),
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if the column is not always null" in {
    val column = "column"
    val check = Check(TestData.makeNullableStringDf(spark, List("a", null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = AlwaysNullConstraintResult(
      constraint = AlwaysNullConstraint(column),
      data = Some(AlwaysNullConstraintResultData(nonNullRows = 1L)),
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "error if the column is not existing" in {
    val column = "notExisting"
    val check = Check(TestData.makeNullableStringDf(spark, List("a", null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = check.run().constraintResults(constraint)
    result match {
      case AlwaysNullConstraintResult(
        AlwaysNullConstraint("notExisting"),
        constraintError: ConstraintError,
        None
      ) =>
        val analysisException = constraintError.throwable.asInstanceOf[AnalysisException]
        analysisException.message shouldBe "cannot resolve '`notExisting`' given input columns: [column]"
    }
  }

  "An AlwaysNullConstraintResult" should "have the correct success message" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintSuccess,
      data = Some(AlwaysNullConstraintResultData(0L))
    )
    result.message shouldBe "Column c is always null."
  }

  it should "have the correct failure message (one row)" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintFailure,
      data = Some(AlwaysNullConstraintResultData(1L))
    )
    result.message shouldBe "Column c contains 1 non-null row (should always be null)."
  }

  it should "have the correct failure message (multiple rows)" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintFailure,
      data = Some(AlwaysNullConstraintResultData(2L))
    )
    result.message shouldBe "Column c contains 2 non-null rows (should always be null)."
  }

  it should "have the correct error message" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintError(new IllegalArgumentException("column c not found")),
      data = None
    )
    result.message shouldBe "Checking column c for being always null failed: " +
      "java.lang.IllegalArgumentException: column c not found"
  }

  it should "throw an exception if it is created with an illegal combination of fields" in {
    intercept[IllegalConstraintResultException] {
      AlwaysNullConstraintResult(
        constraint = AlwaysNullConstraint("c"),
        status = ConstraintFailure,
        data = None
      )
    }
  }

}
Example 115
Source File: MergeClauseSuite.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.merge

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, functions}

class MergeClauseSuite extends SparkFunSuite {

  def insertClause(addCondition: Boolean = true): MergeWhenNotInsert = {
    if (addCondition) {
      MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
        Seq(functions.col("x").expr, functions.col("y").expr))
    } else {
      MergeWhenNotInsert(None,
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
  }

  def updateClause(addCondition: Boolean = true): MergeWhenUpdateClause = {
    if (addCondition) {
      val updateCondition = Some(functions.expr("a > 2").expr)
      MergeWhenUpdateClause(updateCondition,
        Map("b" -> functions.lit(3).expr), isStar = false)
    } else {
      MergeWhenUpdateClause(None,
        Map("b" -> functions.lit(3).expr), isStar = false)
    }
  }

  def deleteClause(addCondition: Boolean = true): MergeWhenDelete = {
    if (addCondition) {
      MergeWhenDelete(Some(functions.expr("a < 1").expr))
    } else {
      MergeWhenDelete(None)
    }
  }

  test("Validate MergeClauses") {
    val clauses = Seq(insertClause(), updateClause(), deleteClause())
    MergeWhenClause.validate(clauses)
  }

  test("Invalid MergeClause cases") {
    val invalidMerge = "MERGE Validation Error: "

    // empty clauses
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq())

    // multi update or insert clauses
    val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses)

    // multi match clauses with first clause without condition
    val invalidMultiMatch = Seq(updateClause(false), deleteClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch)

    // invalid Update Clause
    val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false)
    val thrown = intercept[IllegalArgumentException] {
      MergeWhenClause.validate(Seq(invalidUpdateClause))
    }
    assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values")
  }

  private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = {
    val thrown = intercept[AnalysisException] {
      MergeWhenClause.validate(multiUpdateClauses)
    }
    assert(thrown.message === invalidMessage)
  }
}
Example 116
Source File: HiveAcidUtils.scala From spark-acid with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive

import scala.collection.JavaConverters._

import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute}

object HiveAcidUtils {

  def prunePartitionsByFilter(
      hiveAcidMetadata: HiveAcidMetadata,
      inputPartitions: Seq[CatalogTablePartition],
      predicates: Option[Expression],
      defaultTimeZoneId: String): Seq[CatalogTablePartition] = {
    if (predicates.isEmpty) {
      inputPartitions
    } else {
      val partitionSchema = hiveAcidMetadata.partitionSchema
      val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet

      val nonPartitionPruningPredicates = predicates.filterNot {
        _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
      }
      if (nonPartitionPruningPredicates.nonEmpty) {
        throw new AnalysisException("Expected only partition pruning predicates: " +
          nonPartitionPruningPredicates)
      }

      val boundPredicate = InterpretedPredicate.create(predicates.get.transform {
        case att: Attribute =>
          val index = partitionSchema.indexWhere(_.name == att.name)
          BoundReference(index, partitionSchema(index).dataType, nullable = true)
      })

      inputPartitions.filter { p =>
        boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId))
      }
    }
  }

  def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = {
    val apiPartition = hp.getTPartition
    val properties: Map[String, String] = if (hp.getParameters != null) {
      hp.getParameters.asScala.toMap
    } else {
      Map.empty
    }
    CatalogTablePartition(
      spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty),
      storage = CatalogStorageFormat(
        locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)),
        inputFormat = Option(apiPartition.getSd.getInputFormat),
        outputFormat = Option(apiPartition.getSd.getOutputFormat),
        serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib),
        compressed = apiPartition.getSd.isCompressed,
        properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
          .map(_.asScala.toMap).orNull),
      createTime = apiPartition.getCreateTime.toLong * 1000,
      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
      parameters = properties,
      stats = None) // TODO: need to implement readHiveStats
  }
}
Example 117
Source File: ResolvedDataSourceSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {

  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
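The same lookup logic is reachable through the public DataFrameReader, so application code usually meets these errors via spark.read.format(...). A hedged sketch (assuming an existing SparkSession named spark, an illustrative path, and that no Avro package is on the classpath):

import org.apache.spark.sql.AnalysisException

try {
  spark.read.format("avro").load("/tmp/does-not-exist")
} catch {
  case ae: AnalysisException =>
    // In Spark 2.3.x this surfaces the "Failed to find data source: avro" message asserted above.
    println(ae.getMessage)
}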
Example 118
Source File: AnalysisTest.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.{TableIdentifier, SimpleCatalystConf}

trait AnalysisTest extends PlanTest {

  val (caseSensitiveAnalyzer, caseInsensitiveAnalyzer) = {
    val caseSensitiveConf = new SimpleCatalystConf(true)
    val caseInsensitiveConf = new SimpleCatalystConf(false)

    val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf)
    val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf)

    caseSensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation)
    caseInsensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation)

    new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) {
      override val extendedResolutionRules = EliminateSubQueries :: Nil
    } ->
      new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseInsensitiveConf) {
        override val extendedResolutionRules = EliminateSubQueries :: Nil
      }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    analyzer.checkAnalysis(analyzer.execute(inputPlan))
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }
    assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains),
      s"Expected to throw Exception contains: ${expectedErrors.mkString(", ")}, " +
        s"actually we get ${e.getMessage}")
  }
}
Example 119
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();
    """
  }
}
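The secondary constructor above is what rejects non-integer seeds; when the expression is built from SQL, that rejection reaches the caller as an AnalysisException. A rough sketch of triggering it from the SQL layer (assuming a SparkSession named spark; the exact wrapping and message wording depend on the Spark version):

import org.apache.spark.sql.AnalysisException

try {
  spark.sql("SELECT randn('not-a-seed')").show()
} catch {
  case ae: AnalysisException => println(ae.getMessage)
}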
Example 120
Source File: SparkSQLDriver.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}

import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 121
Source File: CreateTableAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes, MetastoreRelation}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}

private[hive] case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean)
  extends RunnableCommand {

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def children: Seq[LogicalPlan] = Seq(query)

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat =
        tableDesc.copy(
          inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat = tableDesc.outputFormat
            .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
      } else {
        withFormat
      }

      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(tableIdentifier, None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, TableName: ${tableDesc.name}, InsertIntoHiveTable]"
  }
}
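The allowExisting branch above corresponds to CREATE TABLE ... AS SELECT with and without IF NOT EXISTS. A hedged sketch of the user-facing behaviour this command implements (assuming a Hive-enabled SparkSession named spark and a throwaway table name; exact error text varies by Spark version):

import org.apache.spark.sql.AnalysisException

spark.sql("CREATE TABLE ctas_demo AS SELECT 1 AS id")
try {
  // Without IF NOT EXISTS, a second CTAS on the same name is rejected with an "already exists" error.
  spark.sql("CREATE TABLE ctas_demo AS SELECT 1 AS id")
} catch {
  case ae: AnalysisException => println(ae.getMessage)
}
// With IF NOT EXISTS the statement becomes a no-op, matching the allowExisting case above.
spark.sql("CREATE TABLE IF NOT EXISTS ctas_demo AS SELECT 1 AS id")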
Example 122
Source File: CreateViewAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}

// TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different
// from Hive and may not work for some cases like create view on self join.
private[hive] case class CreateViewAsSelect(
    tableDesc: HiveTable,
    childSchema: Seq[Attribute],
    allowExisting: Boolean,
    orReplace: Boolean) extends RunnableCommand {

  assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length)
  assert(tableDesc.viewText.isDefined)

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]

    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // view already exists, will do nothing, to keep consistent with Hive
      } else if (orReplace) {
        hiveContext.catalog.client.alertView(prepareTable())
      } else {
        throw new AnalysisException(s"View $tableIdentifier already exists. " +
          "If you want to update the view definition, please use ALTER VIEW AS or " +
          "CREATE OR REPLACE VIEW AS")
      }
    } else {
      hiveContext.catalog.client.createView(prepareTable())
    }

    Seq.empty[Row]
  }

  private def prepareTable(): HiveTable = {
    // setup column types according to the schema of child.
    val schema = if (tableDesc.schema == Nil) {
      childSchema.map { attr =>
        HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null)
      }
    } else {
      childSchema.zip(tableDesc.schema).map { case (attr, col) =>
        HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment)
      }
    }

    val columnNames = childSchema.map(f => verbose(f.name))

    // When user specified column names for view, we should create a project to do the renaming.
    // When no column name specified, we still need to create a project to declare the columns
    // we need, to make us more robust to top level `*`s.
    val projectList = if (tableDesc.schema == Nil) {
      columnNames.mkString(", ")
    } else {
      columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map {
        case (name, alias) => s"$name AS $alias"
      }.mkString(", ")
    }

    val viewName = verbose(tableDesc.name)

    val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName"

    tableDesc.copy(schema = schema, viewText = Some(expandedText))
  }

  // escape backtick with double-backtick in column name and wrap it with backtick.
  private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`"
}
Example 123
Source File: OrcFileOperator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.Logging
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveMetastoreTypes
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {

  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(path: String, conf: Option[Configuration]): StructType = {
    val reader = getFileReader(path, conf).getOrElse {
      throw new AnalysisException(
        s"Failed to discover schema from ORC files stored in $path. " +
          "Probably there are either no ORC files or only empty ORC files.")
    }
    val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
    val schema = readerInspector.getTypeName
    logDebug(s"Reading schema from file $path, got Hive schema string: $schema")
    HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
  }

  def getObjectInspector(
      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDir)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))

    if (paths == null || paths.isEmpty) {
      throw new IllegalArgumentException(
        s"orcFileOperator: path $path does not have valid orc files matching the pattern")
    }

    paths
  }
}
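readSchema above is the internal helper behind ORC schema inference, so end users normally meet this AnalysisException through the reader API when a path contains no usable ORC files. A minimal sketch (assuming a SparkSession named spark and an illustrative empty or schema-less directory; the exact message differs across Spark versions):

import org.apache.spark.sql.AnalysisException

try {
  spark.read.orc("/tmp/empty-orc-dir").printSchema()
} catch {
  case ae: AnalysisException => println(ae.getMessage)
}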
Example 124
Source File: DDLSourceLoadSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._

// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name - internal data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fluet da Bomb").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fluet da Bomb"))
  }

  test("data sources with the same name - internal data source/external data source") {
    assert(spark.read.format("datasource").load().schema ==
      StructType(Seq(StructField("longType", LongType, nullable = false))))
  }

  test("data sources with the same name - external data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fake external source").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fake external source"))
  }

  test("load data source from format alias") {
    assert(spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }
}

class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("integerType", IntegerType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceFour extends RelationProvider with DataSourceRegister {

  def shortName(): String = "datasource"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("longType", LongType, nullable = false)))
    }
}
Example 125
Source File: TestShowPartitions.scala From carbondata with Apache License 2.0 | 4 votes |
package org.apache.carbondata.spark.testsuite.partition

import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.spark.sql.test.util.QueryTest
import org.apache.carbondata.spark.exception.ProcessMetaDataException

class TestShowPartition extends QueryTest with BeforeAndAfterAll {

  override def beforeAll = {
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "dd-MM-yyyy")

    sql("drop table if exists notPartitionTable")
    sql("""
        | CREATE TABLE notPartitionTable
        | (
        | vin String,
        | logdate Timestamp,
        | phonenumber Int,
        | country String,
        | area String
        | )
        | STORED AS carbondata
      """.stripMargin)

    sql(s"CREATE DATABASE if not exists partitionDB")

    sql("DROP TABLE IF EXISTS hiveTable")
    sql("""
        | create table hiveTable(id int, name string) partitioned by (city string)
        | row format delimited fields terminated by ','
      """.stripMargin)
    sql("alter table hiveTable add partition (city = 'Hangzhou')")

    sql(s"CREATE DATABASE if not exists hiveDB")
    sql("DROP TABLE IF EXISTS hiveDB.hiveTable")
    sql("""
        | create table hiveDB.hiveTable(id int, name string) partitioned by (city string)
        | row format delimited fields terminated by ','
      """.stripMargin)
    sql("alter table hiveDB.hiveTable add partition (city = 'Shanghai')")
  }

  test("show partition table: exception when show not partition table") {
    val errorMessage = intercept[AnalysisException] {
      sql("show partitions notPartitionTable").show()
    }
    assert(errorMessage.getMessage.contains(
      "SHOW PARTITIONS is not allowed on a table that is not partitioned"))
  }

  test("show partition table: hive partition table") {
    // EqualTo
    checkAnswer(sql("show partitions hiveTable"), Seq(Row("city=Hangzhou")))
    sql("use hiveDB").show()
    checkAnswer(sql("show partitions hiveTable"), Seq(Row("city=Shanghai")))
    sql("use default").show()
  }

  override def afterAll = {
    sql("use default")
    sql("drop table if exists notPartitionTable")
    sql("drop table if exists hiveTable")
    try {
      sql("drop table if exists hiveDB.hiveTable")
    } catch {
      case ex: NoSuchDatabaseException => print(ex.getMessage())
    }
    sql("DROP DATABASE if exists partitionDB")
    sql("DROP DATABASE if exists hiveDB")
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
        CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
  }
}
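The first test above relies on Spark rejecting SHOW PARTITIONS for non-partitioned tables at analysis time; that behaviour is not CarbonData-specific. A short sketch (assuming a Hive-enabled SparkSession named spark and an existing non-partitioned table plain_table, both illustrative):

import org.apache.spark.sql.AnalysisException

try {
  spark.sql("SHOW PARTITIONS plain_table").show()
} catch {
  case ae: AnalysisException =>
    // Expected to state that SHOW PARTITIONS is not allowed on a table that is not partitioned.
    println(ae.getMessage)
}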