org.apache.spark.sql.AnalysisException Scala Examples

The following examples show how to use org.apache.spark.sql.AnalysisException. They are drawn from several open-source projects; the source file and originating project are noted above each example.
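Before the project examples, here is a minimal, self-contained sketch of the pattern most of them use: analysis failures surface as AnalysisException, which is either thrown with a descriptive message or caught so the message can be inspected (the test suites below typically use ScalaTest's intercept[AnalysisException]). The object name, query, and master setting in this sketch are illustrative only and do not come from the projects listed below.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object AnalysisExceptionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("analysis-exception-sketch")
      .master("local[1]")
      .getOrCreate()
    try {
      // Referencing a table that was never registered fails analysis,
      // which surfaces as an AnalysisException.
      spark.sql("SELECT * FROM table_that_does_not_exist")
    } catch {
      case ae: AnalysisException =>
        // getMessage carries the analyzer's description of the failure; line and
        // startPosition are populated when position information is available.
        println(s"Analysis failed: ${ae.getMessage}")
    } finally {
      spark.stop()
    }
  }
}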
Example 1
Source File: TextFileFormat.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration


  // Excerpt: the enclosing declarations from TextFileFormat.scala are omitted here; the
  // final unindented brace below closes that omitted scope.
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
} 
Example 2
Source File: JdbcRelationProvider.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if the table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
Example 3
Source File: CreateHiveTableAsSelectCommand.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat =
        tableDesc.withNewStorage(
          inputFormat =
            tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat =
            tableDesc.storage.outputFormat
              .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
          compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and then
    // add the relation into the catalog, in case a failure occurs while the
    // data is being processed.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = true, ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true,
            purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
Example 4
Source File: SparkSQLDriver.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.QueryExecution


private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.hiveResultString()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 5
Source File: randomExpressions.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom


@ExpressionDescription(
  usage = "_FUNC_(a) - Returns a random column with i.i.d. gaussian random distribution.")
case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to randn must be an integer literal.")
  })

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""", isNull = "false")
  }
} 
Example 6
Source File: TimeWindow.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.CalendarInterval

case class TimeWindow(
    timeColumn: Expression,
    windowDuration: Long,
    slideDuration: Long,
    startTime: Long) extends UnaryExpression
  with ImplicitCastInputTypes
  with Unevaluable
  with NonSQLExpression {

  //////////////////////////
  // SQL Constructors
  //////////////////////////

  def this(
      timeColumn: Expression,
      windowDuration: Expression,
      slideDuration: Expression,
      startTime: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), TimeWindow.parseExpression(startTime))
  }

  def this(timeColumn: Expression, windowDuration: Expression, slideDuration: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), 0)
  }

  def this(timeColumn: Expression, windowDuration: Expression) = {
    this(timeColumn, windowDuration, windowDuration)
  }

  override def child: Expression = timeColumn
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = new StructType()
    .add(StructField("start", TimestampType))
    .add(StructField("end", TimestampType))

  // This expression is replaced in the analyzer.
  override lazy val resolved = false

  // The remaining TimeWindow methods and the TimeWindow companion object (which defines the
  // parseExpression used by the constructors above) are omitted from this excerpt.
}


case class PreciseTimestamp(child: Expression) extends UnaryExpression with ExpectsInputTypes {
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = LongType
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code +
      s"""boolean ${ev.isNull} = ${eval.isNull};
         |${ctx.javaType(dataType)} ${ev.value} = ${eval.value};
       """.stripMargin)
  }
} 
Example 7
Source File: AnalysisTest.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SimpleCatalystConf(caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }

    if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
} 
Example 8
Source File: ResolveInlineTablesSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
Example 9
Source File: CartesianProductExec.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter


class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from the sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}


case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsInternal { iter =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition: (InternalRow) => Boolean =
          newPredicate(condition.get, left.output ++ right.output)
        val joined = new JoinedRow

        iter.filter { r =>
          boundCondition(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
} 
Example 10
Source File: VariableSubstitutionSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }

} 
Example 11
Source File: ResolvedDataSourceSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
Example 12
Source File: DDLSourceLoadSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// please note that the META-INF/services had to be modified for the test directory for this to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
} 
Example 13
Source File: ErrorHandlers.scala    From daf   with BSD 3-Clause "New" or "Revised" License
package daf.web

import java.io.FileNotFoundException
import java.lang.reflect.UndeclaredThrowableException

import daf.error.InvalidRequestException
import it.gov.daf.common.web.ErrorHandler
import org.apache.spark.sql.AnalysisException
import org.ietf.jgss.GSSException
import play.api.mvc.Results

object ErrorHandlers {

  val security: ErrorHandler = {
    case _: GSSException => Results.Unauthorized
  }

  val spark: ErrorHandler = {
    case _: FileNotFoundException => Results.NotFound
    case _: AnalysisException     => Results.NotFound
    case error: UndeclaredThrowableException if error.getUndeclaredThrowable.isInstanceOf[AnalysisException] => Results.NotFound
  }

  val api: ErrorHandler = {
    case error: InvalidRequestException => Results.BadRequest { error.getMessage }
  }

} 
Example 14
Source File: rules.scala    From tispark   with Apache License 2.0
package org.apache.spark.sql.extensions

import com.pingcap.tispark.statistics.StatisticsManager
import com.pingcap.tispark.utils.ReflectionUtil._
import com.pingcap.tispark.{MetaManager, TiDBRelation, TiTableReference}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.{EliminateSubqueryAliases, UnresolvedRelation}
import org.apache.spark.sql.catalyst.catalog.TiSessionCatalog
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.command._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.{AnalysisException, _}

case class TiResolutionRule(getOrCreateTiContext: SparkSession => TiContext)(
    sparkSession: SparkSession)
    extends Rule[LogicalPlan] {
  protected lazy val meta: MetaManager = tiContext.meta
  private lazy val autoLoad = tiContext.autoLoad
  private lazy val tiCatalog = tiContext.tiCatalog
  private lazy val tiSession = tiContext.tiSession
  private lazy val sqlContext = tiContext.sqlContext
  protected val tiContext: TiContext = getOrCreateTiContext(sparkSession)
  protected val resolveTiDBRelation: TableIdentifier => LogicalPlan =
    tableIdentifier => {
      val dbName = getDatabaseFromIdentifier(tableIdentifier)
      val tableName = tableIdentifier.table
      val table = meta.getTable(dbName, tableName)
      if (table.isEmpty) {
        throw new AnalysisException(s"Table or view '$tableName' not found in database '$dbName'")
      }
      if (autoLoad) {
        StatisticsManager.loadStatisticsInfo(table.get)
      }
      val sizeInBytes = StatisticsManager.estimateTableSize(table.get)
      val tiDBRelation =
        TiDBRelation(tiSession, TiTableReference(dbName, tableName, sizeInBytes), meta)(
          sqlContext)
      // Use SubqueryAlias so that projects and joins can correctly resolve
      // UnresolvedAttributes in JoinConditions, Projects, Filters, etc.
      newSubqueryAlias(tableName, LogicalRelation(tiDBRelation))
    }

  override def apply(plan: LogicalPlan): LogicalPlan =
    plan transformUp resolveTiDBRelations

  protected def resolveTiDBRelations: PartialFunction[LogicalPlan, LogicalPlan] = {
    case i @ InsertIntoTable(UnresolvedRelation(tableIdentifier), _, _, _, _)
        if tiCatalog
          .catalogOf(tableIdentifier.database)
          .exists(_.isInstanceOf[TiSessionCatalog]) =>
      i.copy(table = EliminateSubqueryAliases(resolveTiDBRelation(tableIdentifier)))
    case UnresolvedRelation(tableIdentifier)
        if tiCatalog
          .catalogOf(tableIdentifier.database)
          .exists(_.isInstanceOf[TiSessionCatalog]) =>
      resolveTiDBRelation(tableIdentifier)
  }

  private def getDatabaseFromIdentifier(tableIdentifier: TableIdentifier): String =
    tableIdentifier.database.getOrElse(tiCatalog.getCurrentDatabase)
}

case class TiDDLRule(getOrCreateTiContext: SparkSession => TiContext)(sparkSession: SparkSession)
    extends Rule[LogicalPlan] {
  protected lazy val tiContext: TiContext = getOrCreateTiContext(sparkSession)

  override def apply(plan: LogicalPlan): LogicalPlan =
    plan transformUp {
      // TODO: support other commands that may concern TiSpark catalog.
      case sd: ShowDatabasesCommand =>
        TiShowDatabasesCommand(tiContext, sd)
      case sd: SetDatabaseCommand =>
        TiSetDatabaseCommand(tiContext, sd)
      case st: ShowTablesCommand =>
        TiShowTablesCommand(tiContext, st)
      case st: ShowColumnsCommand =>
        TiShowColumnsCommand(tiContext, st)
      case dt: DescribeTableCommand =>
        TiDescribeTablesCommand(tiContext, dt)
      case dc: DescribeColumnCommand =>
        TiDescribeColumnCommand(tiContext, dc)
      case ct: CreateTableLikeCommand =>
        TiCreateTableLikeCommand(tiContext, ct)
    }
} 
Example 15
Source File: CreateHiveTableAsSelectCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO: ideally, we should get the output data ready first and then
      // add the relation into the catalog, in case a failure occurs while the
      // data is being processed.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
Example 16
Source File: TestHiveSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.test.{TestHiveSingleton, TestHiveSparkSession}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils


class TestHiveSuite extends TestHiveSingleton with SQLTestUtils {
  test("load test table based on case sensitivity") {
    val testHiveSparkSession = spark.asInstanceOf[TestHiveSparkSession]

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
      sql("SELECT * FROM SRC").queryExecution.analyzed
      assert(testHiveSparkSession.getLoadedTables.contains("src"))
      assert(testHiveSparkSession.getLoadedTables.size == 1)
    }
    testHiveSparkSession.reset()

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
      val err = intercept[AnalysisException] {
        sql("SELECT * FROM SRC").queryExecution.analyzed
      }
      assert(err.message.contains("Table or view not found"))
    }
    testHiveSparkSession.reset()
  }

  test("SPARK-15887: hive-site.xml should be loaded") {
    assert(hiveClient.getConf("hive.in.test", "") == "true")
  }
} 
Example 17
Source File: SparkSQLDriver.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}


private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) {
        execution.hiveResultString()
      }
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 18
Source File: HiveMetastoreLazyInitializationSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.hive

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.util.Utils

class HiveMetastoreLazyInitializationSuite extends SparkFunSuite {

  test("lazily initialize Hive client") {
    val spark = SparkSession.builder()
      .appName("HiveMetastoreLazyInitializationSuite")
      .master("local[2]")
      .enableHiveSupport()
      .config("spark.hadoop.hive.metastore.uris", "thrift://127.0.0.1:11111")
      .getOrCreate()
    val originalLevel = org.apache.log4j.Logger.getRootLogger().getLevel
    try {
      // Avoid outputting a lot of expected warning logs
      spark.sparkContext.setLogLevel("error")

      // We should be able to run Spark jobs without Hive client.
      assert(spark.sparkContext.range(0, 1).count() === 1)

      // We should be able to use Spark SQL if there are no table references.
      assert(spark.sql("select 1 + 1").count() === 1)
      assert(spark.range(0, 1).count() === 1)

      // We should be able to use fs
      val path = Utils.createTempDir()
      path.delete()
      try {
        spark.range(0, 1).write.parquet(path.getAbsolutePath)
        assert(spark.read.parquet(path.getAbsolutePath).count() === 1)
      } finally {
        Utils.deleteRecursively(path)
      }

      // Make sure that we are not using the local derby metastore.
      val exceptionString = Utils.exceptionString(intercept[AnalysisException] {
        spark.sql("show tables")
      })
      for (msg <- Seq(
        "show tables",
        "Could not connect to meta store",
        "org.apache.thrift.transport.TTransportException",
        "Connection refused")) {
        assert(exceptionString.contains(msg))
      }
    } finally {
      spark.sparkContext.setLogLevel(originalLevel.toString)
      spark.stop()
    }
  }
} 
Example 19
Source File: XSQLAnalyzeTableCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.execution.command.{CommandUtils, RunnableCommand}
import org.apache.spark.sql.xsql.XSQLSessionCatalog


case class XSQLAnalyzeTableCommand(tableIdent: TableIdentifier, noscan: Boolean = true)
  extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sessionState = sparkSession.sessionState
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val catalogDB = catalog.getUsedCatalogDatabase(tableIdent.dataSource, tableIdent.database)
    if (catalogDB.isEmpty) {
      return Seq.empty[Row]
    }
    val ds = catalogDB.get.dataSourceName
    val db = catalogDB.get.name
    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db), Some(ds))
    val tableMeta = catalog.getRawTable(tableIdentWithDB)
    if (tableMeta.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
    }

    // Compute stats for the whole table
    val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
    val newRowCount =
      if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

    // Update the metastore if the above statistics of the table are different from those
    // recorded in the metastore.
    val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
    if (newStats.isDefined) {
      catalog.alterTableStats(tableIdentWithDB, newStats)
    }

    Seq.empty[Row]
  }
} 
Example 20
Source File: XSQLCreateHiveTableAsSelectCommand.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog


case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO: ideally, we should get the output data ready first and then
      // add the relation into the catalog, in case a failure occurs while the
      // data is being processed.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
      s"InsertIntoHiveTable]"
  }
} 
Example 21
Source File: SchemaUtils.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType



// The enclosing object declaration is restored here for completeness; the original
// excerpt showed only this method.
object SchemaUtils {

  def checkColumnNameDuplication(
      columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new AnalysisException(
        s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }
} 
Example 22
Source File: HyperLogLogPlusPlus.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions.aggregate

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.HyperLogLogPlusPlusHelper
import org.apache.spark.sql.types._

// scalastyle:off

  // Excerpt: the surrounding HyperLogLogPlusPlus case class declaration and the rest of its
  // body are omitted; the unindented brace after this method closes that omitted class.
  override def eval(buffer: InternalRow): Any = {
    hllppHelper.query(buffer, mutableAggBufferOffset)
  }
}

object HyperLogLogPlusPlus {
  def validateDoubleLiteral(exp: Expression): Double = exp match {
    case Literal(d: Double, DoubleType) => d
    case Literal(dec: Decimal, _) => dec.toDouble
    case _ =>
      throw new AnalysisException("The second argument should be a double literal.")
  }
} 
Example 23
Source File: randomExpressions.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom


// scalastyle:off line.size.limit
@ExpressionDescription(
  usage = """_FUNC_([seed]) - Returns a random value with independent and identically distributed (i.i.d.) values drawn from the standard normal distribution.""",
  examples = """
    Examples:
      > SELECT _FUNC_();
       -0.3254147983080288
      > SELECT _FUNC_(0);
       1.1164209726833079
      > SELECT _FUNC_(null);
       1.1164209726833079
  """,
  note = "The function is non-deterministic in general case.")
// scalastyle:on line.size.limit
case class Randn(child: Expression) extends RDG with ExpressionWithRandomSeed {

  def this() = this(Literal(Utils.random.nextLong(), LongType))

  override def withNewSeed(seed: Long): Randn = Randn(Literal(seed, LongType))

  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = classOf[XORShiftRandom].getName
    val rngTerm = ctx.addMutableState(className, "rng")
    ctx.addPartitionInitializationStatement(
      s"$rngTerm = new $className(${seed}L + partitionIndex);")
    ev.copy(code = code"""
      final ${CodeGenerator.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""",
      isNull = FalseLiteral)
  }

  override def freshCopy(): Randn = Randn(child)
}

object Randn {
  def apply(seed: Long): Randn = Randn(Literal(seed, LongType))
} 
Example 24
Source File: TimeWindow.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.commons.lang3.StringUtils

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.TypeCheckFailure
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.CalendarInterval

case class TimeWindow(
    timeColumn: Expression,
    windowDuration: Long,
    slideDuration: Long,
    startTime: Long) extends UnaryExpression
  with ImplicitCastInputTypes
  with Unevaluable
  with NonSQLExpression {

  //////////////////////////
  // SQL Constructors
  //////////////////////////

  def this(
      timeColumn: Expression,
      windowDuration: Expression,
      slideDuration: Expression,
      startTime: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), TimeWindow.parseExpression(startTime))
  }

  def this(timeColumn: Expression, windowDuration: Expression, slideDuration: Expression) = {
    this(timeColumn, TimeWindow.parseExpression(windowDuration),
      TimeWindow.parseExpression(slideDuration), 0)
  }

  def this(timeColumn: Expression, windowDuration: Expression) = {
    this(timeColumn, windowDuration, windowDuration)
  }

  override def child: Expression = timeColumn
  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType)
  override def dataType: DataType = new StructType()
    .add(StructField("start", TimestampType))
    .add(StructField("end", TimestampType))

  // This expression is replaced in the analyzer.
  override lazy val resolved = false

  // The remaining TimeWindow methods and the TimeWindow companion object (which defines the
  // parseExpression used by the constructors above) are omitted from this excerpt.
}


case class PreciseTimestampConversion(
    child: Expression,
    fromType: DataType,
    toType: DataType) extends UnaryExpression with ExpectsInputTypes {
  override def inputTypes: Seq[AbstractDataType] = Seq(fromType)
  override def dataType: DataType = toType
  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code +
      code"""boolean ${ev.isNull} = ${eval.isNull};
         |${CodeGenerator.javaType(dataType)} ${ev.value} = ${eval.value};
       """.stripMargin)
  }
  override def nullSafeEval(input: Any): Any = input
} 
Example 25
Source File: view.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Attribute, Cast}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, View}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf


object EliminateView extends Rule[LogicalPlan] {
  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // The child should have the same output attributes with the View operator, so we simply
    // remove the View operator.
    case View(_, output, child) =>
      assert(output == child.output,
        s"The output of the child ${child.output.mkString("[", ",", "]")} is different from the " +
          s"view output ${output.mkString("[", ",", "]")}")
      child
  }
} 
Example 26
Source File: ResolveTableValuedFunctions.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}


      tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
          "numPartitions" -> IntegerType) {
        case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
          Range(start, end, step, Some(numPartitions))
      })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      // The whole resolution is somewhat difficult to understand here due to too many abstractions.
      // We should probably rewrite the following at some point. Reynold was just here to improve
      // error messages and didn't have time to do a proper rewrite.
      val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match {
        case Some(tvf) =>

          def failAnalysis(): Nothing = {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                 |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                 |cannot be applied to: ($argTypes)""".stripMargin)
          }

          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                try {
                  Some(resolver(casted.map(_.eval())))
                } catch {
                  case e: AnalysisException =>
                    failAnalysis()
                }
              case _ =>
                None
            }
          }
          resolved.headOption.getOrElse {
            failAnalysis()
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }

      // If alias names are assigned, add a `Project` with the aliases
      if (u.outputNames.nonEmpty) {
        val outputAttrs = resolvedFunc.output
        // Check that the number of aliases matches the number of output columns
        if (u.outputNames.size != outputAttrs.size) {
          u.failAnalysis(s"Number of given aliases does not match number of output columns. " +
            s"Function name: ${u.functionName}; number of aliases: " +
            s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.")
        }
        val aliases = outputAttrs.zip(u.outputNames).map {
          case (attr, name) => Alias(attr, name)()
        }
        Project(aliases, resolvedFunc)
      } else {
        resolvedFunc
      }
  }
} 
Example 27
Source File: StringUtils.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.util

import java.util.regex.{Pattern, PatternSyntaxException}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.unsafe.types.UTF8String

object StringUtils {

  /**
   * Filters `names` by a SQL "SHOW"-style pattern: sub-patterns are separated by '|',
   * '*' matches any character sequence, matching is case-insensitive, and sub-patterns
   * that are not valid regular expressions are silently ignored.
   */
  def filterPattern(names: Seq[String], pattern: String): Seq[String] = {
    val funcNames = scala.collection.mutable.SortedSet.empty[String]
    pattern.trim().split("\\|").foreach { subPattern =>
      try {
        val regex = ("(?i)" + subPattern.replaceAll("\\*", ".*")).r
        funcNames ++= names.filter{ name => regex.pattern.matcher(name).matches() }
      } catch {
        case _: PatternSyntaxException =>
      }
    }
    funcNames.toSeq
  }
} 
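
A quick usage sketch of the filter above (the function names are made up): sub-patterns are separated by '|', '*' acts as a wildcard, matching is case-insensitive, and the results come back sorted.

import org.apache.spark.sql.catalyst.util.StringUtils

val names = Seq("to_date", "to_timestamp", "current_date")
StringUtils.filterPattern(names, "to_*")             // Seq(to_date, to_timestamp)
StringUtils.filterPattern(names, "TO_DATE|current*") // Seq(current_date, to_date)
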
Example 28
Source File: SchemaUtilsSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.util

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType

class SchemaUtilsSuite extends SparkFunSuite {

  private def resolver(caseSensitiveAnalysis: Boolean): Resolver = {
    if (caseSensitiveAnalysis) {
      caseSensitiveResolution
    } else {
      caseInsensitiveResolution
    }
  }

  Seq((true, ("a", "a"), ("b", "b")), (false, ("a", "A"), ("b", "B"))).foreach {
      case (caseSensitive, (a0, a1), (b0, b1)) =>

    val testType = if (caseSensitive) "case-sensitive" else "case-insensitive"
    test(s"Check column name duplication in $testType cases") {
      def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = {
        val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " +
          duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ")
        val schema = StructType.fromDDL(schemaStr)
        var msg = intercept[AnalysisException] {
          SchemaUtils.checkSchemaColumnNameDuplication(
            schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
      }

      checkExceptionCases(s"$a0 INT, b INT, $a1 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, b INT, $a1 INT, $a0 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, $b0 INT, $a1 INT, $a0 INT, $b1 INT", b0 :: a0 :: Nil)
    }
  }

  test("Check no exception thrown for valid schemas") {
    def checkNoExceptionCases(schemaStr: String, caseSensitive: Boolean): Unit = {
      val schema = StructType.fromDDL(schemaStr)
      SchemaUtils.checkSchemaColumnNameDuplication(
        schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
    }

    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = true)
    checkNoExceptionCases("Aa INT, b INT, aA INT", caseSensitive = true)

    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = false)
  }
} 
Example 29
Source File: ResolveLambdaVariablesSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.types.{ArrayType, IntegerType}


class ResolveLambdaVariablesSuite extends PlanTest {
  import org.apache.spark.sql.catalyst.dsl.expressions._
  import org.apache.spark.sql.catalyst.dsl.plans._

  object Analyzer extends RuleExecutor[LogicalPlan] {
    val batches = Batch("Resolution", FixedPoint(4), ResolveLambdaVariables(conf)) :: Nil
  }

  private val key = 'key.int
  private val values1 = 'values1.array(IntegerType)
  private val values2 = 'values2.array(ArrayType(ArrayType(IntegerType)))
  private val data = LocalRelation(Seq(key, values1, values2))
  private val lvInt = NamedLambdaVariable("x", IntegerType, nullable = true)
  private val lvHiddenInt = NamedLambdaVariable("col0", IntegerType, nullable = true)
  private val lvArray = NamedLambdaVariable("x", ArrayType(IntegerType), nullable = true)

  private def plan(e: Expression): LogicalPlan = data.select(e.as("res"))

  private def checkExpression(e1: Expression, e2: Expression): Unit = {
    comparePlans(Analyzer.execute(plan(e1)), plan(e2))
  }

  private def lv(s: Symbol) = UnresolvedNamedLambdaVariable(Seq(s.name))

  test("resolution - no op") {
    checkExpression(key, key)
  }

  test("resolution - simple") {
    val in = ArrayTransform(values1, LambdaFunction(lv('x) + 1, lv('x) :: Nil))
    val out = ArrayTransform(values1, LambdaFunction(lvInt + 1, lvInt :: Nil))
    checkExpression(in, out)
  }

  test("resolution - nested") {
    val in = ArrayTransform(values2, LambdaFunction(
      ArrayTransform(lv('x), LambdaFunction(lv('x) + 1, lv('x) :: Nil)), lv('x) :: Nil))
    val out = ArrayTransform(values2, LambdaFunction(
      ArrayTransform(lvArray, LambdaFunction(lvInt + 1, lvInt :: Nil)), lvArray :: Nil))
    checkExpression(in, out)
  }

  test("resolution - hidden") {
    val in = ArrayTransform(values1, key)
    val out = ArrayTransform(values1, LambdaFunction(key, lvHiddenInt :: Nil, hidden = true))
    checkExpression(in, out)
  }

  test("fail - name collisions") {
    val p = plan(ArrayTransform(values1,
      LambdaFunction(lv('x) + lv('X), lv('x) :: lv('X) :: Nil)))
    val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage
    assert(msg.contains("arguments should not have names that are semantically the same"))
  }

  test("fail - lambda arguments") {
    val p = plan(ArrayTransform(values1,
      LambdaFunction(lv('x) + lv('y) + lv('z), lv('x) :: lv('y) :: lv('z) :: Nil)))
    val msg = intercept[AnalysisException](Analyzer.execute(p)).getMessage
    assert(msg.contains("does not match the number of arguments expected"))
  }
} 
Example 30
Source File: AnalysisTest.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import java.net.URI
import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.internal.SQLConf

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf)
    catalog.createDatabase(
      CatalogDatabase("default", "", new URI("loc"), Map.empty),
      ignoreIfExists = false)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    catalog.createTempView("TaBlE2", TestRelations.testRelation2, overrideIfExists = true)
    catalog.createTempView("TaBlE3", TestRelations.testRelation3, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.executeAndCheck(inputPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected override def comparePlans(
      plan1: LogicalPlan,
      plan2: LogicalPlan,
      checkAnalysis: Boolean = false): Unit = {
    // Analysis tests may have not been fully resolved, so skip checkAnalysis.
    super.comparePlans(plan1, plan2, checkAnalysis)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }

    if (!expectedErrors.map(_.toLowerCase(Locale.ROOT)).forall(
        e.getMessage.toLowerCase(Locale.ROOT).contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
} 
Example 31
Source File: ResolveInlineTablesSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}


class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
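
The inline tables resolved above typically come from a SQL VALUES list. A minimal sketch, assuming a SparkSession named spark (table and column names are illustrative):

spark.sql("SELECT * FROM VALUES (1, 'a'), (2, 'b') AS data(id, name)").show()
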
Example 32
Source File: ResolveSubquerySuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{InSubquery, ListQuery}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}


class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(
      InSubquery(Seq(a), ListQuery(Project(Seq(UnresolvedAttribute("a")), t2))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr))
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
} 
Example 33
Source File: CheckCartesianProductsSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import org.scalatest.Matchers._

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED

class CheckCartesianProductsSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil
  }

  val testRelation1 = LocalRelation('a.int, 'b.int)
  val testRelation2 = LocalRelation('c.int, 'd.int)

  val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter)
  val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists))

  test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "true") {
      noException should be thrownBy {
        for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) {
          performCartesianProductCheck(joinType)
        }
      }
    }
  }

  test("CheckCartesianProducts throws an exception for join types that require a join condition") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithRequiredCondition) {
        val thrownException = the [AnalysisException] thrownBy {
          performCartesianProductCheck(joinType)
        }
        assert(thrownException.message.contains("Detected implicit cartesian product"))
      }
    }
  }

  test("CheckCartesianProducts doesn't throw an exception if a join condition is present") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithRequiredCondition) {
        noException should be thrownBy {
          performCartesianProductCheck(joinType, Some('a === 'd))
        }
      }
    }
  }

  test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithoutRequiredCondition) {
        noException should be thrownBy {
          performCartesianProductCheck(joinType)
        }
      }
    }
  }

  private def performCartesianProductCheck(
      joinType: JoinType,
      condition: Option[Expression] = None): Unit = {
    val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze
    val optimizedPlan = Optimize.execute(analyzedPlan)
    comparePlans(analyzedPlan, optimizedPlan)
  }
} 
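
For reference, the exception checked above can be avoided by supplying a join condition, by writing an explicit cross join, or by enabling the configuration flag. A small sketch with hypothetical DataFrames left and right:

spark.conf.set("spark.sql.crossJoin.enabled", "true") // the CROSS_JOINS_ENABLED flag used above
val product = left.crossJoin(right)                   // explicit cartesian product
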
Example 34
Source File: JdbcRelationProvider.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val resolver = sqlContext.conf.resolver
    val timeZoneId = sqlContext.conf.sessionLocalTimeZone
    val schema = JDBCRelation.getSchema(resolver, jdbcOptions)
    val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions)
    JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JdbcOptionsInWrite(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table, options)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. " +
                s"SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
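
A minimal write-path sketch (connection details are placeholders): the SaveMode passed here selects one of the branches in createRelation above.

import org.apache.spark.sql.SaveMode

df.write
  .format("jdbc")
  .option("url", "jdbc:postgresql://host:5432/db")
  .option("dbtable", "public.people")
  .option("user", "user")
  .option("password", "secret")
  .mode(SaveMode.Append)
  .save()
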
Example 35
Source File: DataSourceUtils.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources

import org.apache.hadoop.fs.Path
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types._


object DataSourceUtils {

  /**
   * Verifies that every field of the given schema has a data type supported by the file
   * format, throwing an AnalysisException for the first unsupported field.
   */
  private def verifySchema(format: FileFormat, schema: StructType, isReadPath: Boolean): Unit = {
    schema.foreach { field =>
      if (!format.supportDataType(field.dataType, isReadPath)) {
        throw new AnalysisException(
          s"$format data source does not support ${field.dataType.catalogString} data type.")
      }
    }
  }

  // SPARK-24626: Metadata files and temporary files should not be
  // counted as data files, so that they shouldn't participate in tasks like
  // location size calculation.
  private[sql] def isDataPath(path: Path): Boolean = {
    val name = path.getName
    !(name.startsWith("_") || name.startsWith("."))
  }
} 
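
To illustrate the naming convention enforced above (isDataPath is private[sql], so this sketch simply mirrors its check on hypothetical paths):

import org.apache.hadoop.fs.Path

Seq("/t/part-00000.parquet", "/t/_SUCCESS", "/t/.part-00000.crc")
  .map(new Path(_))
  .map(p => !(p.getName.startsWith("_") || p.getName.startsWith(".")))
// -> List(true, false, false): only the first path counts as a data file
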
Example 36
Source File: AnalyzeTableCommand.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType



case class AnalyzeTableCommand(
    tableIdent: TableIdentifier,
    noscan: Boolean = true) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sessionState = sparkSession.sessionState
    val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase)
    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db))
    val tableMeta = sessionState.catalog.getTableMetadata(tableIdentWithDB)
    if (tableMeta.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
    }

    // Compute stats for the whole table
    val newTotalSize = CommandUtils.calculateTotalSize(sparkSession, tableMeta)
    val newRowCount =
      if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

    // Update the metastore if the above statistics of the table are different from those
    // recorded in the metastore.
    val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
    if (newStats.isDefined) {
      sessionState.catalog.alterTableStats(tableIdentWithDB, newStats)
    }

    Seq.empty[Row]
  }
} 
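
For context, the command above backs SQL statements like the following (the table name is hypothetical); the NOSCAN variant computes only the total size, while the full form also counts rows:

spark.sql("ANALYZE TABLE sales COMPUTE STATISTICS NOSCAN")
spark.sql("ANALYZE TABLE sales COMPUTE STATISTICS")
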
Example 37
Source File: RateStreamProvider.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.sources

import java.util.Optional

import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.streaming.continuous.RateStreamContinuousReader
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.sources.v2._
import org.apache.spark.sql.sources.v2.reader.streaming.{ContinuousReader, MicroBatchReader}
import org.apache.spark.sql.types._


  def valueAtSecond(seconds: Long, rowsPerSecond: Long, rampUpTimeSeconds: Long): Long = {
    // E.g., rampUpTimeSeconds = 4, rowsPerSecond = 10
    // Then speedDeltaPerSecond = 2
    //
    // seconds   = 0 1 2  3  4  5  6
    // speed     = 0 2 4  6  8 10 10 (speedDeltaPerSecond * seconds)
    // end value = 0 2 6 12 20 30 40 (0 + speedDeltaPerSecond * seconds) * (seconds + 1) / 2
    val speedDeltaPerSecond = rowsPerSecond / (rampUpTimeSeconds + 1)
    if (seconds <= rampUpTimeSeconds) {
      // Calculate "(0 + speedDeltaPerSecond * seconds) * (seconds + 1) / 2" in a special way to
      // avoid overflow
      if (seconds % 2 == 1) {
        (seconds + 1) / 2 * speedDeltaPerSecond * seconds
      } else {
        seconds / 2 * speedDeltaPerSecond * (seconds + 1)
      }
    } else {
      // rampUpPart is just a special case of the above formula: rampUpTimeSeconds == seconds
      val rampUpPart = valueAtSecond(rampUpTimeSeconds, rowsPerSecond, rampUpTimeSeconds)
      rampUpPart + (seconds - rampUpTimeSeconds) * rowsPerSecond
    }
  }
} 
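
A small sanity check of the ramp-up formula against the table in the comment above (rowsPerSecond = 10, rampUpTimeSeconds = 4), assuming the enclosing object is RateStreamProvider:

val expected = Seq(0L, 2L, 6L, 12L, 20L, 30L, 40L)
val actual = (0L to 6L).map { s =>
  RateStreamProvider.valueAtSecond(s, rowsPerSecond = 10, rampUpTimeSeconds = 4)
}
assert(actual == expected)
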
Example 38
Source File: JdbcUtilsSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.types._

class JdbcUtilsSuite extends SparkFunSuite {

  val tableSchema = StructType(Seq(
    StructField("C1", StringType, false), StructField("C2", IntegerType, false)))
  val caseSensitive = org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
  val caseInsensitive = org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution

  test("Parse user specified column types") {
    assert(JdbcUtils.getCustomSchema(tableSchema, null, caseInsensitive) === tableSchema)
    assert(JdbcUtils.getCustomSchema(tableSchema, "", caseInsensitive) === tableSchema)

    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseInsensitive) ===
      StructType(Seq(StructField("C1", DateType, false), StructField("C2", IntegerType, false))))
    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseSensitive) ===
      StructType(Seq(StructField("C1", StringType, false), StructField("C2", IntegerType, false))))

    assert(
      JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseInsensitive) ===
      StructType(Seq(StructField("C1", DateType, false), StructField("C2", StringType, false))))
    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseSensitive) ===
      StructType(Seq(StructField("C1", StringType, false), StructField("C2", StringType, false))))

    // Throw AnalysisException
    val duplicate = intercept[AnalysisException]{
      JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, c1 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c1", DateType, false), StructField("c1", StringType, false)))
    }
    assert(duplicate.getMessage.contains(
      "Found duplicate column(s) in the customSchema option value"))

    // Throw ParseException
    val dataTypeNotSupported = intercept[ParseException]{
      JdbcUtils.getCustomSchema(tableSchema, "c3 DATEE, C2 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false)))
    }
    assert(dataTypeNotSupported.getMessage.contains("DataType datee is not supported"))

    val mismatchedInput = intercept[ParseException]{
      JdbcUtils.getCustomSchema(tableSchema, "c3 DATE. C2 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false)))
    }
    assert(mismatchedInput.getMessage.contains("mismatched input '.' expecting"))
  }
} 
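
The customSchema option exercised above is supplied on the read path roughly like this (connection details are placeholders):

val people = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://host:5432/db")
  .option("dbtable", "public.people")
  .option("customSchema", "C1 DATE, C2 STRING")
  .load()
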
Example 39
Source File: QueryExecutionSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
import org.apache.spark.sql.test.SharedSQLContext

class QueryExecutionSuite extends SharedSQLContext {
  test("toString() exception/error handling") {
    spark.experimental.extraStrategies = Seq(
        new SparkStrategy {
          override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
        })

    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation())

    // Nothing!
    assert(qe.toString.contains("OneRowRelation"))

    // Throw an AnalysisException - this should be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new AnalysisException("exception")
      })
    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))

    // Throw an Error - this should not be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new Error("error")
      })
    val error = intercept[Error](qe.toString)
    assert(error.getMessage.contains("error"))
  }
} 
Example 40
Source File: ExecutorSideSQLConfSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.execution.debug.codegenStringSeq
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.test.SQLTestUtils

class ExecutorSideSQLConfSuite extends SparkFunSuite with SQLTestUtils {
  import testImplicits._

  protected var spark: SparkSession = null

  // Create a new [[SparkSession]] running in local-cluster mode.
  override def beforeAll(): Unit = {
    super.beforeAll()
    spark = SparkSession.builder()
      .master("local-cluster[2,1,1024]")
      .appName("testing")
      .getOrCreate()
  }

  override def afterAll(): Unit = {
    spark.stop()
    spark = null
  }

  override def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = {
    pairs.foreach { case (k, v) =>
      SQLConf.get.setConfString(k, v)
    }
    try f finally {
      pairs.foreach { case (k, _) =>
        SQLConf.get.unsetConf(k)
      }
    }
  }

  test("ReadOnlySQLConf is correctly created at the executor side") {
    withSQLConf("spark.sql.x" -> "a") {
      val checks = spark.range(10).mapPartitions { _ =>
        val conf = SQLConf.get
        Iterator(conf.isInstanceOf[ReadOnlySQLConf] && conf.getConfString("spark.sql.x") == "a")
      }.collect()
      assert(checks.forall(_ == true))
    }
  }

  test("case-sensitive config should work for json schema inference") {
    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
      withTempPath { path =>
        val pathString = path.getCanonicalPath
        spark.range(10).select('id.as("ID")).write.json(pathString)
        spark.range(10).write.mode("append").json(pathString)
        assert(spark.read.json(pathString).columns.toSet == Set("id", "ID"))
      }
    }
  }

  test("SPARK-24727 CODEGEN_CACHE_MAX_ENTRIES is correctly referenced at the executor side") {
    withSQLConf(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key -> "300") {
      val checks = spark.range(10).mapPartitions { _ =>
        val conf = SQLConf.get
        Iterator(conf.isInstanceOf[ReadOnlySQLConf] &&
          conf.getConfString(StaticSQLConf.CODEGEN_CACHE_MAX_ENTRIES.key) == "300")
      }.collect()
      assert(checks.forall(_ == true))
    }
  }

  test("SPARK-22219: refactor to control to generate comment") {
    Seq(true, false).foreach { flag =>
      withSQLConf(StaticSQLConf.CODEGEN_COMMENTS.key -> flag.toString) {
        val res = codegenStringSeq(spark.range(10).groupBy(col("id") * 2).count()
          .queryExecution.executedPlan)
        assert(res.length == 2)
        assert(res.forall { case (_, code) =>
          (code.contains("* Codegend pipeline") == flag) &&
            (code.contains("// input[") == flag)
        })
      }
    }
  }
} 
Example 41
Source File: VariableSubstitutionSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }

} 
Example 42
Source File: ResolvedDataSourceSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("avro: show deploy guide for loading the external avro module") {
    Seq("avro", "org.apache.spark.sql.avro").foreach { provider =>
      val message = intercept[AnalysisException] {
        getProvidingClass(provider)
      }.getMessage
      assert(message.contains(s"Failed to find data source: $provider"))
      assert(message.contains("Please deploy the application as per the deployment section of"))
    }
  }

  test("kafka: show deploy guide for loading the external kafka module") {
    val message = intercept[AnalysisException] {
      getProvidingClass("kafka")
    }.getMessage
    assert(message.contains("Failed to find data source: kafka"))
    assert(message.contains("Please deploy the application as per the deployment section of"))
  }

  test("error message for unknown data sources") {
    val error = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
Example 43
Source File: DDLSourceLoadSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._


// Note that the META-INF/services file in the test directory had to be modified for this to work.
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name - internal data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fluet da Bomb").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fluet da Bomb"))
  }

  test("data sources with the same name - internal data source/external data source") {
    assert(spark.read.format("datasource").load().schema ==
      StructType(Seq(StructField("longType", LongType, nullable = false))))
  }

  test("data sources with the same name - external data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fake external source").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fake external source"))
  }

  test("load data source from format alias") {
    assert(spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("integerType", IntegerType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceFour extends RelationProvider with DataSourceRegister {

  def shortName(): String = "datasource"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("longType", LongType, nullable = false)))
    }
} 
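
As the note at the top of this example says, short names are discovered through Java's ServiceLoader. A data source jar advertises its providers in a services file such as the following (entries shown for the fake sources above):

META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
  org.apache.spark.sql.sources.FakeSourceOne
  org.apache.spark.sql.sources.FakeSourceTwo
  org.apache.spark.sql.sources.FakeSourceThree
  org.apache.spark.sql.sources.FakeSourceFour
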
Example 44
Source File: AnnotationParser.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.parser

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{AnnotationReference, Expression, Literal}
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String


  protected def toTableMetadata(metadata: Map[String, Expression]): Metadata = {
    val res = new MetadataBuilder()
    metadata.foreach {
      case (k, v: Literal) =>
        v.dataType match {
          case StringType =>
            if (k.equals("?")) {
              sys.error("column metadata key cannot be ?")
            }
            if (k.equals("*")) {
              sys.error("column metadata key cannot be *")
            }
            res.putString(k, v.value.asInstanceOf[UTF8String].toString)
          case LongType => res.putLong(k, v.value.asInstanceOf[Long])
          case DoubleType => res.putDouble(k, v.value.asInstanceOf[Double])
          case NullType =>
            res.putString(k, null)
          case a: ArrayType => res.putString(k, v.value.toString)
        }
      case (k, v: AnnotationReference) =>
        sys.error("column metadata cannot have a reference to another column metadata")
    }
    res.build()
  }
} 
Example 45
Source File: ResolveDropCommand.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.CaseSensitivityUtils._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources.commands.UnresolvedDropCommand
import org.apache.spark.sql.sources.{DropRelation, RelationKind, Table}

import scala.util.Try


case class ResolveDropCommand(analyzer: Analyzer, catalog: Catalog)
  extends Rule[LogicalPlan]
  with TableDependencyCalculator {

  private def failAnalysis(reason: String) = throw new AnalysisException(reason)

  override def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case UnresolvedDropCommand(kind, allowNotExisting, tableIdent, cascade) =>
      val plan = resolvePlan(kind, tableIdent, allowNotExisting)

      val affected = plan.map { lp =>
        val targetKind = RelationKind.kindOf(lp).getOrElse(Table)
        checkValidKind(kind, tableIdent, targetKind)
        buildDependentsMap(catalog, tableIdent)
      }

      affected.foreach(checkAllowedToDrop(cascade))
      DropRunnableCommand(affected.getOrElse(Map.empty))
  }

  private def getDropRelation(plan: LogicalPlan): Option[DropRelation] = plan.collectFirst {
    case d: LogicalPlan with DropRelation => d
    case LogicalRelation(d: DropRelation, _) => d
  }

  private def resolvePlan(kind: DropTarget,
                          tableIdent: TableIdentifier,
                          allowNotExisting: Boolean): Option[LogicalPlan] = {
    Try(catalog.lookupRelation(tableIdent)).toOption match {
      case Some(plan) => Some(plan)
      case None if allowNotExisting => None
      case None => failAnalysis(
        s"""${kind.targetName.toLowerCase} ${tableIdent.unquotedString} does not exist. To
          |DROP a ${kind.targetName.toLowerCase} regardless of whether it exists or not, use
          |DROP ${kind.targetName.toUpperCase} IF EXISTS.""".stripMargin)
    }
  }

  private def checkAllowedToDrop(cascade: Boolean)
                                (dependents: Map[String, Option[DropRelation]]) = {
    if (dependents.size > 1 && !cascade) {
      failAnalysis("Can not drop because more than one relation has " +
        s"references to the target relation: ${dependents.keys.mkString(",")}. " +
        s"to force drop use 'CASCADE'.")
    }
  }

  private def checkValidKind(kind: DropTarget,
                             tableIdent: TableIdentifier,
                             targetKind: RelationKind): Unit = {
    if (!kind.accepts(targetKind)) {
      failAnalysis(
        s"Relation '${tableIdent.unquotedString}' of kind " +
        s"$targetKind is not a ${kind.targetName}. " +
        s"Please use DROP ${targetKind.name.toUpperCase()} to drop it.")
    }
  }

  private def buildDependentsMap(catalog: Catalog,
                                 identifier: TableIdentifier): Map[String, Option[DropRelation]] = {
    val tables = getTables(catalog, identifier.database)
    val tablesAndDependents = buildDependentsMap(tables)

    def aggregate(acc: Set[TableIdentifier],
                  next: List[TableIdentifier]): Set[TableIdentifier] = next match {
      case Nil => acc
      case ident :: rest =>
        val dependents = tablesAndDependents(ident)
        aggregate(acc ++ dependents, rest ++ dependents.diff(acc))
    }

    val dependentsSet = aggregate(Set(identifier), identifier :: Nil)
    dependentsSet.flatMap { dependent =>
      tables.get(dependent).map(dependent.table -> getDropRelation(_))
    }.toMap
  }
} 
Example 46
Source File: ResolveAppendCommand.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.{AppendRunnableCommand, LogicalRelation}
import org.apache.spark.sql.sources.AppendRelation
import org.apache.spark.sql.sources.commands.UnresolvedAppendCommand

case class ResolveAppendCommand(analyzer: Analyzer) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    case UnresolvedAppendCommand(table, options) =>
      val resolvedTable = analyzer.execute(table)
      resolvedTable.collectFirst {
        case LogicalRelation(appendRelation: AppendRelation, _) =>
          AppendRunnableCommand(appendRelation, options)
      }.getOrElse {
        throw new AnalysisException(s"Cannot append ${resolvedTable.treeString}")
      }
  }
} 
Example 47
Source File: ResolveInferSchemaCommand.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.sources.commands.{InferSchemaCommand, Orc, Parquet, UnresolvedInferSchemaCommand}

import scala.util.Try


case class ResolveInferSchemaCommand(sqlContext: SQLContext) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = plan.transform {
    case UnresolvedInferSchemaCommand(path, explicitFileType) =>
      val fileType = explicitFileType.getOrElse(path.toLowerCase match {
        case p if p.endsWith(".orc") => Orc
        case p if p.endsWith(".parquet") => Parquet
        case invalid =>
          throw new AnalysisException(s"Could not determine file format of '$path'")
      })
      InferSchemaCommand(path, fileType)
  }
} 
Example 48
Source File: ResolveTableFunctions.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.tablefunctions.UnresolvedTableFunction
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule

case class ResolveTableFunctions(
    analyzer: Analyzer, registry: TableFunctionRegistry = TableFunctionRegistry)
  extends Rule[LogicalPlan] {

  def apply(plan: LogicalPlan): LogicalPlan = plan transformDown {
    case UnresolvedTableFunction(name, arguments) =>
      val lookup = registry.lookupFunction(name)
      lookup match {
        case Some(f) =>
          val analyzed = f.analyze(analyzer, arguments)
          ResolvedTableFunction(f, analyzed)
        case None =>
          throw new AnalysisException(s"Undefined function $name")
      }
  }
} 
Example 49
Source File: HiveEmulationSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.sql.catalyst.analysis.NoSuchTableException
import org.apache.spark.sql.hive.SapHiveContext
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{AnalysisException, GlobalSapSQLContext, Row, SapSQLConf}
import org.apache.spark.util.DummyRelationUtils._
import org.apache.spark.util.SqlContextConfigurationUtils
import org.scalatest.FunSuite

class HiveEmulationSuite
  extends FunSuite
  with GlobalSapSQLContext
  with SqlContextConfigurationUtils {

  private def createTable(name: String, schema: StructType): Unit =
    sqlc.createDataFrame(sc.parallelize(Seq.empty[Row]), schema).registerTempTable(name)

  private def withHiveEmulation[A](op: => A): A =
    withConf(SapSQLConf.HIVE_EMULATION.key, "true")(op)

  test("Show schemas shows a default schema when hive emulation is on") {
    withHiveEmulation {
      val values = sqlc.sql("SHOW SCHEMAS").collect().toSet

      assertResult(Set(Row("default")))(values)
    }
  }

  test("Show schemas throws if hive emulation is off") {
    intercept[RuntimeException](sqlc.sql("SHOW SCHEMAS"))
  }

  test("Desc an existing table") {
    withHiveEmulation {
      createTable("foo", StructType('a.int :: 'b.int :: Nil))
      val values = sqlc.sql("DESC foo").collect().toSet

      assertResult(
        Set(
          Row("a", "int", null),
          Row("b", "int", null)))(values)
    }
  }

  test("Desc a non-existent table throws") {
    withHiveEmulation {
      intercept[NoSuchTableException] {
        sqlc.sql("DESC bar").collect()
      }
    }
  }

  test("Describe an existing table") {
    withHiveEmulation {
      createTable("foo", StructType('a.int :: 'b.int :: Nil))
      val values = sqlc.sql("DESCRIBE FORMATTED foo").collect().toList
      assertResult(
        List(
          Row(s"# col_name${" " * 12}\tdata_type${" " * 11}\tcomment${" " * 13}\t"),
          Row(""),
          Row(s"a${" " * 19}\tint${" " * 17}\tnull${" " * 16}\t"),
          Row(s"b${" " * 19}\tint${" " * 17}\tnull${" " * 16}\t")))(values)
    }
  }

  test("Retrieval of a database prefixed table") {
    val hc = new SapHiveContext(sc)
    hc.setConf(SapSQLConf.HIVE_EMULATION, true)
    val expected = Set(Row(0, 0), Row(0, 1), Row(1, 0), Row(1, 1))
    val rdd = hc.sparkContext.parallelize(expected.toSeq)
    hc.createDataFrame(rdd, StructType('a.int :: 'b.int :: Nil)).registerTempTable("foo")

    val results = hc.sql("SELECT * FROM default.foo").collect().toSet
    assertResult(expected)(results)

    hc.setConf(SapSQLConf.HIVE_EMULATION, false)
    intercept[AnalysisException] {
      hc.sql("SELECT * FROM default.foo")
    }
  }

  test("USE statements should not do anything when in hive emulation mode") {
    withConf(SapSQLConf.HIVE_EMULATION.key, "true") {
      sqlc.sql("USE foo bar")
    }
  }

  test("Any other use command should throw an exception") {
    intercept[RuntimeException] {
      sqlc.sql("USE foo bar")
    }
  }
} 
Example 50
Source File: InferSchemaCommandSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import java.io.FileNotFoundException

import org.apache.spark.sql.{AnalysisException, GlobalSapSQLContext, Row}
import org.scalatest.FunSuite
import com.sap.spark.util.TestUtils.{getFileFromClassPath, withTempDirectory}

class InferSchemaCommandSuite extends FunSuite with GlobalSapSQLContext {
  test("Inferring of schema fails on non-existent file") {
    withTempDirectory { dir =>
      val nonExistentPath = dir.path + "/non-existent"

      intercept[FileNotFoundException] {
        sqlc.sql(s"""INFER SCHEMA OF "$nonExistentPath" AS ORC""").collect()
      }
    }
  }

  // scalastyle:off magic.number
  test("Inferring of schema works on parquet file") {
    val personFile = getFileFromClassPath("/pers.parquet")

    val result = sqlc.sql(s"""INFER SCHEMA OF "$personFile"""").collect().toSet

    assertResult(
      Set(
        Row("name", 1, true, "VARCHAR(*)", null, null, null),
        Row("age", 2, true, "INTEGER", 32, 2, 0)))(result)
  }

  test("Inferring of schema works on orc file") {
    val personFile = getFileFromClassPath("/pers.orc")

    val result = sqlc.sql(s"""INFER SCHEMA OF "$personFile"""").collect().toSet

    assertResult(
      Set(
        Row("name", 1, true, "VARCHAR(*)", null, null, null),
        Row("age", 2, true, "INTEGER", 32, 2, 0)))(result)
  }
  // scalastyle:on magic.number

  test("Inferring of schema fails on invalid file") {
    val invalidFile = getFileFromClassPath("/simple.csv")

    intercept[AnalysisException] {
      sqlc.sql(s"""INFER SCHEMA OF "$invalidFile"""")
    }
  }
} 
Example 51
Source File: ResolveTableFunctionsSuite.scala    From HANAVora-Extensions   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.tablefunctions.UnresolvedTableFunction
import org.mockito.Mockito._
import org.scalatest.FunSuite

class ResolveTableFunctionsSuite extends FunSuite {

  test("resolution of previously registered function") {
    val tf = mock(classOf[TableFunction])
    val analyzer = mock(classOf[Analyzer])
    val registry = new SimpleTableFunctionRegistry()
    val strategy = ResolveTableFunctions(analyzer, registry)
    registry.registerFunction("foo", tf)
    val unresolved = UnresolvedTableFunction("foo", Seq.empty)
    when(tf.analyze(analyzer, Seq.empty)) thenReturn Seq.empty

    val resolved = strategy.apply(unresolved)

    assert(resolved == ResolvedTableFunction(tf, Seq.empty))
    verify(tf).analyze(analyzer, Seq.empty)
  }

  test("fail on unregistered functions") {
    val analyzer = mock(classOf[Analyzer])
    val strategy = ResolveTableFunctions(analyzer)
    val unresolved = UnresolvedTableFunction("foo", Seq.empty)

    intercept[AnalysisException] {
      strategy(unresolved)
    }
  }
} 
Example 52
Source File: EventHubsWriter.scala    From azure-event-hubs-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.eventhubs

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{ AnalysisException, SparkSession }
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.types.{ BinaryType, StringType }
import org.apache.spark.util.Utils


private[eventhubs] object EventHubsWriter extends Logging {

  val BodyAttributeName = "body"
  val PartitionKeyAttributeName = "partitionKey"
  val PartitionIdAttributeName = "partition"
  val PropertiesAttributeName = "properties"

  override def toString: String = "EventHubsWriter"

  private def validateQuery(schema: Seq[Attribute], parameters: Map[String, String]): Unit = {
    schema
      .find(_.name == BodyAttributeName)
      .getOrElse(
        throw new AnalysisException(s"Required attribute '$BodyAttributeName' not found.")
      )
      .dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(
          s"$BodyAttributeName attribute type " +
            s"must be a String or BinaryType.")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      parameters: Map[String, String]
  ): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, parameters)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new EventHubsWriteTask(parameters, schema)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close()
      )
    }
  }
} 
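
The validateQuery check above only demands a 'body' attribute whose type is StringType or BinaryType; the other recognized columns ('partitionKey', 'partition', 'properties') are optional. Below is a minimal, hypothetical sketch (not part of the connector source) that shapes a DataFrame so it would satisfy that check, using only core Spark APIs; no Event Hubs connection is made here.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object BodyColumnSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("body-column-sketch").getOrCreate()
    import spark.implicits._

    // 'body' as StringType: accepted by validateQuery.
    val asString = Seq("event-1", "event-2").toDF("body")

    // 'body' as BinaryType: also accepted; any other type triggers the AnalysisException above.
    val asBinary = asString.select(col("body").cast("binary").as("body"))

    asString.printSchema()
    asBinary.printSchema()
    spark.stop()
  }
}
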
Example 53
Source File: OapQuerySuite.scala    From OAP   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import java.util.{Locale, TimeZone}

import org.scalatest.{BeforeAndAfter, Ignore}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveUtils
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.internal.SQLConf

// Ignored because running this suite in a separate package hits problems with the shaded Spark source.
@Ignore
class OapQuerySuite extends HiveComparisonTest with BeforeAndAfter  {
  private lazy val originalTimeZone = TimeZone.getDefault
  private lazy val originalLocale = Locale.getDefault
  import org.apache.spark.sql.hive.test.TestHive._

  // Note: invoking TestHive creates a SparkContext that we cannot configure.
  // Be careful: this may affect the SparkContext currently in use and cause strange problems.
  private lazy val originalCrossJoinEnabled = TestHive.conf.crossJoinEnabled

  override def beforeAll() {
    super.beforeAll()
    TestHive.setCacheTables(true)
    // Timezone is fixed to America/Los_Angeles for those timezone sensitive tests (timestamp_*)
    TimeZone.setDefault(TimeZone.getTimeZone("America/Los_Angeles"))
    // Add Locale setting
    Locale.setDefault(Locale.US)
    // Ensures that cross joins are enabled so that we can test them
    TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, true)
    TestHive.setConf(HiveUtils.CONVERT_METASTORE_PARQUET, true)
  }

  override def afterAll() {
    try {
      TestHive.setCacheTables(false)
      TimeZone.setDefault(originalTimeZone)
      Locale.setDefault(originalLocale)
      sql("DROP TEMPORARY FUNCTION IF EXISTS udtf_count2")
      TestHive.setConf(SQLConf.CROSS_JOINS_ENABLED, originalCrossJoinEnabled)
    } finally {
      super.afterAll()
    }
  }
  private def assertDupIndex(body: => Unit): Unit = {
    val e = intercept[AnalysisException] { body }
    assert(e.getMessage.toLowerCase.contains("exists"))
  }

  test("create hive table in parquet format") {
    try {
      sql("create table p_table (key int, val string) stored as parquet")
      sql("insert overwrite table p_table select * from src")
      sql("create oindex if not exists p_index on p_table(key)")
      assert(sql("select val from p_table where key = 238")
        .collect().head.getString(0) == "val_238")
    } finally {
      sql("drop oindex p_index on p_table")
      sql("drop table p_table")
    }
  }

  test("create duplicate hive table in parquet format") {
    try {
      sql("create table p_table1 (key int, val string) stored as parquet")
      sql("insert overwrite table p_table1 select * from src")
      sql("create oindex p_index on p_table1(key)")
      assertDupIndex { sql("create oindex p_index on p_table1(key)") }
    } finally {
      sql("drop oindex p_index on p_table1")
    }
  }
} 
Example 54
Source File: CreateHiveTableAsSelectCommand.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat =
        tableDesc.withNewStorage(
          inputFormat =
            tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat =
            tableDesc.storage.outputFormat
              .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
          compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying a column list for the target table in CTAS,
        // but we don't think Spark SQL should follow that restriction.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and then
    // add the relation into the catalog, in case a failure occurs while
    // the data is being processed.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // the table already exists; do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = OverwriteOptions(true),
          ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true,
            purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
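
This command backs Hive-style CREATE TABLE ... AS SELECT statements. A small sketch of how the "already exists" branch could be hit from SQL follows; it assumes a Spark build with Hive support and a working metastore, and the table name is made up for illustration.

import org.apache.spark.sql.{AnalysisException, SparkSession}

object CtasSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("ctas-sketch")
      .enableHiveSupport() // requires Hive classes on the classpath
      .getOrCreate()

    spark.sql("CREATE TABLE ctas_target AS SELECT 1 AS id")

    // Re-running the same CTAS (without IF NOT EXISTS) is expected to fail with
    // an AnalysisException reporting that the table already exists.
    try {
      spark.sql("CREATE TABLE ctas_target AS SELECT 1 AS id")
    } catch {
      case e: AnalysisException => println(s"Rejected as expected: ${e.getMessage}")
    }
    spark.stop()
  }
}
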
Example 55
Source File: SparkSQLDriver.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.QueryExecution


private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.hiveResultString()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 56
Source File: AnalysisTest.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SimpleCatalystConf(caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }

    if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
} 
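
As a hypothetical usage sketch (not part of the source above), a suite extending this trait could feed an unresolved plan to assertAnalysisError; the table name and the expected message fragment are assumptions based on the analyzer's usual "Table or view not found" wording.

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.TableIdentifier

class AnalysisErrorUsageSuite extends AnalysisTest {
  test("unresolved relation reports a missing table") {
    // "no_such_table" is never registered in the catalog built by makeAnalyzer,
    // so checkAnalysis should raise an AnalysisException mentioning it.
    val plan = UnresolvedRelation(TableIdentifier("no_such_table"))
    assertAnalysisError(plan, Seq("table or view not found"))
  }
}
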
Example 57
Source File: ResolveInlineTablesSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
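
For context, inline tables reach this rule through the SQL VALUES syntax. A quick sketch (plain Spark SQL, not from the source above) where a foldable VALUES list resolves and a nondeterministic one is rejected, mirroring the "validate inputs are foldable" test:

import org.apache.spark.sql.{AnalysisException, SparkSession}

object InlineTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("inline-table-sketch").getOrCreate()

    // Foldable literals: resolved by ResolveInlineTables into a LocalRelation.
    spark.sql("SELECT * FROM VALUES (1, 'a'), (2, 'b') AS t(c1, c2)").show()

    // Nondeterministic expressions are rejected during validation, as in the test above.
    try {
      spark.sql("SELECT * FROM VALUES (rand(1)) AS t(c1)").show()
    } catch {
      case e: AnalysisException => println(s"Rejected as expected: ${e.getMessage}")
    }
    spark.stop()
  }
}
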
Example 58
Source File: ResolveSubquerySuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}


class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.ResolveSubquery(expr)
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
} 
Example 59
Source File: CartesianProductExec.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.joins

import org.apache.spark._
import org.apache.spark.rdd.{CartesianPartition, CartesianRDD, RDD}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, JoinedRow, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeRowJoiner
import org.apache.spark.sql.execution.{BinaryExecNode, SparkPlan}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.util.CompletionIterator
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter


class UnsafeCartesianRDD(left : RDD[UnsafeRow], right : RDD[UnsafeRow], numFieldsOfRight: Int)
  extends CartesianRDD[UnsafeRow, UnsafeRow](left.sparkContext, left, right) {

  override def compute(split: Partition, context: TaskContext): Iterator[(UnsafeRow, UnsafeRow)] = {
    // We will not sort the rows, so prefixComparator and recordComparator are null.
    val sorter = UnsafeExternalSorter.create(
      context.taskMemoryManager(),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      context,
      null,
      null,
      1024,
      SparkEnv.get.memoryManager.pageSizeBytes,
      SparkEnv.get.conf.getLong("spark.shuffle.spill.numElementsForceSpillThreshold",
        UnsafeExternalSorter.DEFAULT_NUM_ELEMENTS_FOR_SPILL_THRESHOLD),
      false)

    val partition = split.asInstanceOf[CartesianPartition]
    for (y <- rdd2.iterator(partition.s2, context)) {
      sorter.insertRecord(y.getBaseObject, y.getBaseOffset, y.getSizeInBytes, 0, false)
    }

    // Create an iterator from the sorter and wrap it as Iterator[UnsafeRow]
    def createIter(): Iterator[UnsafeRow] = {
      val iter = sorter.getIterator
      val unsafeRow = new UnsafeRow(numFieldsOfRight)
      new Iterator[UnsafeRow] {
        override def hasNext: Boolean = {
          iter.hasNext
        }
        override def next(): UnsafeRow = {
          iter.loadNext()
          unsafeRow.pointTo(iter.getBaseObject, iter.getBaseOffset, iter.getRecordLength)
          unsafeRow
        }
      }
    }

    val resultIter =
      for (x <- rdd1.iterator(partition.s1, context);
           y <- createIter()) yield (x, y)
    CompletionIterator[(UnsafeRow, UnsafeRow), Iterator[(UnsafeRow, UnsafeRow)]](
      resultIter, sorter.cleanupResources())
  }
}


case class CartesianProductExec(
    left: SparkPlan,
    right: SparkPlan,
    condition: Option[Expression]) extends BinaryExecNode {
  override def output: Seq[Attribute] = left.output ++ right.output

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")

    val leftResults = left.execute().asInstanceOf[RDD[UnsafeRow]]
    val rightResults = right.execute().asInstanceOf[RDD[UnsafeRow]]

    val pair = new UnsafeCartesianRDD(leftResults, rightResults, right.output.size)
    pair.mapPartitionsWithIndexInternal { (index, iter) =>
      val joiner = GenerateUnsafeRowJoiner.create(left.schema, right.schema)
      val filtered = if (condition.isDefined) {
        val boundCondition = newPredicate(condition.get, left.output ++ right.output)
        boundCondition.initialize(index)
        val joined = new JoinedRow

        iter.filter { r =>
          boundCondition.eval(joined(r._1, r._2))
        }
      } else {
        iter
      }
      filtered.map { r =>
        numOutputRows += 1
        joiner.join(r._1, r._2)
      }
    }
  }
} 
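
A short sketch (not from the source above) of how a query ends up planned as a cartesian product: an explicit crossJoin, or an unconditioned join with spark.sql.crossJoin.enabled set, produces a physical plan containing CartesianProductExec.

import org.apache.spark.sql.SparkSession

object CrossJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("cross-join-sketch")
      // Without this (or the explicit crossJoin API) Spark rejects implicit cartesian products.
      .config("spark.sql.crossJoin.enabled", "true")
      .getOrCreate()
    import spark.implicits._

    val left = Seq(1, 2).toDF("a")
    val right = Seq("x", "y").toDF("b")

    // Explicit cross join; the physical plan contains CartesianProductExec.
    val crossed = left.crossJoin(right)
    crossed.explain()
    crossed.show()
    spark.stop()
  }
}
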
Example 60
Source File: JdbcRelationProvider.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
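
The SaveMode branches above are reached through the standard DataFrameWriter API. A minimal sketch follows; the JDBC URL, credentials and table name are placeholders, and a matching JDBC driver is assumed to be on the classpath.

import java.util.Properties

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcSaveModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("jdbc-savemode-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Placeholder connection settings: replace with a real JDBC URL, table and credentials.
    val url = "jdbc:postgresql://localhost:5432/testdb"
    val props = new Properties()
    props.setProperty("user", "test")
    props.setProperty("password", "test")

    // Append inserts into the target table, creating it first if it does not exist.
    df.write.mode(SaveMode.Append).jdbc(url, "target_table", props)

    // ErrorIfExists throws AnalysisException once "target_table" exists,
    // matching the case handled in createRelation above.
    df.write.mode(SaveMode.ErrorIfExists).jdbc(url, "target_table", props)
  }
}
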
Example 61
Source File: QueryExecutionSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
import org.apache.spark.sql.test.SharedSQLContext

class QueryExecutionSuite extends SharedSQLContext {
  test("toString() exception/error handling") {
    val badRule = new SparkStrategy {
      var mode: String = ""
      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
        case "exception" => throw new AnalysisException(mode)
        case "error" => throw new Error(mode)
        case _ => Nil
      }
    }
    spark.experimental.extraStrategies = badRule :: Nil

    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)

    // Nothing!
    badRule.mode = ""
    assert(qe.toString.contains("OneRowRelation"))

    // Throw an AnalysisException - this should be captured.
    badRule.mode = "exception"
    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))

    // Throw an Error - this should not be captured.
    badRule.mode = "error"
    val error = intercept[Error](qe.toString)
    assert(error.getMessage.contains("error"))
  }
} 
Example 62
Source File: VariableSubstitutionSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }

} 
Example 63
Source File: ResolvedDataSourceSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
Example 64
Source File: DDLSourceLoadSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// Please note that META-INF/services had to be modified in the test directory for this to work.
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
} 
Example 65
Source File: CarbonMetastoreTypes.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.util

import scala.util.parsing.combinator.RegexParsers

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.types._

object CarbonMetastoreTypes extends RegexParsers {
  protected lazy val primitiveType: Parser[DataType] =
    "string" ^^^ StringType |
    "varchar" ^^^ StringType |
    "float" ^^^ FloatType |
    "int" ^^^ IntegerType |
    "tinyint" ^^^ ShortType |
    "short" ^^^ ShortType |
    "double" ^^^ DoubleType |
    "long" ^^^ LongType |
    "binary" ^^^ BinaryType |
    "boolean" ^^^ BooleanType |
    fixedDecimalType |
    "decimal" ^^^ "decimal" ^^^ DecimalType(10, 0) |
    "varchar\\((\\d+)\\)".r ^^^ StringType |
    "date" ^^^ DateType |
    "timestamp" ^^^ TimestampType

  protected lazy val fixedDecimalType: Parser[DataType] =
    "decimal" ~> "(" ~> "^[1-9]\\d*".r ~ ("," ~> "^[0-9]\\d*".r <~ ")") ^^ {
      case precision ~ scale =>
        DecimalType(precision.toInt, scale.toInt)
    }

  protected lazy val arrayType: Parser[DataType] =
    "array" ~> "<" ~> dataType <~ ">" ^^ {
      case tpe => ArrayType(tpe)
    }

  protected lazy val mapType: Parser[DataType] =
    "map" ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
      case t1 ~ _ ~ t2 => MapType(t1, t2)
    }

  protected lazy val structField: Parser[StructField] =
    "[a-zA-Z0-9_]*".r ~ ":" ~ dataType ^^ {
      case name ~ _ ~ tpe => StructField(name, tpe, nullable = true)
    }

  protected lazy val structType: Parser[DataType] =
    "struct" ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
      case fields => StructType(fields)
    }

  protected lazy val dataType: Parser[DataType] =
    arrayType |
    mapType |
    structType |
    primitiveType

  def toDataType(metastoreType: String): DataType = {
    parseAll(dataType, metastoreType) match {
      case Success(result, _) => result
      case _: NoSuccess =>
        throw new AnalysisException(s"Unsupported dataType: $metastoreType")
    }
  }

  def toMetastoreType(dt: DataType): String = {
    dt match {
      case ArrayType(elementType, _) => s"array<${ toMetastoreType(elementType) }>"
      case StructType(fields) =>
        s"struct<${
          fields.map(f => s"${ f.name }:${ toMetastoreType(f.dataType) }")
            .mkString(",")
        }>"
      case MapType(keyType, valueType, _) =>
        s"map<${ toMetastoreType(keyType) }, ${ toMetastoreType(valueType) }>"
      case StringType => "string"
      case FloatType => "float"
      case IntegerType => "int"
      case ShortType => "tinyint"
      case DoubleType => "double"
      case LongType => "bigint"
      case BinaryType => "binary"
      case BooleanType => "boolean"
      case DecimalType() => "decimal"
      case TimestampType => "timestamp"
      case DateType => "date"
    }
  }
} 
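
A small usage sketch of the parser above (assuming the object is on the classpath), round-tripping a struct between its metastore string form and a Spark DataType; unsupported strings raise the AnalysisException thrown in toDataType.

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.sql.util.CarbonMetastoreTypes

object MetastoreTypesSketch {
  def main(args: Array[String]): Unit = {
    // String -> DataType: fields parsed by structField default to nullable = true.
    val parsed = CarbonMetastoreTypes.toDataType("struct<name:string,age:int>")
    println(parsed)

    // DataType -> String: prints struct<name:string,age:int>.
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = true)))
    println(CarbonMetastoreTypes.toMetastoreType(schema))
  }
}
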
Example 66
Source File: AlterTableRevertTestCase.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.carbondata.restructure

import java.io.File

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.TestQueryExecutor
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.metadata.CarbonMetadata
import org.apache.carbondata.spark.exception.ProcessMetaDataException

class AlterTableRevertTestCase extends QueryTest with BeforeAndAfterAll {

  override def beforeAll() {
    sql("drop table if exists reverttest")
    sql(
      "CREATE TABLE reverttest(intField int,stringField string,timestampField timestamp," +
      "decimalField decimal(6,2)) STORED AS carbondata")
    sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/restructure/data4.csv' INTO TABLE reverttest " +
        s"options('FILEHEADER'='intField,stringField,timestampField,decimalField')")
  }

  test("test to revert new added columns on failure") {
    intercept[ProcessMetaDataException] {
      hiveClient.runSqlHive("set hive.security.authorization.enabled=true")
      sql(
        "Alter table reverttest add columns(newField string) TBLPROPERTIES" +
        "('DEFAULT.VALUE.newField'='def')")
      hiveClient.runSqlHive("set hive.security.authorization.enabled=false")
      intercept[AnalysisException] {
        sql("select newField from reverttest")
      }
    }
  }

  test("test to revert table name on failure") {
    val exception = intercept[ProcessMetaDataException] {
      new File(TestQueryExecutor.warehouse + "/reverttest_fail").mkdir()
      sql("alter table reverttest rename to reverttest_fail")
      new File(TestQueryExecutor.warehouse + "/reverttest_fail").delete()
    }
    val result = sql("select * from reverttest").count()
    assert(result.equals(1L))
    sql("drop table if exists reverttest_fail")
  }

  test("test to revert drop columns on failure") {
    intercept[ProcessMetaDataException] {
      hiveClient.runSqlHive("set hive.security.authorization.enabled=true")
      sql("Alter table reverttest drop columns(decimalField)")
      hiveClient.runSqlHive("set hive.security.authorization.enabled=false")
    }
    assert(sql("select decimalField from reverttest").count().equals(1L))
  }

  test("test to revert changed datatype on failure") {
    intercept[ProcessMetaDataException] {
      hiveClient.runSqlHive("set hive.security.authorization.enabled=true")
      sql("Alter table reverttest change intField intfield bigint")
      hiveClient.runSqlHive("set hive.security.authorization.enabled=false")
    }
    assert(
      sql("select intfield from reverttest").schema.fields.apply(0).dataType.simpleString == "int")
  }

  test("test to check if dictionary files are deleted for new column if query fails") {
    intercept[ProcessMetaDataException] {
      hiveClient.runSqlHive("set hive.security.authorization.enabled=true")
      sql(
        "Alter table reverttest add columns(newField string) TBLPROPERTIES" +
        "('DEFAULT.VALUE.newField'='def')")
      hiveClient.runSqlHive("set hive.security.authorization.enabled=false")
      intercept[AnalysisException] {
        sql("select newField from reverttest")
      }
      val carbonTable = CarbonMetadata.getInstance.getCarbonTable("default", "reverttest")

      assert(new File(carbonTable.getMetadataPath).listFiles().length < 6)
    }
  }

  override def afterAll() {
    hiveClient.runSqlHive("set hive.security.authorization.enabled=false")
    sql("drop table if exists reverttest")
    sql("drop table if exists reverttest_fail")
  }

} 
Example 67
Source File: CarbonTableSchemaCommonSuite.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.util.QueryTest
import org.junit.Assert
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.spark.exception.ProcessMetaDataException

class CarbonTableSchemaCommonSuite extends QueryTest with BeforeAndAfterAll {

  test("Creating table: Duplicate dimensions found with name, it should throw AnalysisException") {
    sql("DROP TABLE IF EXISTS carbon_table")
    try {
      sql(
        s"""
           | CREATE TABLE carbon_table(
           | BB INT, bb char(10)
           | )
           | STORED AS carbondata
       """.stripMargin)
      Assert.assertTrue(false)
    } catch {
      case _: AnalysisException => Assert.assertTrue(true)
      case _: Exception => Assert.assertTrue(false)
    } finally {
      sql("DROP TABLE IF EXISTS carbon_table")
    }
  }

  test("Altering table: Duplicate column found with name, it should throw RuntimeException") {
    sql("DROP TABLE IF EXISTS carbon_table")
    sql(
      s"""
         | CREATE TABLE if not exists carbon_table(
         | BB INT, cc char(10)
         | )
         | STORED AS carbondata
       """.stripMargin)

    val ex = intercept[ProcessMetaDataException] {
      sql(
        s"""
           | alter TABLE carbon_table add columns(
           | bb char(10)
            )
       """.stripMargin)
    }
    sql("DROP TABLE IF EXISTS carbon_table")
  }

} 
Example 68
Source File: MVExceptionTestCase.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.view.rewrite

import org.apache.carbondata.common.exceptions.sql.{MalformedCarbonCommandException, MalformedMVCommandException}
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

class MVExceptionTestCase  extends QueryTest with BeforeAndAfterAll {
  override def beforeAll: Unit = {
    drop()
    sql("create table main_table (name string,age int,height int) STORED AS carbondata")
  }

  test("test mv no base table") {
    val ex = intercept[AnalysisException] {
      sql("create materialized view main_table_mv as select sum(age),name from main_table_error group by name")
    }
    assert(ex.getMessage().contains("Table or view not found: main_table_error"))
  }

  test("test mv reduplicate mv table") {
    val ex = intercept[MalformedMVCommandException] {
      sql("create materialized view main_table_mv1 as select sum(age),name from main_table group by name")
      sql("create materialized view main_table_mv1 as select sum(age),name from main_table group by name")
    }
    assertResult("Materialized view with name default.main_table_mv1 already exists")(ex.getMessage)
  }

  test("test mv creation with limit in query") {
    val ex = intercept[MalformedCarbonCommandException] {
      sql("create materialized view maintable_mv2 as select sum(age),name from main_table group by name limit 10")
    }
    assertResult("Materialized view does not support the query with limit")(ex.getMessage)
  }

  def drop(): Unit = {
    sql("drop table IF EXISTS main_table")
    sql("drop table if exists main_table_error")
    sql("drop materialized view if exists main_table_mv")
    sql("drop materialized view if exists main_table_mv1")
  }

  override def afterAll(): Unit = {
    drop()
  }
} 
Example 69
Source File: TestDataWithDicExcludeAndInclude.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.dataload

import org.apache.spark.sql.AnalysisException
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.spark.sql.test.util.QueryTest

class TestLoadDataWithDictionaryExcludeAndInclude extends QueryTest with BeforeAndAfterAll {
  var filePath: String = _
  var pwd: String = _

  def buildTestData() = {
    filePath = s"$resourcesPath/emptyDimensionData.csv"
  }

  def dropTable() = {
    sql("DROP TABLE IF EXISTS exclude_include_t3")
    sql("DROP TABLE IF EXISTS exclude_include_hive_t3")
  }

  def buildTable() = {
    try {
      sql(
        """
           CREATE TABLE exclude_include_hive_t3
           (ID Int, date Timestamp, country String,
           name String, phonetype String, serialname String, salary Int)
           row format delimited fields terminated by ','
        """)
      sql(
        """
           CREATE TABLE exclude_include_t3
           (ID Int, date Timestamp, country String,
           name String, phonetype String, serialname String, salary Int)
           STORED AS carbondata
        """)
    } catch {
      case ex: Throwable => LOGGER.error(ex.getMessage + "\r\n" + ex.getStackTraceString)
    }
  }

  def loadTable() = {
    try {
      CarbonProperties.getInstance()
        .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd")
      sql(
        s"""
           LOAD DATA LOCAL INPATH '$filePath' into table exclude_include_t3
           """)
      sql(
        s"""
           LOAD DATA LOCAL INPATH '$resourcesPath/emptyDimensionDataHive.csv' into table exclude_include_hive_t3
           """)
    } catch {
      case ex: Throwable => LOGGER.error(ex.getMessage + "\r\n" + ex.getStackTraceString)
    }
  }

  override def beforeAll {
    dropTable
    buildTestData
    buildTable
    loadTable
  }

  test("test load data with dictionary exclude & include and with empty dimension") {
    checkAnswer(
      sql("select ID from exclude_include_t3"), sql("select ID from exclude_include_hive_t3")
    )
  }

  override def afterAll {
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
        CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
    dropTable
  }
} 
Example 70
Source File: TestCreateTableIfNotExists.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite.createTable

import java.util.concurrent.{Callable, Executors, ExecutorService, Future, TimeUnit}

import org.apache.spark.sql.test.util.QueryTest
import org.apache.spark.sql.AnalysisException
import org.scalatest.BeforeAndAfterAll

class TestCreateTableIfNotExists extends QueryTest with BeforeAndAfterAll {

  override def beforeAll {
    sql("use default")
    sql("drop table if exists test")
    sql("drop table if exists sourceTable")
    sql("drop table if exists targetTable")
  }

  test("test create table if not exists") {
    sql("create table test(a int, b string) STORED AS carbondata")
    try {
      // table creation should be successful
      sql("create table if not exists test(a int, b string) STORED AS carbondata")
      assert(true)
    } catch {
      case ex: Exception =>
        assert(false)
    }
  }

  test("test create table if not exist concurrently") {

    val executorService: ExecutorService = Executors.newFixedThreadPool(10)
    var futures: List[Future[_]] = List()
    for (i <- 0 until (3)) {
      futures = futures :+ runAsync()
    }

    executorService.shutdown()
    executorService.awaitTermination(30L, TimeUnit.SECONDS)

    futures.foreach { future =>
      assertResult("PASS")(future.get.toString)
    }

    def runAsync(): Future[String] = {
      executorService.submit(new Callable[String] {
        override def call() = {
          // Create table
          var result = "PASS"
          try {
            sql("create table IF NOT EXISTS TestIfExists(name string) STORED AS carbondata")
          } catch {
            case exception: Exception =>
              result = exception.getMessage
              exception.printStackTrace()
          }
          result
        }
      })
    }
  }

  test("test create table without column specified") {
    val exception = intercept[AnalysisException] {
      sql("create table TableWithoutColumn STORED AS carbondata tblproperties('sort_columns'='')")
    }
    assert(exception.getMessage.contains("Unable to infer the schema"))
  }

  override def afterAll {
    sql("use default")
    sql("drop table if exists test")
    sql("drop table if exists sourceTable")
    sql("drop table if exists targetTable")
    sql("drop table if exists TestIfExists")
  }

} 
Example 71
Source File: TestCarbonCli.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.spark.testsuite

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

class TestCarbonCli extends QueryTest with BeforeAndAfterAll{

  override protected def beforeAll(): Unit = {
    sql("drop table if exists OneRowTable")
    sql("create table OneRowTable(col1 string, col2 string, col3 int, col4 double) STORED AS carbondata")
    sql("insert into OneRowTable select '0.1', 'a.b', 1, 1.2")
  }

  test("CarbonCli table summary") {
    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -a')"),
      true, "## Summary")

    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -v')"),
      true, "## version Details")

    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -s')"),
      true, "## Schema")

    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -t')"),
      true, "## Table Properties")

    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -m')"),
      true, "## Segment")
  }

  test("CarbonCli column details") {
    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd summary -c col1')"),
      true, "## Column Statistics for 'col1'")
  }

  test("CarbonCli benchmark") {
    checkExistence(
      sql("carboncli for table OneRowTable options('-cmd benchmark -c col1')"),
      true, "## Benchmark")
  }

  test("CarbonCli invalid cmd"){

    assert(intercept[AnalysisException] {
      sql("carboncli for table OneRowTable").show()
    }.getMessage().contains("mismatched input 'carboncli'"))

    assert(intercept[Exception] {
      sql("carboncli for table OneRowTable options('')")
    }.getMessage().contains("Missing required option: cmd"))

    checkExistence(sql("carboncli for table OneRowTable options('-cmd test')"),
      true, "command test is not supported")
  }

  override protected def afterAll(): Unit = {
    sql("drop table if exists OneRowTable")
  }
} 
Example 72
Source File: HiveDeltaDDLSuite.scala    From delta   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.delta

import org.apache.spark.sql.delta.test.DeltaHiveTest

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.test.TestHiveSingleton

abstract class HiveDeltaDDLSuiteBase
  extends DeltaDDLTestBase {

  override protected def verifyDescribeTable(tblName: String): Unit = {
    val res = sql(s"DESCRIBE TABLE $tblName").collect()
    assert(res.takeRight(2).map(_.getString(1)) === Seq("name", "dept"))
  }

  override protected def verifyNullabilityFailure(exception: AnalysisException): Unit = {
    exception.getMessage.contains("not supported for changing column")
  }
}

class HiveDeltaDDLSuite extends HiveDeltaDDLSuiteBase with DeltaHiveTest 
Example 73
Source File: PlanUtil.scala    From spark-druid-olap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.util

import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.hive.sparklinedata.SPLSessionState
import org.apache.spark.sql.{AnalysisException, DataFrame, Dataset, SQLContext}
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.sparklinedata.druid.metadata.DruidRelationInfo
import org.sparklinedata.druid.{DruidQuery, DruidRelation, QuerySpec, Utils}


object PlanUtil {

  import Utils._

  def druidRelationInfo(tableName: String)(implicit sqlContext: SQLContext):
  Option[DruidRelationInfo] = {
    sqlContext.table(tableName).logicalPlan.collectFirst {
      case LogicalRelation(DruidRelation(drInfo, _), _, _) => drInfo
    }
  }

  def dataFrame(drInfo: DruidRelationInfo, dq: DruidQuery)(
    implicit sqlContext: SQLContext): DataFrame = {
    val dR = DruidRelation(drInfo, Some(dq))(sqlContext)
    val lP = LogicalRelation(dR, None)
    Dataset.ofRows(sqlContext.sparkSession, lP)
  }

  @throws(classOf[AnalysisException])
  def logicalPlan(dsName: String, dqStr: String, usingHist: Boolean)(
    implicit sqlContext: SQLContext): LogicalPlan = {
    val drInfo = druidRelationInfo(dsName)
    if (!drInfo.isDefined) {
      throw new AnalysisException(s"Cannot execute a DruidQuery on $dsName")
    }
    val dq = new DruidQuery(parse(dqStr).extract[QuerySpec],
      drInfo.get.options.useSmile(sqlContext),
      usingHist,
      drInfo.get.options.numSegmentsPerHistoricalQuery(sqlContext))
    val dR = DruidRelation(drInfo.get, Some(dq))(sqlContext)
    LogicalRelation(dR, None)
  }

  
  def maxCardinalityIsOne(lp: LogicalPlan): Boolean = {
    var isone = false

    val aggs = lp.collect {case ag: Aggregate if ag.groupingExpressions.isEmpty => ag}
    if (aggs.nonEmpty) {
      isone = !isCardinalityAugmented(lp, aggs.asInstanceOf[Seq[LogicalPlan]])
    }
    isone
  }
} 
Example 74
Source File: CreateHiveTableAsSelectCommand.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan, OverwriteOptions}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.hive.MetastoreRelation



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    ignoreIfExists: Boolean)
  extends RunnableCommand {

  private val tableIdentifier = tableDesc.identifier

  override def innerChildren: Seq[LogicalPlan] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat =
        tableDesc.withNewStorage(
          inputFormat =
            tableDesc.storage.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat =
            tableDesc.storage.outputFormat
              .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.storage.serde.orElse(Some(classOf[LazySimpleSerDe].getName)),
          compressed = tableDesc.storage.compressed)

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying a column list for the target table in CTAS,
        // but we don't think Spark SQL should follow that restriction.
        tableDesc.copy(schema = query.output.toStructType)
      } else {
        withFormat
      }

      sparkSession.sessionState.catalog.createTable(withSchema, ignoreIfExists = false)

      // Get the Metastore Relation
      sparkSession.sessionState.catalog.lookupRelation(tableIdentifier) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO: ideally, we should get the output data ready first and then
    // add the relation into the catalog, in case a failure occurs while
    // the data is being processed.
    if (sparkSession.sessionState.catalog.tableExists(tableIdentifier)) {
      if (ignoreIfExists) {
        // the table already exists; do nothing, to stay consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      try {
        sparkSession.sessionState.executePlan(InsertIntoTable(
          metastoreRelation, Map(), query, overwrite = OverwriteOptions(true),
          ifNotExists = false)).toRdd
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          sparkSession.sessionState.catalog.dropTable(tableIdentifier, ignoreIfNotExists = true,
            purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
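
The AnalysisException above is what a user sees when a CREATE TABLE ... AS SELECT targets a table that already exists. A minimal, hedged sketch of that scenario, assuming a Hive-enabled local session (the table name and values are placeholders):

import org.apache.spark.sql.{AnalysisException, SparkSession}

val spark = SparkSession.builder()
  .master("local[*]")
  .enableHiveSupport()
  .getOrCreate()

spark.sql("CREATE TABLE ctas_demo AS SELECT 1 AS id")
try {
  // The second CTAS finds ctas_demo in the catalog and, since ignoreIfExists is false, fails.
  spark.sql("CREATE TABLE ctas_demo AS SELECT 2 AS id")
} catch {
  case e: AnalysisException => println(e.getMessage)   // e.g. "`default`.`ctas_demo` already exists."
}
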
Example 75
Source File: SparkSQLDriver.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}
import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext, SparkSession}
import org.apache.spark.sql.execution.QueryExecution


private[hive] class SparkSQLDriver(val sparkSession: SparkSession = SparkSQLEnv.sparkSession)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      sparkSession.sparkContext.setJobDescription(command)
      val execution = sparkSession.sessionState.executePlan(sparkSession.sql(command).logicalPlan)
      hiveResponse = execution.hiveResultString()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 76
Source File: AnalysisTest.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.SimpleCatalystConf
import org.apache.spark.sql.catalyst.catalog.{InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SimpleCatalystConf(caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, EmptyFunctionRegistry, conf)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }

    if (!expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
} 
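
A hypothetical suite built on the trait above, assuming it sits next to AnalysisTest in the catalyst test sources. It shows how assertAnalysisError matches the AnalysisException message and how assertAnalysisSuccess covers the happy path:

package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._

class UnresolvedColumnSuite extends AnalysisTest {

  test("selecting an unknown column is rejected during analysis") {
    // testRelation only exposes column 'a', so resolving 'missing must fail.
    val plan = TestRelations.testRelation.select('missing)
    assertAnalysisError(plan, Seq("cannot resolve"))
  }

  test("selecting the existing column analyzes cleanly") {
    assertAnalysisSuccess(TestRelations.testRelation.select('a))
  }
}
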
Example 77
Source File: ResolveInlineTablesSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.types.{LongType, NullType}


class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables.validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables.validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables.validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables.convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables.convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables.convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
Example 78
Source File: ResolveSubquerySuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}


class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(OuterReference(a)), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.ResolveSubquery(expr)
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
} 
Example 79
Source File: JdbcRelationProvider.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
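
A hedged, user-level sketch of the SaveMode.ErrorIfExists branch above. The JDBC URL, credentials and table name are placeholders, and the target table is assumed to already exist:

import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(10).toDF("id")
try {
  df.write
    .format("jdbc")
    .option("url", "jdbc:postgresql://localhost:5432/testdb")   // placeholder URL
    .option("dbtable", "existing_table")                        // assumed to already exist
    .option("user", "test")
    .option("password", "test")
    .mode(SaveMode.ErrorIfExists)
    .save()
} catch {
  case e: AnalysisException =>
    println(e.getMessage)   // Table or view 'existing_table' already exists. SaveMode: ErrorIfExists.
}
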
Example 80
Source File: QueryExecutionSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
import org.apache.spark.sql.test.SharedSQLContext

class QueryExecutionSuite extends SharedSQLContext {
  test("toString() exception/error handling") {
    val badRule = new SparkStrategy {
      var mode: String = ""
      override def apply(plan: LogicalPlan): Seq[SparkPlan] = mode.toLowerCase match {
        case "exception" => throw new AnalysisException(mode)
        case "error" => throw new Error(mode)
        case _ => Nil
      }
    }
    spark.experimental.extraStrategies = badRule :: Nil

    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation)

    // Nothing!
    badRule.mode = ""
    assert(qe.toString.contains("OneRowRelation"))

    // Throw an AnalysisException - this should be captured.
    badRule.mode = "exception"
    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))

    // Throw an Error - this should not be captured.
    badRule.mode = "error"
    val error = intercept[Error](qe.toString)
    assert(error.getMessage.contains("error"))
  }
} 
Example 81
Source File: VariableSubstitutionSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }

} 
Example 82
Source File: ResolvedDataSourceSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
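
The same lookup failure at the DataFrameReader level, as a hedged sketch: without an external Avro package on the classpath, resolving the format fails before any file is touched, so the path below is only a placeholder:

import org.apache.spark.sql.{AnalysisException, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
try {
  spark.read.format("avro").load("/tmp/any-path")
} catch {
  case e: AnalysisException => println(e.getMessage)   // Failed to find data source: avro. ...
}
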
Example 83
Source File: DDLSourceLoadSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructField, StructType}


// Note that META-INF/services had to be modified in the test directory for this suite to work
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name") {
    intercept[RuntimeException] {
      spark.read.format("Fluet da Bomb").load()
    }
  }

  test("load data source from format alias") {
    spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("specify full classname with duplicate formats") {
    spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))
  }

  test("should fail to load ORC without Hive Support") {
    val e = intercept[AnalysisException] {
      spark.read.format("orc").load()
    }
    assert(e.message.contains("The ORC data source must be used with Hive support enabled"))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider  with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
} 
Example 84
Source File: OrcFileOperator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.Logging
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveMetastoreTypes
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {
  
  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(path: String, conf: Option[Configuration]): StructType = {
    val reader = getFileReader(path, conf).getOrElse {
      throw new AnalysisException(
        s"Failed to discover schema from ORC files stored in $path. " +
          "Probably there are either no ORC files or only empty ORC files.")
    }
    val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
    val schema = readerInspector.getTypeName
    logDebug(s"Reading schema from file $path, got Hive schema string: $schema")
    HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
  }

  def getObjectInspector(
      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDir)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))

    if (paths == null || paths.isEmpty) {
      throw new IllegalArgumentException(
        s"orcFileOperator: path $path does not have valid orc files matching the pattern")
    }

    paths
  }
} 
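
A hedged sketch of the two failure modes of the reader above. The path is a placeholder, and OrcFileOperator is private[orc], so a real caller has to live in org.apache.spark.sql.hive.orc:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.AnalysisException

try {
  // A directory whose ORC files are all empty (zero rows) reaches the AnalysisException above;
  // a directory with no ORC files at all fails earlier, in listOrcFiles, with IllegalArgumentException.
  OrcFileOperator.readSchema("/data/warehouse/empty_orc_table", Some(new Configuration()))
} catch {
  case e: AnalysisException => println(e.getMessage)
  case e: IllegalArgumentException => println(e.getMessage)
}
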
Example 85
Source File: CreateTableAsSelect.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import org.apache.spark.annotation.Experimental
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.catalyst.expressions.Row
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn}
import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes}


private[hive]
case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean)
  extends RunnableCommand {

  def database: String = tableDesc.database
  def tableName: String = tableDesc.name

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withSchema =
        tableDesc.copy(
          schema =
            query.output.map(c =>
              HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)),
          inputFormat =
            tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat =
            tableDesc.outputFormat
              .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))
      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (hiveContext.catalog.tableExists(Seq(database, tableName))) {
      if (allowExisting) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$database.$tableName already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString
  }
} 
Example 86
Source File: AbstractSparkSQLDriver.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] abstract class AbstractSparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    // Error handling for this abstract driver is delegated to runWrapper below, which
    // converts AnalysisException and other Throwables into wrapped responses.
    context.sparkContext.setJobDescription(command)
    val execution = context.executePlan(context.sql(command).logicalPlan)
    hiveResponse = execution.stringResult()
    tableSchema = getResultSetSchema(execution)
    new CommandProcessorResponse(0)
  }

  def runWrapper(command: String): CommandProcessorResponseWrapper = try {
    val result = run(command)
    new CommandProcessorResponseWrapper(result, null)
  } catch {
    case ae: AnalysisException =>
      logDebug(s"Failed in [$command]", ae)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(ae), null), ae)
    case cause: Throwable =>
      logError(s"Failed in [$command]", cause)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(cause), null), cause)
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}

private[hive] case class CommandProcessorResponseWrapper(
    rc : CommandProcessorResponse,
    cause : Throwable) 
Example 87
Source File: SqlSchemaInferrer.scala    From seahorse   with Apache License 2.0 5 votes vote down vote up
package ai.deepsense.deeplang.inference

import ai.deepsense.commons.spark.sql.UserDefinedFunctions
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{AnalysisException, Row}

import ai.deepsense.sparkutils.SQL

case class SqlInferenceWarning(sqlExpression: String, warningText: String) extends
  InferenceWarning(s"Schema for SQL formula '$sqlExpression' cannot be inferred ($warningText).")

class SqlSchemaInferrer {
  def inferSchema(sqlExpression: String, inputSchemas: (String, StructType)*)
  : (StructType, InferenceWarnings) = {
    try {
      val localSpark = SQL.createEmptySparkSQLSession()
      inputSchemas.foreach { case (dataFrameId, schema) =>
        val emptyData = localSpark.sparkContext.parallelize(Seq(Row.empty))
        val emptyDf = localSpark.createDataFrame(emptyData, schema)
        SQL.registerTempTable(emptyDf, dataFrameId)
      }
      val resultSchema = localSpark.sql(sqlExpression).schema
      val warnings = if (!namesUnique(inputSchemas)) {
        InferenceWarnings(SqlInferenceWarning(sqlExpression, "DataFrame ids must be unique."))
      } else if (resultSchema.isEmpty) {
        InferenceWarnings(SqlInferenceWarning(sqlExpression, "Expression must be non-empty."))
      } else {
        InferenceWarnings.empty
      }
      (resultSchema, warnings)
    } catch {
      case e @ (_: AnalysisException | _: IllegalArgumentException) =>
        (StructType(Seq.empty),
          InferenceWarnings(SqlInferenceWarning(sqlExpression, s"Invalid Spark SQL expression: ${e.getMessage}")))
    }
  }

  private def namesUnique(inputSchemas: Seq[(String, StructType)]): Boolean = {
    val names = inputSchemas.map { case (name, _) => name}
    names.size == names.toSet.size
  }
} 
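
A hedged usage sketch for the inferrer above, assuming the seahorse dependencies (ai.deepsense.sparkutils) are on the classpath; the DataFrame id, column names and the deliberately broken expression are illustrative:

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val customers = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType)))

val inferrer = new SqlSchemaInferrer
// Referencing a column that does not exist ends up in the AnalysisException branch above,
// so the caller gets an empty schema plus a SqlInferenceWarning instead of a thrown exception.
val (schema, warnings) =
  inferrer.inferSchema("SELECT missing_col FROM customers", "customers" -> customers)
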
Example 88
Source File: CSVInOutTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
package com.salesforce.op.utils.io.csv

import com.salesforce.op.test.TestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{AnalysisException, DataFrame}
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CSVInOutTest extends FlatSpec with TestSparkContext {
  private val csvReader = new CSVInOut(CSVOptions(header = true))
  private val csvFile = s"$testDataDir/PassengerDataAllWithHeader.csv"

  Spec[CSVInOut] should "throw error for bad file paths with DataFrame" in {
    val error = intercept[AnalysisException](csvReader.readDataFrame("/bad/file/path/read/dataframe"))
    error.getMessage should endWith ("Path does not exist: file:/bad/file/path/read/dataframe;")
  }

  it should "throw error for bad file paths with RDD" in {
    val error = intercept[AnalysisException](csvReader.readRDD("/bad/file/path/read/rdd"))
    error.getMessage should endWith ("Path does not exist: file:/bad/file/path/read/rdd;")
  }

  it should "read a CSV file to DataFrame" in {
    val res = csvReader.readDataFrame(csvFile)
    res shouldBe a[DataFrame]
    res.count shouldBe 891
  }

  it should "read a CSV file to RDD" in {
    val res = csvReader.readRDD(csvFile)
    res shouldBe a[RDD[_]]
    res.count shouldBe 891
  }
} 
Example 89
Source File: SparkSQLDriver.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, List => JList}

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 90
Source File: randomExpressions.scala    From spark1.52   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom


case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
    """
  }
} 
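
A hedged sketch of the validation in the auxiliary constructor above, written against the Spark 1.5.x classes this example targets: any non-integer seed expression falls through to the AnalysisException branch.

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Literal, Randn}

try {
  new Randn(Literal("not-a-seed"))   // a string literal does not match IntegerLiteral
} catch {
  case e: AnalysisException => println(e.getMessage)
}
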
Example 91
Source File: KafkaWriter.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}
import org.apache.spark.sql.types.{BinaryType, StringType}
import org.apache.spark.util.Utils


private[kafka010] object KafkaWriter extends Logging {
  val TOPIC_ATTRIBUTE_NAME: String = "topic"
  val KEY_ATTRIBUTE_NAME: String = "key"
  val VALUE_ATTRIBUTE_NAME: String = "value"

  override def toString: String = "KafkaWriter"

  def validateQuery(
      schema: Seq[Attribute],
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse(
      if (topic.isEmpty) {
        throw new AnalysisException(s"topic option required when no " +
          s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " +
          s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.")
      } else {
        Literal(topic.get, StringType)
      }
    ).dataType match {
      case StringType => // good
      case _ =>
        throw new AnalysisException(s"Topic type must be a String")
    }
    schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse(
      Literal(null, StringType)
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
    schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse(
      throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found")
    ).dataType match {
      case StringType | BinaryType => // good
      case _ =>
        throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " +
          s"must be a String or BinaryType")
    }
  }

  def write(
      sparkSession: SparkSession,
      queryExecution: QueryExecution,
      kafkaParameters: ju.Map[String, Object],
      topic: Option[String] = None): Unit = {
    val schema = queryExecution.analyzed.output
    validateQuery(schema, kafkaParameters, topic)
    queryExecution.toRdd.foreachPartition { iter =>
      val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic)
      Utils.tryWithSafeFinally(block = writeTask.execute(iter))(
        finallyBlock = writeTask.close())
    }
  }
} 
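
A hedged sketch of validateQuery rejecting a DataFrame without a 'value' column. KafkaWriter is private[kafka010], so the snippet assumes it is compiled into that same package; the topic name is a placeholder.

package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.spark.sql.{AnalysisException, SparkSession}

object KafkaWriterValidationDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val df = spark.range(5).toDF("id")   // no 'value' column
    try {
      KafkaWriter.validateQuery(
        df.queryExecution.analyzed.output,
        new ju.HashMap[String, Object](),
        topic = Some("demo-topic"))
    } catch {
      case e: AnalysisException => println(e.getMessage)   // Required attribute 'value' not found
    }
    spark.stop()
  }
}
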
Example 92
Source File: CreateHiveTableAsSelectCommand.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand



case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumns: Seq[Attribute],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      InsertIntoHiveTable(
        tableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumns = outputColumns).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumns = outputColumns).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
} 
Example 93
Source File: TestHiveSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.test.{TestHiveSingleton, TestHiveSparkSession}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.test.SQLTestUtils


class TestHiveSuite extends TestHiveSingleton with SQLTestUtils {
  test("load test table based on case sensitivity") {
    val testHiveSparkSession = spark.asInstanceOf[TestHiveSparkSession]

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "false") {
      sql("SELECT * FROM SRC").queryExecution.analyzed
      assert(testHiveSparkSession.getLoadedTables.contains("src"))
      assert(testHiveSparkSession.getLoadedTables.size == 1)
    }
    testHiveSparkSession.reset()

    withSQLConf(SQLConf.CASE_SENSITIVE.key -> "true") {
      val err = intercept[AnalysisException] {
        sql("SELECT * FROM SRC").queryExecution.analyzed
      }
      assert(err.message.contains("Table or view not found"))
    }
    testHiveSparkSession.reset()
  }

  test("SPARK-15887: hive-site.xml should be loaded") {
    assert(hiveClient.getConf("hive.in.test", "") == "true")
  }
} 
Example 94
Source File: SparkSQLDriver.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, Arrays, List => JList}

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.execution.{QueryExecution, SQLExecution}


private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, attr.dataType.catalogString, "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.sessionState.executePlan(context.sql(command).logicalPlan)
      hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) {
        execution.hiveResultString()
      }
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 95
Source File: HiveMetastoreLazyInitializationSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.util.Utils

class HiveMetastoreLazyInitializationSuite extends SparkFunSuite {

  test("lazily initialize Hive client") {
    val spark = SparkSession.builder()
      .appName("HiveMetastoreLazyInitializationSuite")
      .master("local[2]")
      .enableHiveSupport()
      .config("spark.hadoop.hive.metastore.uris", "thrift://127.0.0.1:11111")
      .getOrCreate()
    val originalLevel = org.apache.log4j.Logger.getRootLogger().getLevel
    try {
      // Avoid outputting a lot of expected warning logs
      spark.sparkContext.setLogLevel("error")

      // We should be able to run Spark jobs without Hive client.
      assert(spark.sparkContext.range(0, 1).count() === 1)

      // Make sure that we are not using the local derby metastore.
      val exceptionString = Utils.exceptionString(intercept[AnalysisException] {
        spark.sql("show tables")
      })
      for (msg <- Seq(
        "show tables",
        "Could not connect to meta store",
        "org.apache.thrift.transport.TTransportException",
        "Connection refused")) {
        assert(exceptionString.contains(msg))
      }
    } finally {
      spark.sparkContext.setLogLevel(originalLevel.toString)
      spark.stop()
    }
  }
} 
Example 96
Source File: SchemaUtils.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType


private[spark] object SchemaUtils {
  def checkColumnNameDuplication(
      columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new AnalysisException(
        s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }
} 
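
A hedged usage sketch for the check above. SchemaUtils is private[spark], so a real caller must sit under the org.apache.spark package, as Spark's own commands do; the column names are illustrative.

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.util.SchemaUtils

try {
  // "ID" collides with "id" once the comparison is case-insensitive.
  SchemaUtils.checkColumnNameDuplication(
    Seq("id", "name", "ID"), "in the table definition", caseSensitiveAnalysis = false)
} catch {
  case e: AnalysisException =>
    println(e.getMessage)   // Found duplicate column(s) in the table definition: `id`
}
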
Example 97
Source File: StringUtils.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.util

import java.util.regex.{Pattern, PatternSyntaxException}

import org.apache.spark.sql.AnalysisException
import org.apache.spark.unsafe.types.UTF8String

object StringUtils {

  
  def filterPattern(names: Seq[String], pattern: String): Seq[String] = {
    val funcNames = scala.collection.mutable.SortedSet.empty[String]
    pattern.trim().split("\\|").foreach { subPattern =>
      try {
        val regex = ("(?i)" + subPattern.replaceAll("\\*", ".*")).r
        funcNames ++= names.filter{ name => regex.pattern.matcher(name).matches() }
      } catch {
        case _: PatternSyntaxException =>
      }
    }
    funcNames.toSeq
  }
} 
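
A hedged usage sketch of filterPattern, the SHOW TABLES / SHOW FUNCTIONS LIKE-style matcher above; the names are illustrative.

import org.apache.spark.sql.catalyst.util.StringUtils

val names = Seq("json_tuple", "to_json", "from_json", "map_keys")
// '*' is a wildcard, '|' separates alternative patterns, and matching is case-insensitive.
val matched = StringUtils.filterPattern(names, "*JSON*|map_*")
// matched == Seq("from_json", "json_tuple", "map_keys", "to_json")
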
Example 98
Source File: SchemaUtilsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.util

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.analysis._
import org.apache.spark.sql.types.StructType

class SchemaUtilsSuite extends SparkFunSuite {

  private def resolver(caseSensitiveAnalysis: Boolean): Resolver = {
    if (caseSensitiveAnalysis) {
      caseSensitiveResolution
    } else {
      caseInsensitiveResolution
    }
  }

  Seq((true, ("a", "a"), ("b", "b")), (false, ("a", "A"), ("b", "B"))).foreach {
      case (caseSensitive, (a0, a1), (b0, b1)) =>

    val testType = if (caseSensitive) "case-sensitive" else "case-insensitive"
    test(s"Check column name duplication in $testType cases") {
      def checkExceptionCases(schemaStr: String, duplicatedColumns: Seq[String]): Unit = {
        val expectedErrorMsg = "Found duplicate column(s) in SchemaUtilsSuite: " +
          duplicatedColumns.map(c => s"`${c.toLowerCase}`").mkString(", ")
        val schema = StructType.fromDDL(schemaStr)
        var msg = intercept[AnalysisException] {
          SchemaUtils.checkSchemaColumnNameDuplication(
            schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
        msg = intercept[AnalysisException] {
          SchemaUtils.checkColumnNameDuplication(
            schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
        }.getMessage
        assert(msg.contains(expectedErrorMsg))
      }

      checkExceptionCases(s"$a0 INT, b INT, $a1 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, b INT, $a1 INT, $a0 INT", a0 :: Nil)
      checkExceptionCases(s"$a0 INT, $b0 INT, $a1 INT, $a0 INT, $b1 INT", b0 :: a0 :: Nil)
    }
  }

  test("Check no exception thrown for valid schemas") {
    def checkNoExceptionCases(schemaStr: String, caseSensitive: Boolean): Unit = {
      val schema = StructType.fromDDL(schemaStr)
      SchemaUtils.checkSchemaColumnNameDuplication(
        schema, "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", resolver(caseSensitive))
      SchemaUtils.checkColumnNameDuplication(
        schema.map(_.name), "in SchemaUtilsSuite", caseSensitiveAnalysis = caseSensitive)
    }

    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = true)
    checkNoExceptionCases("Aa INT, b INT, aA INT", caseSensitive = true)

    checkNoExceptionCases("a INT, b INT, c INT", caseSensitive = false)
  }
} 
Example 99
Source File: AnalysisTest.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import java.net.URI
import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogDatabase, InMemoryCatalog, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.internal.SQLConf

trait AnalysisTest extends PlanTest {

  protected val caseSensitiveAnalyzer = makeAnalyzer(caseSensitive = true)
  protected val caseInsensitiveAnalyzer = makeAnalyzer(caseSensitive = false)

  private def makeAnalyzer(caseSensitive: Boolean): Analyzer = {
    val conf = new SQLConf().copy(SQLConf.CASE_SENSITIVE -> caseSensitive)
    val catalog = new SessionCatalog(new InMemoryCatalog, FunctionRegistry.builtin, conf)
    catalog.createDatabase(
      CatalogDatabase("default", "", new URI("loc"), Map.empty),
      ignoreIfExists = false)
    catalog.createTempView("TaBlE", TestRelations.testRelation, overrideIfExists = true)
    catalog.createTempView("TaBlE2", TestRelations.testRelation2, overrideIfExists = true)
    catalog.createTempView("TaBlE3", TestRelations.testRelation3, overrideIfExists = true)
    new Analyzer(catalog, conf) {
      override val extendedResolutionRules = EliminateSubqueryAliases :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.executeAndCheck(inputPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected override def comparePlans(
      plan1: LogicalPlan,
      plan2: LogicalPlan,
      checkAnalysis: Boolean = false): Unit = {
    // Analysis tests may have not been fully resolved, so skip checkAnalysis.
    super.comparePlans(plan1, plan2, checkAnalysis)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val analysisAttempt = analyzer.execute(inputPlan)
    try analyzer.checkAnalysis(analysisAttempt) catch {
      case a: AnalysisException =>
        fail(
          s"""
            |Failed to Analyze Plan
            |$inputPlan
            |
            |Partial Analysis
            |$analysisAttempt
          """.stripMargin, a)
    }
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }

    if (!expectedErrors.map(_.toLowerCase(Locale.ROOT)).forall(
        e.getMessage.toLowerCase(Locale.ROOT).contains)) {
      fail(
        s"""Exception message should contain the following substrings:
           |
           |  ${expectedErrors.mkString("\n  ")}
           |
           |Actual exception message:
           |
           |  ${e.getMessage}
         """.stripMargin)
    }
  }
} 
Example 100
Source File: ResolveInlineTablesSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}


class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
} 
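For context (not in the original suite), ResolveInlineTables is the rule behind the SQL VALUES inline-table syntax. A rough end-to-end sketch, assuming an active SparkSession named spark:

// Resolves the inline table to a LocalRelation with columns id and name.
val inline = spark.sql("SELECT * FROM VALUES (1, 'a'), (2, 'b') AS t(id, name)")
inline.show()

// A nondeterministic expression such as rand() inside VALUES is rejected with an
// AnalysisException, mirroring the foldability check exercised above.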
Example 101
Source File: ResolveSubquerySuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.expressions.{In, ListQuery, OuterReference}
import org.apache.spark.sql.catalyst.plans.logical.{Filter, LocalRelation, Project}


class ResolveSubquerySuite extends AnalysisTest {

  val a = 'a.int
  val b = 'b.int
  val t1 = LocalRelation(a)
  val t2 = LocalRelation(b)

  test("SPARK-17251 Improve `OuterReference` to be `NamedExpression`") {
    val expr = Filter(In(a, Seq(ListQuery(Project(Seq(UnresolvedAttribute("a")), t2)))), t1)
    val m = intercept[AnalysisException] {
      SimpleAnalyzer.checkAnalysis(SimpleAnalyzer.ResolveSubquery(expr))
    }.getMessage
    assert(m.contains(
      "Expressions referencing the outer query are not supported outside of WHERE/HAVING clauses"))
  }
} 
Example 102
Source File: CheckCartesianProductsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import org.scalatest.Matchers._

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.internal.SQLConf.CROSS_JOINS_ENABLED

class CheckCartesianProductsSuite extends PlanTest {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("Check Cartesian Products", Once, CheckCartesianProducts) :: Nil
  }

  val testRelation1 = LocalRelation('a.int, 'b.int)
  val testRelation2 = LocalRelation('c.int, 'd.int)

  val joinTypesWithRequiredCondition = Seq(Inner, LeftOuter, RightOuter, FullOuter)
  val joinTypesWithoutRequiredCondition = Seq(LeftSemi, LeftAnti, ExistenceJoin('exists))

  test("CheckCartesianProducts doesn't throw an exception if cross joins are enabled)") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "true") {
      noException should be thrownBy {
        for (joinType <- joinTypesWithRequiredCondition ++ joinTypesWithoutRequiredCondition) {
          performCartesianProductCheck(joinType)
        }
      }
    }
  }

  test("CheckCartesianProducts throws an exception for join types that require a join condition") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithRequiredCondition) {
        val thrownException = the [AnalysisException] thrownBy {
          performCartesianProductCheck(joinType)
        }
        assert(thrownException.message.contains("Detected implicit cartesian product"))
      }
    }
  }

  test("CheckCartesianProducts doesn't throw an exception if a join condition is present") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithRequiredCondition) {
        noException should be thrownBy {
          performCartesianProductCheck(joinType, Some('a === 'd))
        }
      }
    }
  }

  test("CheckCartesianProducts doesn't throw an exception if join types don't require conditions") {
    withSQLConf(CROSS_JOINS_ENABLED.key -> "false") {
      for (joinType <- joinTypesWithoutRequiredCondition) {
        noException should be thrownBy {
          performCartesianProductCheck(joinType)
        }
      }
    }
  }

  private def performCartesianProductCheck(
      joinType: JoinType,
      condition: Option[Expression] = None): Unit = {
    val analyzedPlan = testRelation1.join(testRelation2, joinType, condition).analyze
    val optimizedPlan = Optimize.execute(analyzedPlan)
    comparePlans(analyzedPlan, optimizedPlan)
  }
} 
Example 103
Source File: JdbcRelationProvider.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    import JDBCOptions._

    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn.isEmpty) {
      assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " +
        s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty")
      null
    } else {
      assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty,
        s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " +
          s"'$JDBC_NUM_PARTITIONS' are also required")
      JDBCPartitioningInfo(
        partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
} 
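A hedged usage sketch (not from the source above): writing with SaveMode.ErrorIfExists against a table that already exists reaches the branch that throws AnalysisException. The connection details, table name, and DataFrame df are placeholders.

import org.apache.spark.sql.SaveMode

// Assumes an active SparkSession and an existing DataFrame `df`.
df.write
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/testdb")   // placeholder URL
  .option("dbtable", "existing_table")                        // placeholder table name
  .option("user", "test")
  .option("password", "test")
  .mode(SaveMode.ErrorIfExists)
  .save()   // AnalysisException if existing_table is already present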
Example 104
Source File: AnalyzeTableCommand.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.command

import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.catalog.CatalogTableType



case class AnalyzeTableCommand(
    tableIdent: TableIdentifier,
    noscan: Boolean = true) extends RunnableCommand {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val sessionState = sparkSession.sessionState
    val db = tableIdent.database.getOrElse(sessionState.catalog.getCurrentDatabase)
    val tableIdentWithDB = TableIdentifier(tableIdent.table, Some(db))
    val tableMeta = sessionState.catalog.getTableMetadata(tableIdentWithDB)
    if (tableMeta.tableType == CatalogTableType.VIEW) {
      throw new AnalysisException("ANALYZE TABLE is not supported on views.")
    }

    // Compute stats for the whole table
    val newTotalSize = CommandUtils.calculateTotalSize(sessionState, tableMeta)
    val newRowCount =
      if (noscan) None else Some(BigInt(sparkSession.table(tableIdentWithDB).count()))

    // Update the metastore if the above statistics of the table are different from those
    // recorded in the metastore.
    val newStats = CommandUtils.compareAndGetNewStats(tableMeta.stats, newTotalSize, newRowCount)
    if (newStats.isDefined) {
      sessionState.catalog.alterTableStats(tableIdentWithDB, newStats)
    }

    Seq.empty[Row]
  }
} 
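For reference (not part of the command's source), AnalyzeTableCommand backs the ANALYZE TABLE statement; a minimal sketch, assuming a SparkSession spark and an existing table t:

// noscan = true: only the total size is computed; the row count is left unset.
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS NOSCAN")

// noscan = false: additionally scans the table to compute the row count.
spark.sql("ANALYZE TABLE t COMPUTE STATISTICS")

// Running it against a view raises the AnalysisException thrown above.
spark.sql("ANALYZE TABLE some_view COMPUTE STATISTICS")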
Example 105
Source File: JdbcUtilsSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.parser.ParseException
import org.apache.spark.sql.types._

class JdbcUtilsSuite extends SparkFunSuite {

  val tableSchema = StructType(Seq(
    StructField("C1", StringType, false), StructField("C2", IntegerType, false)))
  val caseSensitive = org.apache.spark.sql.catalyst.analysis.caseSensitiveResolution
  val caseInsensitive = org.apache.spark.sql.catalyst.analysis.caseInsensitiveResolution

  test("Parse user specified column types") {
    assert(JdbcUtils.getCustomSchema(tableSchema, null, caseInsensitive) === tableSchema)
    assert(JdbcUtils.getCustomSchema(tableSchema, "", caseInsensitive) === tableSchema)

    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseInsensitive) ===
      StructType(Seq(StructField("C1", DateType, false), StructField("C2", IntegerType, false))))
    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE", caseSensitive) ===
      StructType(Seq(StructField("C1", StringType, false), StructField("C2", IntegerType, false))))

    assert(
      JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseInsensitive) ===
      StructType(Seq(StructField("C1", DateType, false), StructField("C2", StringType, false))))
    assert(JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, C2 STRING", caseSensitive) ===
      StructType(Seq(StructField("C1", StringType, false), StructField("C2", StringType, false))))

    // Throw AnalysisException
    val duplicate = intercept[AnalysisException]{
      JdbcUtils.getCustomSchema(tableSchema, "c1 DATE, c1 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c1", DateType, false), StructField("c1", StringType, false)))
    }
    assert(duplicate.getMessage.contains(
      "Found duplicate column(s) in the customSchema option value"))

    // Throw ParseException
    val dataTypeNotSupported = intercept[ParseException]{
      JdbcUtils.getCustomSchema(tableSchema, "c3 DATEE, C2 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false)))
    }
    assert(dataTypeNotSupported.getMessage.contains("DataType datee is not supported"))

    val mismatchedInput = intercept[ParseException]{
      JdbcUtils.getCustomSchema(tableSchema, "c3 DATE. C2 STRING", caseInsensitive) ===
        StructType(Seq(StructField("c3", DateType, false), StructField("C2", StringType, false)))
    }
    assert(mismatchedInput.getMessage.contains("mismatched input '.' expecting"))
  }
} 
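The custom schema string parsed by JdbcUtils.getCustomSchema is surfaced through the JDBC reader's customSchema option; a rough sketch with placeholder connection details, assuming a SparkSession spark:

val jdbcDF = spark.read
  .format("jdbc")
  .option("url", "jdbc:postgresql://localhost:5432/testdb")   // placeholder URL
  .option("dbtable", "some_table")                            // placeholder table name
  .option("customSchema", "C1 DATE, C2 STRING")               // overrides the mapped column types
  .load()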
Example 106
Source File: QueryExecutionSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
import org.apache.spark.sql.test.SharedSQLContext

class QueryExecutionSuite extends SharedSQLContext {
  test("toString() exception/error handling") {
    spark.experimental.extraStrategies = Seq(
        new SparkStrategy {
          override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
        })

    def qe: QueryExecution = new QueryExecution(spark, OneRowRelation())

    // Nothing!
    assert(qe.toString.contains("OneRowRelation"))

    // Throw an AnalysisException - this should be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new AnalysisException("exception")
      })
    assert(qe.toString.contains("org.apache.spark.sql.AnalysisException"))

    // Throw an Error - this should not be captured.
    spark.experimental.extraStrategies = Seq(
      new SparkStrategy {
        override def apply(plan: LogicalPlan): Seq[SparkPlan] =
          throw new Error("error")
      })
    val error = intercept[Error](qe.toString)
    assert(error.getMessage.contains("error"))
  }
} 
Example 107
Source File: VariableSubstitutionSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.internal

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException

class VariableSubstitutionSuite extends SparkFunSuite {

  private lazy val conf = new SQLConf
  private lazy val sub = new VariableSubstitution(conf)

  test("system property") {
    System.setProperty("varSubSuite.var", "abcd")
    assert(sub.substitute("${system:varSubSuite.var}") == "abcd")
  }

  test("environmental variables") {
    assert(sub.substitute("${env:SPARK_TESTING}") == "1")
  }

  test("Spark configuration variable") {
    conf.setConfString("some-random-string-abcd", "1234abcd")
    assert(sub.substitute("${hiveconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${sparkconf:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${spark:some-random-string-abcd}") == "1234abcd")
    assert(sub.substitute("${some-random-string-abcd}") == "1234abcd")
  }

  test("multiple substitutes") {
    val q = "select ${bar} ${foo} ${doo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "2")
    conf.setConfString("doo", "3")
    assert(sub.substitute(q) == "select 1 2 3 this is great")
  }

  test("test nested substitutes") {
    val q = "select ${bar} ${foo} this is great"
    conf.setConfString("bar", "1")
    conf.setConfString("foo", "${bar}")
    assert(sub.substitute(q) == "select 1 1 this is great")
  }

} 
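A small standalone sketch mirroring the suite above. VariableSubstitution and SQLConf live in Spark's internal package, so this is illustrative rather than public API; the key name is arbitrary.

import org.apache.spark.sql.internal.{SQLConf, VariableSubstitution}

val conf = new SQLConf
val sub = new VariableSubstitution(conf)
conf.setConfString("app.limit", "10")
// "${app.limit}" is replaced with the configured value before the query is parsed.
assert(sub.substitute("SELECT * FROM t LIMIT ${app.limit}") == "SELECT * FROM t LIMIT 10")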
Example 108
Source File: TextSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.datasources.text

import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{AnalysisException, DataFrame, QueryTest, Row}
import org.apache.spark.util.Utils


class TextSuite extends QueryTest with SharedSQLContext {

  test("reading text file") {
    verifyFrame(sqlContext.read.format("text").load(testFile))
  }

  test("SQLContext.read.text() API") {
    verifyFrame(sqlContext.read.text(testFile))
  }

  test("SPARK-12562 verify write.text() can handle column name beyond `value`") {
    val df = sqlContext.read.text(testFile).withColumnRenamed("value", "adwrasdf")

    val tempFile = Utils.createTempDir()
    tempFile.delete()
    df.write.text(tempFile.getCanonicalPath)
    verifyFrame(sqlContext.read.text(tempFile.getCanonicalPath))

    Utils.deleteRecursively(tempFile)
  }

  test("error handling for invalid schema") {
    val tempFile = Utils.createTempDir()
    tempFile.delete()

    val df = sqlContext.range(2)
    intercept[AnalysisException] {
      df.write.text(tempFile.getCanonicalPath)
    }

    intercept[AnalysisException] {
      sqlContext.range(2).select(df("id"), df("id") + 1).write.text(tempFile.getCanonicalPath)
    }
  }

  private def testFile: String = {
    Thread.currentThread().getContextClassLoader.getResource("text-suite.txt").toString
  }

  
  private def verifyFrame(df: DataFrame): Unit = {
    // schema
    assert(df.schema == new StructType().add("value", StringType))

    // verify content
    val data = df.collect()
    assert(data(0) == Row("This is a test file for the text data source"))
    assert(data(1) == Row("1+1"))
    // non ascii characters are not allowed in the code, so we disable the scalastyle here.
    // scalastyle:off
    assert(data(2) == Row("数据砖头"))
    // scalastyle:on
    assert(data(3) == Row("\"doh\""))
    assert(data.length == 4)
  }
} 
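As a quick illustration of the "invalid schema" case above (paths are placeholders, spark is an active SparkSession): the text source only accepts a single string column, so anything else fails during analysis.

import org.apache.spark.sql.functions.col

// OK: exactly one string column.
spark.range(2).select(col("id").cast("string")).write.text("/tmp/text-ok")

// Fails with AnalysisException: the single column is not of string type.
spark.range(2).write.text("/tmp/text-bad")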
Example 109
Source File: SqlSchemaInferrer.scala    From seahorse-workflow-executor   with Apache License 2.0 5 votes vote down vote up
package io.deepsense.deeplang.inference

import io.deepsense.commons.spark.sql.UserDefinedFunctions
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{AnalysisException, Row}

import io.deepsense.sparkutils.SQL

case class SqlInferenceWarning(sqlExpression: String, warningText: String) extends
  InferenceWarning(s"Schema for SQL formula '$sqlExpression' cannot be inferred ($warningText).")

class SqlSchemaInferrer {
  def inferSchema(sqlExpression: String, inputSchemas: (String, StructType)*)
  : (StructType, InferenceWarnings) = {
    try {
      val localSpark = SQL.createEmptySparkSQLSession()
      UserDefinedFunctions.registerFunctions(localSpark.udfRegistration)
      inputSchemas.foreach { case (dataFrameId, schema) =>
        val emptyData = localSpark.sparkContext.parallelize(Seq(Row.empty))
        val emptyDf = localSpark.createDataFrame(emptyData, schema)
        SQL.registerTempTable(emptyDf, dataFrameId)
      }
      val resultSchema = localSpark.sql(sqlExpression).schema
      val warnings = if (!namesUnique(inputSchemas)) {
        InferenceWarnings(SqlInferenceWarning(sqlExpression, "DataFrame ids must be unique."))
      } else if (resultSchema.isEmpty) {
        InferenceWarnings(SqlInferenceWarning(sqlExpression, "Expression must be non-empty."))
      } else {
        InferenceWarnings.empty
      }
      (resultSchema, warnings)
    } catch {
      case e @ (_: AnalysisException | _: IllegalArgumentException) =>
        (StructType(Seq.empty),
          InferenceWarnings(SqlInferenceWarning(sqlExpression, s"Invalid Spark SQL expression: ${e.getMessage}")))
    }
  }

  private def namesUnique(inputSchemas: Seq[(String, StructType)]): Boolean = {
    val names = inputSchemas.map { case (name, _) => name}
    names.size == names.toSet.size
  }
} 
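A hedged usage sketch for the class above; the schema and SQL expression are made up for illustration.

import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val inputSchema = StructType(Seq(
  StructField("id", IntegerType),
  StructField("name", StringType)))

val inferrer = new SqlSchemaInferrer
// Returns the inferred result schema together with any inference warnings.
val (resultSchema, warnings) = inferrer.inferSchema("SELECT id FROM df1", "df1" -> inputSchema)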
Example 110
Source File: ExpectsGenotypeFieldsSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql.util

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.functions._

import io.projectglow.Glow
import io.projectglow.functions._
import io.projectglow.sql.GlowBaseTest

class ExpectsGenotypeFieldsSuite extends GlowBaseTest {
  lazy val gatkTestVcf =
    s"$testDataHome/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf"
  lazy val sess = spark

  // This is how we originally detected an issue where ExpectsGenotypeFields succeeds during
  // resolution but fails during physical planning.
  // PR: https://github.com/projectglow/glow/pull/224
  test("use genotype_states after splitting multiallelics") {
    val df = spark.read.format("vcf").load(gatkTestVcf)
    val split = Glow.transform("split_multiallelics", df)
    split.select(genotype_states(col("genotypes"))).collect()
  }

  test("use genotype_states after array_zip") {
    import sess.implicits._
    val df = spark
      .createDataFrame(Seq((Seq("a"), Seq(Seq(1, 1)))))
      .withColumnRenamed("_1", "sampleId")
      .withColumnRenamed("_2", "calls")
    val zipped = df.select(arrays_zip(col("sampleId"), col("calls")).as("genotypes"))
    val states = zipped.select(genotype_states(col("genotypes")))
    assert(states.as[Seq[Int]].head == Seq(2))
  }

  test("type check") {
    val df = spark.createDataFrame(Seq(Tuple1("a"))).withColumnRenamed("_1", "sampleId")
    val withGenotypes = df.select(array(struct("sampleId")).as("genotypes"))
    val ex = intercept[AnalysisException](withGenotypes.select(genotype_states(col("genotypes"))))
    assert(ex.message.contains("Genotype struct was missing required fields: (name: calls"))
  }
} 
Example 111
Source File: SqlExtensionProviderSuite.scala    From glow   with Apache License 2.0 5 votes vote down vote up
package io.projectglow.sql

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Literal, UnaryExpression}
import org.apache.spark.sql.types.{DataType, IntegerType}

import io.projectglow.GlowSuite

class SqlExtensionProviderSuite extends GlowSuite {
  override def beforeAll(): Unit = {
    super.beforeAll()
    SqlExtensionProvider.registerFunctions(
      spark.sessionState.conf,
      spark.sessionState.functionRegistry,
      "test-functions.yml")
  }

  private lazy val sess = spark
  test("one arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("one_arg_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("one_arg_test(id, id)").collect()
    }
  }

  test("two arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("two_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id)").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("two_arg_test(id, id, id)").collect()
    }
  }

  test("var args function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("var_args_test(id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id, id, id, id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("var_args_test(id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("var_args_test()").collect()
    }
  }

  test("can call optional arg function") {
    import sess.implicits._
    assert(spark.range(1).selectExpr("optional_arg_test(id)").as[Int].head() == 1)
    assert(spark.range(1).selectExpr("optional_arg_test(id, id)").as[Int].head() == 1)

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test()").collect()
    }

    intercept[AnalysisException] {
      spark.range(1).selectExpr("optional_arg_test(id, id, id)").collect()
    }
  }
}

trait TestExpr extends Expression with CodegenFallback {
  override def dataType: DataType = IntegerType

  override def nullable: Boolean = true

  override def eval(input: InternalRow): Any = 1
}

case class OneArgExpr(child: Expression) extends UnaryExpression with TestExpr
case class TwoArgExpr(left: Expression, right: Expression) extends BinaryExpression with TestExpr
case class VarArgsExpr(arg: Expression, varArgs: Seq[Expression]) extends TestExpr {
  override def children: Seq[Expression] = arg +: varArgs
}
case class OptionalArgExpr(required: Expression, optional: Expression) extends TestExpr {
  def this(required: Expression) = this(required, Literal(1))
  override def children: Seq[Expression] = Seq(required, optional)
} 
Example 112
Source File: StringColumnConstraintTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.AnalysisException
import org.scalatest.{FlatSpec, Matchers}

class StringColumnConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "A StringColumnConstraint" should "succeed if all rows satisfy the given condition" in {
    val constraintString = "column > 0"
    val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString)
    val constraint = check.constraints.head
    val result = StringColumnConstraintResult(
      constraint = StringColumnConstraint(constraintString),
      data = Some(StringColumnConstraintResultData(failedRows = 0L)),
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if there are rows that do not satisfy the given condition" in {
    val constraintString = "column > 1"
    val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString)
    val constraint = check.constraints.head
    val result = StringColumnConstraintResult(
      constraint = StringColumnConstraint(constraintString),
      data = Some(StringColumnConstraintResultData(failedRows = 1L)),
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "error if the column does not exist" in {
    val constraintString = "notExisting > 0"
    val check = Check(TestData.makeIntegerDf(spark, List(1, 2, 3))).satisfies(constraintString)
    val constraint = check.constraints.head
    val result = check.run().constraintResults(constraint)
    result match {
      case StringColumnConstraintResult(
      StringColumnConstraint("notExisting > 0"),
      None,
      constraintError: ConstraintError
      ) => {
        val analysisException = constraintError.throwable.asInstanceOf[AnalysisException]
        analysisException.message shouldBe "cannot resolve '`notExisting`' given input columns: [column]"
      }
    }
  }

  "A StringColumnConstraintResult" should "have the correct success message" in {
    val constraint = StringColumnConstraint("column > 0")
    val result = StringColumnConstraintResult(
      constraint = constraint,
      data = Some(StringColumnConstraintResultData(failedRows = 0L)),
      status = ConstraintSuccess
    )
    result.message shouldBe "Constraint column > 0 is satisfied."
  }

  it should "have the correct failure message (one row)" in {
    val constraint = StringColumnConstraint("column > 0")
    val result = StringColumnConstraintResult(
      constraint = constraint,
      data = Some(StringColumnConstraintResultData(failedRows = 1L)),
      status = ConstraintFailure
    )
    result.message shouldBe "1 row did not satisfy constraint column > 0."
  }

  it should "have the correct failure message (multiple rows)" in {
    val constraint = StringColumnConstraint("column > 0")
    val result = StringColumnConstraintResult(
      constraint = constraint,
      data = Some(StringColumnConstraintResultData(failedRows = 2L)),
      status = ConstraintFailure
    )
    result.message shouldBe "2 rows did not satisfy constraint column > 0."
  }

  it should "have the correct error message" in {
    val constraint = StringColumnConstraint("column > 0")
    val result = StringColumnConstraintResult(
      constraint = constraint,
      data = None,
      status = ConstraintError(new IllegalArgumentException("error"))
    )
    result.message shouldBe "Checking constraint column > 0 failed: java.lang.IllegalArgumentException: error"
  }

  it should "throw an exception if it is created with an illegal combination of fields" in {
    intercept[IllegalConstraintResultException] {
      StringColumnConstraintResult(
        constraint = StringColumnConstraint("column > 0"),
        status = ConstraintFailure,
        data = None
      )
    }
  }

} 
Example 113
Source File: CustomConstraintTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.AnalysisException
import org.scalatest.{FlatSpec, Matchers}

class CustomConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "A CustomConstraint" should "succeed if the function returns a success message" in {
    val constraintName = "name"
    val successMsg = "success"
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => Right(successMsg)
    })
    val constraint = check.constraints.head
    val result = CustomConstraintResult(
      constraint = constraint.asInstanceOf[CustomConstraint],
      message = s"Custom constraint '$constraintName' succeeded: $successMsg",
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if the function returns a failure message" in {
    val constraintName = "name"
    val failureMsg = "failure"
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => Left(failureMsg)
    })
    val constraint = check.constraints.head
    val result = CustomConstraintResult(
      constraint = constraint.asInstanceOf[CustomConstraint],
      message = s"Custom constraint '$constraintName' failed: $failureMsg",
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "error if the function throws an exception" in {
    val constraintName = "name"
    val exception = new Exception()
    val check = Check(TestData.makeNullableStringDf(spark, List("a"))).custom(constraintName, {
      df => throw exception
    })
    val constraint = check.constraints.head
    val result = check.run().constraintResults(constraint)
    result match {
      case CustomConstraintResult(
      customConstraint: CustomConstraint,
      "Custom constraint 'name' errored: java.lang.Exception",
      constraintError: ConstraintError
      ) => {
        constraintError.throwable shouldBe exception
      }
    }
  }

} 
Example 114
Source File: AlwaysNullConstraintTest.scala    From drunken-data-quality   with Apache License 2.0 5 votes vote down vote up
package de.frosner.ddq.constraints

import de.frosner.ddq.core.Check
import de.frosner.ddq.testutils.{SparkContexts, TestData}
import org.apache.spark.sql.AnalysisException
import org.scalatest.{FlatSpec, Matchers}

class AlwaysNullConstraintTest extends FlatSpec with Matchers with SparkContexts {

  "An AlwaysNullConstraint" should "succeed if the column is always null" in {
    val column = "column"
    val check = Check(TestData.makeNullableStringDf(spark, List(null, null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = AlwaysNullConstraintResult(
      constraint = AlwaysNullConstraint(column),
      data = Some(AlwaysNullConstraintResultData(nonNullRows = 0L)),
      status = ConstraintSuccess
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "fail if the column is not always null" in {
    val column = "column"
    val check = Check(TestData.makeNullableStringDf(spark, List("a", null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = AlwaysNullConstraintResult(
      constraint = AlwaysNullConstraint(column),
      data = Some(AlwaysNullConstraintResultData(nonNullRows = 1L)),
      status = ConstraintFailure
    )
    check.run().constraintResults shouldBe Map(constraint -> result)
  }

  it should "error if the column is not existing" in {
    val column = "notExisting"
    val check = Check(TestData.makeNullableStringDf(spark, List("a", null, null))).isAlwaysNull(column)
    val constraint = check.constraints.head
    val result = check.run().constraintResults(constraint)
    result match {
      case AlwaysNullConstraintResult(
      AlwaysNullConstraint("notExisting"),
      constraintError: ConstraintError,
      None
      ) =>
        val analysisException = constraintError.throwable.asInstanceOf[AnalysisException]
        analysisException.message shouldBe "cannot resolve '`notExisting`' given input columns: [column]"
    }
  }

  "An AlwaysNullConstraintResult" should "have the correct success message" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintSuccess,
      data = Some(AlwaysNullConstraintResultData(0L))
    )
    result.message shouldBe "Column c is always null."
  }

  it should "have the correct failure message (one row)" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintFailure,
      data = Some(AlwaysNullConstraintResultData(1L))
    )
    result.message shouldBe "Column c contains 1 non-null row (should always be null)."
  }

  it should "have the correct failure message (multiple rows)" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintFailure,
      data = Some(AlwaysNullConstraintResultData(2L))
    )
    result.message shouldBe "Column c contains 2 non-null rows (should always be null)."
  }

  it should "have the correct error message" in {
    val constraint = AlwaysNullConstraint("c")
    val result = AlwaysNullConstraintResult(
      constraint = constraint,
      status = ConstraintError(new IllegalArgumentException("column c not found")),
      data = None
    )
    result.message shouldBe "Checking column c for being always null failed: " +
      "java.lang.IllegalArgumentException: column c not found"
  }

  it should "throw an exception if it is created with an illegal combination of fields" in {
    intercept[IllegalConstraintResultException] {
      AlwaysNullConstraintResult(
        constraint = AlwaysNullConstraint("c"),
        status = ConstraintFailure,
        data = None
      )
    }
  }

} 
Example 115
Source File: MergeClauseSuite.scala    From spark-acid   with Apache License 2.0 5 votes vote down vote up
package com.qubole.spark.hiveacid.merge

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.{AnalysisException, functions}

class MergeClauseSuite extends SparkFunSuite {
  def insertClause(addCondition : Boolean = true): MergeWhenNotInsert = {
    if (addCondition) {
      MergeWhenNotInsert(Some(functions.expr("x > 2").expr),
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
    else {
      MergeWhenNotInsert(None,
        Seq(functions.col("x").expr, functions.col("y").expr))
    }
  }

  def updateClause(addCondition : Boolean = true): MergeWhenUpdateClause = {
    if (addCondition) {
      val updateCondition = Some(functions.expr("a > 2").expr)
      MergeWhenUpdateClause(updateCondition,
        Map("b" -> functions.lit(3).expr), isStar = false)
    } else {
      MergeWhenUpdateClause(None,
        Map("b" -> functions.lit(3).expr), isStar = false)
    }
  }

  def deleteClause(addCondition : Boolean = true): MergeWhenDelete = {
    if (addCondition) {
      MergeWhenDelete(Some(functions.expr("a < 1").expr))
    } else {
      MergeWhenDelete(None)
    }
  }

  test("Validate MergeClauses") {
    val clauses = Seq(insertClause(), updateClause(), deleteClause())
    MergeWhenClause.validate(clauses)
  }

  test("Invalid MergeClause cases") {
    val invalidMerge = "MERGE Validation Error: "

    //empty clauses
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.atleastOneClauseError, Seq())

    // multi update or insert clauses
    val multiUpdateClauses = Seq(updateClause(), updateClause(), insertClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.justOneClausePerTypeError, multiUpdateClauses)

    // multi match clauses with first clause without condition
    val invalidMultiMatch = Seq(updateClause(false), deleteClause())
    checkInvalidMergeClause(invalidMerge + MergeWhenClause.matchClauseConditionError, invalidMultiMatch)

    // invalid Update Clause
    val invalidUpdateClause = MergeWhenUpdateClause(None, Map(), isStar = false)
    val thrown = intercept[IllegalArgumentException] {
      MergeWhenClause.validate(Seq(invalidUpdateClause))
    }
    assert(thrown.getMessage === "UPDATE Clause in MERGE should have one or more SET Values")
  }

  private def checkInvalidMergeClause(invalidMessage: String, multiUpdateClauses: Seq[MergeWhenClause]) = {
    val thrown = intercept[AnalysisException] {
      MergeWhenClause.validate(multiUpdateClauses)
    }
    assert(thrown.message === invalidMessage)
  }
} 
Example 116
Source File: HiveAcidUtils.scala    From spark-acid   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive

import scala.collection.JavaConverters._
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTablePartition, CatalogUtils}
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, BoundReference, Expression, InterpretedPredicate, PrettyAttribute}

object HiveAcidUtils {

  
  def prunePartitionsByFilter(
                               hiveAcidMetadata: HiveAcidMetadata,
                               inputPartitions: Seq[CatalogTablePartition],
                               predicates: Option[Expression],
                               defaultTimeZoneId: String): Seq[CatalogTablePartition] = {
    if (predicates.isEmpty) {
      inputPartitions
    } else {
      val partitionSchema = hiveAcidMetadata.partitionSchema
      val partitionColumnNames = hiveAcidMetadata.partitionSchema.fieldNames.toSet

      val nonPartitionPruningPredicates = predicates.filterNot {
        _.references.map(_.name).toSet.subsetOf(partitionColumnNames)
      }
      if (nonPartitionPruningPredicates.nonEmpty) {
        throw new AnalysisException("Expected only partition pruning predicates: " +
          nonPartitionPruningPredicates)
      }

      val boundPredicate =
        InterpretedPredicate.create(predicates.get.transform {
          case att: Attribute =>
            val index = partitionSchema.indexWhere(_.name == att.name)
            BoundReference(index, partitionSchema(index).dataType, nullable = true)
        })

      inputPartitions.filter { p =>
        boundPredicate.eval(p.toRow(partitionSchema, defaultTimeZoneId))
      }
    }
  }

  def convertToCatalogTablePartition(hp: com.qubole.shaded.hadoop.hive.ql.metadata.Partition): CatalogTablePartition = {
    val apiPartition = hp.getTPartition
    val properties: Map[String, String] = if (hp.getParameters != null) {
      hp.getParameters.asScala.toMap
    } else {
      Map.empty
    }
    CatalogTablePartition(
      spec = Option(hp.getSpec).map(_.asScala.toMap).getOrElse(Map.empty),
      storage = CatalogStorageFormat(
        locationUri = Option(CatalogUtils.stringToURI(apiPartition.getSd.getLocation)),
        inputFormat = Option(apiPartition.getSd.getInputFormat),
        outputFormat = Option(apiPartition.getSd.getOutputFormat),
        serde = Option(apiPartition.getSd.getSerdeInfo.getSerializationLib),
        compressed = apiPartition.getSd.isCompressed,
        properties = Option(apiPartition.getSd.getSerdeInfo.getParameters)
          .map(_.asScala.toMap).orNull),
      createTime = apiPartition.getCreateTime.toLong * 1000,
      lastAccessTime = apiPartition.getLastAccessTime.toLong * 1000,
      parameters = properties,
      stats = None) // TODO: need to implement readHiveStats
  }
} 
Example 117
Source File: ResolvedDataSourceSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
      classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
      classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
      classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
} 
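At the user level the same resolution happens through the format(...) short names; a minimal sketch with placeholder paths, assuming Spark 2.3 without an external Avro package on the classpath:

// Resolves to JsonFileFormat via the "json" short name.
val jsonDF = spark.read.format("json").load("/tmp/input.json")

// Without spark-avro available this fails with
// AnalysisException: "Failed to find data source: avro."
val avroDF = spark.read.format("avro").load("/tmp/input.avro")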
Example 118
Source File: AnalysisTest.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.{TableIdentifier, SimpleCatalystConf}

trait AnalysisTest extends PlanTest {

  val (caseSensitiveAnalyzer, caseInsensitiveAnalyzer) = {
    val caseSensitiveConf = new SimpleCatalystConf(true)
    val caseInsensitiveConf = new SimpleCatalystConf(false)

    val caseSensitiveCatalog = new SimpleCatalog(caseSensitiveConf)
    val caseInsensitiveCatalog = new SimpleCatalog(caseInsensitiveConf)

    caseSensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation)
    caseInsensitiveCatalog.registerTable(TableIdentifier("TaBlE"), TestRelations.testRelation)

    new Analyzer(caseSensitiveCatalog, EmptyFunctionRegistry, caseSensitiveConf) {
      override val extendedResolutionRules = EliminateSubQueries :: Nil
    } ->
    new Analyzer(caseInsensitiveCatalog, EmptyFunctionRegistry, caseInsensitiveConf) {
      override val extendedResolutionRules = EliminateSubQueries :: Nil
    }
  }

  protected def getAnalyzer(caseSensitive: Boolean) = {
    if (caseSensitive) caseSensitiveAnalyzer else caseInsensitiveAnalyzer
  }

  protected def checkAnalysis(
      inputPlan: LogicalPlan,
      expectedPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val actualPlan = analyzer.execute(inputPlan)
    analyzer.checkAnalysis(actualPlan)
    comparePlans(actualPlan, expectedPlan)
  }

  protected def assertAnalysisSuccess(
      inputPlan: LogicalPlan,
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    analyzer.checkAnalysis(analyzer.execute(inputPlan))
  }

  protected def assertAnalysisError(
      inputPlan: LogicalPlan,
      expectedErrors: Seq[String],
      caseSensitive: Boolean = true): Unit = {
    val analyzer = getAnalyzer(caseSensitive)
    val e = intercept[AnalysisException] {
      analyzer.checkAnalysis(analyzer.execute(inputPlan))
    }
    assert(expectedErrors.map(_.toLowerCase).forall(e.getMessage.toLowerCase.contains),
      s"Expected to throw Exception contains: ${expectedErrors.mkString(", ")}, " +
        s"actually we get ${e.getMessage}")
  }
} 
Example 119
Source File: randomExpressions.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom


case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();
    """
  }
} 
Example 120
Source File: SparkSQLDriver.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}
import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }

      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
        case ae: AnalysisException =>
          logDebug(s"Failed in [$command]", ae)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
        case cause: Throwable =>
          logError(s"Failed in [$command]", cause)
          new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
} 
Example 121
Source File: CreateTableAsSelect.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes, MetastoreRelation}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}


private[hive]
case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean)
  extends RunnableCommand {

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def children: Seq[LogicalPlan] = Seq(query)

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat =
        tableDesc.copy(
          inputFormat =
            tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
          outputFormat =
            tableDesc.outputFormat
              .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
          serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema =
        query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
      } else {
        withFormat
      }

      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(tableIdentifier, None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, TableName: ${tableDesc.name}, InsertIntoHiveTable]"
  }
} 
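For context (not part of the source), this command implements Hive-style CTAS; a sketch of statements that reach it, assuming a HiveContext named hiveContext (spark.sql on a Hive-enabled session in later versions) and an existing table src:

// Creates the table and populates it from the query.
hiveContext.sql("CREATE TABLE t_copy AS SELECT * FROM src")

// allowExisting = true corresponds to IF NOT EXISTS: an existing table is left untouched.
// Without IF NOT EXISTS, re-running the CTAS hits the AnalysisException above.
hiveContext.sql("CREATE TABLE IF NOT EXISTS t_copy AS SELECT * FROM src")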
Example 122
Source File: CreateViewAsSelect.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}


// TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different
// from Hive and may not work for some cases like create view on self join.
private[hive] case class CreateViewAsSelect(
    tableDesc: HiveTable,
    childSchema: Seq[Attribute],
    allowExisting: Boolean,
    orReplace: Boolean) extends RunnableCommand {

  assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length)
  assert(tableDesc.viewText.isDefined)

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]

    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // view already exists, will do nothing, to keep consistent with Hive
      } else if (orReplace) {
        hiveContext.catalog.client.alertView(prepareTable())
      } else {
        throw new AnalysisException(s"View $tableIdentifier already exists. " +
          "If you want to update the view definition, please use ALTER VIEW AS or " +
          "CREATE OR REPLACE VIEW AS")
      }
    } else {
      hiveContext.catalog.client.createView(prepareTable())
    }

    Seq.empty[Row]
  }

  private def prepareTable(): HiveTable = {
    // Set up column types according to the schema of the child plan.
    val schema = if (tableDesc.schema == Nil) {
      childSchema.map { attr =>
        HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null)
      }
    } else {
      childSchema.zip(tableDesc.schema).map { case (attr, col) =>
        HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment)
      }
    }

    val columnNames = childSchema.map(f => verbose(f.name))

    // When the user specified column names for the view, we should create a projection to do
    // the renaming. When no column names were specified, we still need a projection to declare
    // the columns we need, to be more robust against top-level `*`s.
    val projectList = if (tableDesc.schema == Nil) {
      columnNames.mkString(", ")
    } else {
      columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map {
        case (name, alias) => s"$name AS $alias"
      }.mkString(", ")
    }

    val viewName = verbose(tableDesc.name)

    val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName"

    tableDesc.copy(schema = schema, viewText = Some(expandedText))
  }

  // Escape backticks in the column name by doubling them, then wrap the name in backticks.
  private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`"
} 
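A minimal usage sketch (not part of the example) of the branches above, assuming a Spark 1.x HiveContext bound to `hiveContext` and an existing table `people`:

// Creates the view; the expanded view text is built by prepareTable().
hiveContext.sql("CREATE VIEW people_names AS SELECT name FROM people")
// Issuing the same CREATE VIEW again throws
//   AnalysisException: View ... already exists. ...
// while the OR REPLACE form takes the orReplace branch and updates the definition instead.
hiveContext.sql("CREATE OR REPLACE VIEW people_names AS SELECT name FROM people")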
Example 123
Source File: OrcFileOperator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.Logging
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.HiveMetastoreTypes
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {
  
  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(path: String, conf: Option[Configuration]): StructType = {
    val reader = getFileReader(path, conf).getOrElse {
      throw new AnalysisException(
        s"Failed to discover schema from ORC files stored in $path. " +
          "Probably there are either no ORC files or only empty ORC files.")
    }
    val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
    val schema = readerInspector.getTypeName
    logDebug(s"Reading schema from file $path, got Hive schema string: $schema")
    HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType]
  }

  def getObjectInspector(
      path: String, conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDir)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))

    if (paths == null || paths.isEmpty) {
      throw new IllegalArgumentException(
        s"orcFileOperator: path $path does not have valid orc files matching the pattern")
    }

    paths
  }
} 
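A minimal sketch (not part of the example) of how the helpers above compose. Since OrcFileOperator is private[orc], the snippet assumes it is placed inside the org.apache.spark.sql.hive.orc package; the warehouse path is a placeholder:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.AnalysisException

val conf = new Configuration()
val orcDir = "/user/hive/warehouse/orc_table"  // hypothetical location of ORC files
try {
  // Throws AnalysisException when no ORC file with a non-empty schema is found.
  val schema = OrcFileOperator.readSchema(orcDir, Some(conf))
  println(s"Inferred ORC schema:\n${schema.treeString}")
} catch {
  case e: AnalysisException => println(s"Schema discovery failed: ${e.getMessage}")
}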
Example 124
Source File: DDLSourceLoadSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.sources

import org.apache.spark.sql.{AnalysisException, SQLContext}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._


// Please note that META-INF/services had to be modified in the test directory for this to work.
class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext {

  test("data sources with the same name - internal data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fluet da Bomb").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fluet da Bomb"))
  }

  test("data sources with the same name - internal data source/external data source") {
    assert(spark.read.format("datasource").load().schema ==
      StructType(Seq(StructField("longType", LongType, nullable = false))))
  }

  test("data sources with the same name - external data sources") {
    val e = intercept[AnalysisException] {
      spark.read.format("Fake external source").load()
    }
    assert(e.getMessage.contains("Multiple sources found for Fake external source"))
  }

  test("load data source from format alias") {
    assert(spark.read.format("gathering quorum").load().schema ==
      StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }

  test("specify full classname with duplicate formats") {
    assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne")
      .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))))
  }
}


class FakeSourceOne extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceTwo extends RelationProvider with DataSourceRegister {

  def shortName(): String = "Fluet da Bomb"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("integerType", IntegerType, nullable = false)))
    }
}

class FakeSourceThree extends RelationProvider with DataSourceRegister {

  def shortName(): String = "gathering quorum"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

class FakeSourceFour extends RelationProvider with DataSourceRegister {

  def shortName(): String = "datasource"

  override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = cont

      override def schema: StructType =
        StructType(Seq(StructField("longType", LongType, nullable = false)))
    }
} 
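The suite above relies on Java's ServiceLoader mechanism: Spark resolves short format names by scanning META-INF/services/org.apache.spark.sql.sources.DataSourceRegister on the classpath, which is the modification mentioned in the suite's leading comment. A hedged sketch of that registration and of a lookup, assuming the resource file sits in the usual test-resources location and `spark` is the SparkSession provided by SharedSQLContext:

// src/test/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
// lists one provider class per line, e.g.:
//   org.apache.spark.sql.sources.FakeSourceOne
//   org.apache.spark.sql.sources.FakeSourceTwo
//   org.apache.spark.sql.sources.FakeSourceThree
//   org.apache.spark.sql.sources.FakeSourceFour

// With the registration in place, an unambiguous alias resolves at read time:
val df = spark.read.format("gathering quorum").load()  // resolves to FakeSourceThree
df.printSchema()  // a single non-nullable stringType column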
Example 125
Source File: TestShowPartitions.scala    From carbondata   with Apache License 2.0 4 votes vote down vote up
package org.apache.carbondata.spark.testsuite.partition

import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
import org.apache.spark.sql.catalyst.analysis.NoSuchDatabaseException
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.spark.sql.test.util.QueryTest

import org.apache.carbondata.spark.exception.ProcessMetaDataException

class TestShowPartition extends QueryTest with BeforeAndAfterAll {
  override def beforeAll = {

    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "dd-MM-yyyy")

    sql("drop table if exists notPartitionTable")
    sql("""
                | CREATE TABLE notPartitionTable
                | (
                | vin String,
                | logdate Timestamp,
                | phonenumber Int,
                | country String,
                | area String
                | )
                | STORED AS carbondata
              """.stripMargin)

    sql(s"CREATE DATABASE if not exists partitionDB")

    sql("DROP TABLE IF EXISTS hiveTable")
    sql("""
       | create table hiveTable(id int, name string) partitioned by (city string)
       | row format delimited fields terminated by ','
       """.stripMargin)
    sql("alter table hiveTable add partition (city = 'Hangzhou')")

    sql(s"CREATE DATABASE if not exists hiveDB")
    sql("DROP TABLE IF EXISTS hiveDB.hiveTable")
    sql("""
       | create table hiveDB.hiveTable(id int, name string) partitioned by (city string)
       | row format delimited fields terminated by ','
       """.stripMargin)
    sql("alter table hiveDB.hiveTable add partition (city = 'Shanghai')")
  }

  test("show partition table: exception when show not partition table") {
    val errorMessage = intercept[AnalysisException] {
      sql("show partitions notPartitionTable").show()
    }
    assert(errorMessage.getMessage.contains(
      "SHOW PARTITIONS is not allowed on a table that is not partitioned"))
  }

  test("show partition table: hive partition table") {
    // EqualTo
    checkAnswer(sql("show partitions hiveTable"), Seq(Row("city=Hangzhou")))
    sql("use hiveDB").show()
    checkAnswer(sql("show partitions hiveTable"), Seq(Row("city=Shanghai")))
    sql("use default").show()
  }

  override def afterAll = {
    sql("use default")
    sql("drop table if exists notPartitionTable")
    sql("drop table if exists  hiveTable")
    try {
      sql("drop table if exists  hiveDB.hiveTable")
    } catch {
      case ex: NoSuchDatabaseException => print(ex.getMessage())
    }
    sql("DROP DATABASE if exists partitionDB")
    sql("DROP DATABASE if exists hiveDB")
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT,
        CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT)
  }
}
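A minimal sketch (outside the suite) of the behaviour these tests assert, assuming a SparkSession with Hive support bound to `spark`; the table names are placeholders:

import org.apache.spark.sql.AnalysisException

spark.sql("CREATE TABLE part_t (id INT) PARTITIONED BY (city STRING) STORED AS PARQUET")
spark.sql("ALTER TABLE part_t ADD PARTITION (city = 'Hangzhou')")
spark.sql("SHOW PARTITIONS part_t").show()  // city=Hangzhou

spark.sql("CREATE TABLE plain_t (id INT) STORED AS PARQUET")
try {
  spark.sql("SHOW PARTITIONS plain_t").show()
} catch {
  case e: AnalysisException =>
    // "SHOW PARTITIONS is not allowed on a table that is not partitioned ..."
    println(e.getMessage)
}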