org.apache.spark.sql.execution.datasources.DataSource Scala Examples
The following examples show how to use org.apache.spark.sql.execution.datasources.DataSource.
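To set the stage, here is a minimal sketch of the two DataSource entry points that most of the examples below exercise: constructing a DataSource and reading its providingClass (as the ResolvedDataSourceSuite examples do), and the static DataSource.lookupDataSource lookup (as StreamingIncrementCommand in Example 3 does). DataSource is an internal Spark API, so the signatures follow the Spark 2.3/2.4-era code shown on this page; the object name, package, and SparkSession settings below are illustrative assumptions, not part of any of the listed projects.

// Illustrative sketch only. DataSource is an internal Spark API, so this follows the
// Spark 2.3/2.4-era signatures used by the examples on this page.
// The package is chosen under org.apache.spark.sql so that package-private members
// such as SparkSession.sessionState are accessible, as in the test suites below.
package org.apache.spark.sql.example

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.datasources.DataSource

object DataSourceResolutionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("DataSource resolution sketch")
      .getOrCreate()

    // Resolve a short provider name to the class that implements it,
    // the same call the ResolvedDataSourceSuite examples make.
    val parquetClass: Class[_] =
      DataSource(sparkSession = spark, className = "parquet").providingClass
    println(s"parquet -> ${parquetClass.getName}")

    // The static lookup used by StreamingIncrementCommand in Example 3.
    val jsonClass = DataSource.lookupDataSource("json", spark.sessionState.conf)
    println(s"json -> ${jsonClass.getName}")

    spark.stop()
  }
}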
The project and license for each example are noted in the line above it.
Example 1
Source File: StreamingRelation.scala (from drizzle-spark, Apache License 2.0)
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName

  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 2
Source File: ResolvedDataSourceSuite.scala (from drizzle-spark, Apache License 2.0)
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 3
Source File: StreamingIncrementCommand.scala (from XSQL, Apache License 2.0)
package org.apache.spark.sql.xsql.execution.command

import java.util.Locale

import org.apache.spark.SparkException
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.execution.streaming.StreamingRelationV2
import org.apache.spark.sql.sources.v2.StreamWriteSupport
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.xsql.DataSourceManager._
import org.apache.spark.sql.xsql.StreamingSinkType

case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand {

  private var outputMode: OutputMode = OutputMode.Append

  // dummy
  override def output: Seq[AttributeReference] = Seq.empty

  // dummy
  override def producedAttributes: AttributeSet = plan.producedAttributes

  override def run(sparkSession: SparkSession): Seq[Row] = {
    import StreamingSinkType._
    val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan))
    val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema))
    plan.collectLeaves.head match {
      case StreamingRelationV2(_, _, extraOptions, _, _) =>
        val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK)
        val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv =>
          val key = kv._1.substring(STREAMING_SINK_PREFIX.length)
          (key, kv._2)
        }
        StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match {
          case CONSOLE =>
          case TEXT | PARQUET | ORC | JSON | CSV =>
            if (sinkOptions.get(STREAMING_SINK_PATH) == None) {
              throw new SparkException("Sink type is file, must config path")
            }
          case KAFKA =>
            if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) {
              throw new SparkException("Sink type is kafka, must config bootstrap servers")
            }
            if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) {
              throw new SparkException("Sink type is kafka, must config kafka topic")
            }
          case _ =>
            throw new SparkException(
              "Sink type is invalid, " +
                s"select from ${StreamingSinkType.values}")
        }
        val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf)
        val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",")
        val sink = ds.newInstance() match {
          case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) =>
            w
          case _ =>
            val ds = DataSource(
              sparkSession,
              className = source,
              options = sinkOptions.toMap,
              partitionColumns = Nil)
            ds.createSink(InternalOutputModes.Append)
        }
        val outputMode = InternalOutputModes(
          extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE))
        val duration =
          extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION)
        val trigger =
          extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match {
            case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration)
            case STREAMING_ONCE_TRIGGER => Trigger.Once()
            case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration)
          }
        val query = sparkSession.sessionState.streamingQueryManager.startQuery(
          extraOptions.get("queryName"),
          extraOptions.get(STREAMING_CHECKPOINT_LOCATION),
          df,
          sinkOptions.toMap,
          sink,
          outputMode,
          useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK,
          recoverFromCheckpointLocation = true,
          trigger = trigger)
        query.awaitTermination()
    }
    // dummy
    Seq.empty
  }
}

case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
Example 4
Source File: StreamingRelation.scala (from XSQL, Apache License 2.0)
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2}

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName

  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source, session: SparkSession): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)(session)
  }
}
Example 5
Source File: ResolvedDataSourceSuite.scala (from XSQL, Apache License 2.0)
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("avro: show deploy guide for loading the external avro module") {
    Seq("avro", "org.apache.spark.sql.avro").foreach { provider =>
      val message = intercept[AnalysisException] {
        getProvidingClass(provider)
      }.getMessage
      assert(message.contains(s"Failed to find data source: $provider"))
      assert(message.contains("Please deploy the application as per the deployment section of"))
    }
  }

  test("kafka: show deploy guide for loading the external kafka module") {
    val message = intercept[AnalysisException] {
      getProvidingClass("kafka")
    }.getMessage
    assert(message.contains("Failed to find data source: kafka"))
    assert(message.contains("Please deploy the application as per the deployment section of"))
  }

  test("error message for unknown data sources") {
    val error = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 6
Source File: CreateDataSourceTableAsSelectHarvesterSuite.scala (from spark-atlas-connector, Apache License 2.0)
package com.hortonworks.spark.atlas.sql

import com.hortonworks.spark.atlas.types.metadata

import scala.util.Random

import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, WithHiveSupport}
import com.hortonworks.spark.atlas.utils.SparkUtils
import org.apache.atlas.model.instance.AtlasEntity
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType}
import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.types.StructType
import org.scalatest.{FunSuite, Matchers}

// This is not leveraging BaseHarvesterSuite, as it doesn't need to be tested with
// both non-remote HMS and remote HMS cases.
class CreateDataSourceTableAsSelectHarvesterSuite
    extends FunSuite with Matchers with WithHiveSupport {

  private val sourceTblName = "source_" + Random.nextInt(100000)

  override protected def beforeAll(): Unit = {
    super.beforeAll()

    sparkSession.sql(s"CREATE TABLE $sourceTblName (name string, age int)")
  }

  test("saveAsTable should have output entity having table details - parquet") {
    testWithProvider("parquet")
  }

  test("saveAsTable should have output entity having table details - hive") {
    val entity = testWithProvider("hive")
    assert(entity.getAttribute("partitionProvider") == "Catalog")
  }

  def testWithProvider(provider: String): AtlasEntity = {
    val destTblName = "dest1_" + Random.nextInt(100000)
    val df = sparkSession.sql(s"SELECT * FROM $sourceTblName")

    // The codes below look after DataFrameWriter.saveAsTable codes as of Spark 2.4.
    // It uses internal APIs for this test. If the compatibility is broken, we should better
    // just remove this test.
    val tableIdent = df.sparkSession.sessionState.sqlParser.parseTableIdentifier(destTblName)
    val storage = DataSource.buildStorageFormatFromOptions(Map("path" -> "/tmp/foo"))
    val tableDesc = CatalogTable(
      identifier = tableIdent,
      tableType = CatalogTableType.EXTERNAL,
      storage = storage,
      schema = new StructType,
      provider = Some(provider),
      partitionColumnNames = Nil,
      bucketSpec = None)

    val cmd = CreateDataSourceTableAsSelectCommand(
      tableDesc,
      SaveMode.ErrorIfExists,
      df.queryExecution.logical,
      Seq("name", "age"))
    val newTable = tableDesc.copy(
      storage = tableDesc.storage.copy(),
      schema = df.schema)
    sparkSession.sessionState.catalog.createTable(
      newTable, ignoreIfExists = false, validateLocation = false)

    val qd = QueryDetail(df.queryExecution, 0L)
    val entities = CommandsHarvester.CreateDataSourceTableAsSelectHarvester.harvest(cmd, qd)
    val processDeps = entities.head.asInstanceOf[SACAtlasEntityWithDependencies].dependencies
    val maybeEntity = processDeps.find(_.typeName == metadata.TABLE_TYPE_STRING)
      .map(_.asInstanceOf[SACAtlasEntityWithDependencies].entity)

    assert(maybeEntity.isDefined, s"Output entity for table [$destTblName] was not found.")
    assert(maybeEntity.get.getAttribute("name") == destTblName)
    assert(maybeEntity.get.getAttribute("owner") == SparkUtils.currUser())
    assert(maybeEntity.get.getAttribute("schemaDesc") == "struct<name:string,age:int>")
    assert(maybeEntity.get.getAttribute("provider") == provider)

    maybeEntity.get
  }
}
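As a follow-up to Example 6: based on a reading of the Spark 2.4 sources, DataSource.buildStorageFormatFromOptions lifts the "path" writer option into the storage location and keeps the remaining options as storage properties. The short sketch below illustrates that usage; the object name and option values are illustrative only, and the printed results should be checked against the Spark version you run.

import org.apache.spark.sql.catalyst.catalog.CatalogStorageFormat
import org.apache.spark.sql.execution.datasources.DataSource

object StorageFormatSketch {
  def main(args: Array[String]): Unit = {
    // Expected behavior per the Spark 2.4 sources: "path" becomes locationUri,
    // everything else stays in properties. This is an internal API, so verify
    // against your Spark version.
    val storage: CatalogStorageFormat =
      DataSource.buildStorageFormatFromOptions(
        Map("path" -> "/tmp/foo", "compression" -> "snappy"))

    println(storage.locationUri)   // e.g. Some(/tmp/foo)
    println(storage.properties)    // e.g. Map(compression -> snappy)
  }
}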
Example 7
Source File: StreamingRelation.scala (from sparkoscope, Apache License 2.0)
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName

  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 8
Source File: ResolvedDataSourceSuite.scala (from sparkoscope, Apache License 2.0)
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 9
Source File: SqsSource.scala (from bahir, Apache License 2.0)
package org.apache.spark.sql.streaming.sqs

import java.net.URI

import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.spark.sql.execution.datasources.{DataSource, LogicalRelation}
import org.apache.spark.sql.execution.streaming._
import org.apache.spark.sql.execution.streaming.FileStreamSource._
import org.apache.spark.sql.types.StructType

class SqsSource(sparkSession: SparkSession,
                metadataPath: String,
                options: Map[String, String],
                override val schema: StructType) extends Source with Logging {

  private val sourceOptions = new SqsSourceOptions(options)

  private val hadoopConf = sparkSession.sessionState.newHadoopConf()

  private val metadataLog =
    new FileStreamSourceLog(FileStreamSourceLog.VERSION, sparkSession, metadataPath)
  private var metadataLogCurrentOffset = metadataLog.getLatest().map(_._1).getOrElse(-1L)

  private val maxFilesPerTrigger = sourceOptions.maxFilesPerTrigger

  private val maxFileAgeMs: Long = sourceOptions.maxFileAgeMs

  private val fileFormatClassName = sourceOptions.fileFormatClassName

  private val shouldSortFiles = sourceOptions.shouldSortFiles

  private val sqsClient = new SqsClient(sourceOptions, hadoopConf)

  metadataLog.allFiles().foreach { entry =>
    sqsClient.sqsFileCache.add(entry.path, MessageDescription(entry.timestamp, true, ""))
  }
  sqsClient.sqsFileCache.purge()

  logInfo(s"maxFilesPerBatch = $maxFilesPerTrigger, maxFileAgeMs = $maxFileAgeMs")

  // Note: the method header below was elided in the extracted snippet; it is
  // inferred from the fetchMaxOffset() call in getOffset further down.
  private def fetchMaxOffset(): FileStreamSourceOffset = {
    val batchFiles =
      sqsClient.sqsFileCache.getUncommittedFiles(maxFilesPerTrigger, shouldSortFiles)

    if (batchFiles.nonEmpty) {
      metadataLogCurrentOffset += 1
      metadataLog.add(metadataLogCurrentOffset, batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          FileEntry(path = path, timestamp = timestamp, batchId = metadataLogCurrentOffset)
      }.toArray)
      logInfo(s"Log offset set to $metadataLogCurrentOffset with ${batchFiles.size} new files")
      val messageReceiptHandles = batchFiles.map {
        case (path, timestamp, receiptHandle) =>
          sqsClient.sqsFileCache.markCommitted(path)
          logDebug(s"New file: $path")
          receiptHandle
      }.toList
      sqsClient.addToDeleteMessageQueue(messageReceiptHandles)
    }

    val numPurged = sqsClient.sqsFileCache.purge()

    if (!sqsClient.deleteMessageQueue.isEmpty) {
      sqsClient.deleteMessagesFromQueue()
    }

    logTrace(
      s"""
         |Number of files selected for batch = ${batchFiles.size}
         |Number of files purged from tracking map = $numPurged
       """.stripMargin)

    FileStreamSourceOffset(metadataLogCurrentOffset)
  }

  override def getOffset: Option[Offset] = Some(fetchMaxOffset()).filterNot(_.logOffset == -1)

  override def commit(end: Offset): Unit = {
    // No-op for now; SqsSource currently garbage-collects files based on timestamp
    // and the value of the maxFileAge parameter.
  }

  override def stop(): Unit = {
    if (!sqsClient.sqsScheduler.isTerminated) {
      sqsClient.sqsScheduler.shutdownNow()
    }
  }

  override def toString: String = s"SqsSource[${sqsClient.sqsUrl}]"
}
Example 10
Source File: StreamingRelation.scala (from multi-tenancy-spark, Apache License 2.0)
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(
    sourceName: String,
    output: Seq[Attribute],
    override val user: String) extends LeafExecNode {
  override def toString: String = sourceName

  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)
  }
}
Example 11
Source File: ResolvedDataSourceSuite.scala (from multi-tenancy-spark, Apache License 2.0)
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.execution.datasources.DataSource

class ResolvedDataSourceSuite extends SparkFunSuite {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(sparkSession = null, className = name).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 12
Source File: ResolvedDataSourceSuite.scala (from Spark-2.3.1, Apache License 2.0)
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}