org.apache.spark.sql.sources.RelationProvider Scala Examples
Example 1
Source File: JdbcRelationProvider.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

/**
 * Data source provider registered under the short name "jdbc".
 *
 * Builds a `JDBCRelation` for reads, and for writes stores a DataFrame into the
 * configured table according to the requested `SaveMode`.
 */
class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds the read relation from the supplied options.
   *
   * Partitioning info is only constructed when a partition column is set;
   * otherwise `null` is passed to `JDBCRelation.columnPartition`.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val column = options.partitionColumn
    val lower = options.lowerBound
    val upper = options.upperBound
    val partitionCount = options.numPartitions

    // Option(...) turns a null column into None, so the bounds are parsed only when needed.
    val partitioning = Option(column).map { c =>
      JDBCPartitioningInfo(c, lower.toLong, upper.toLong, partitionCount.toInt)
    }.orNull

    JDBCRelation(JDBCRelation.columnPartition(partitioning), options)(sqlContext.sparkSession)
  }

  /**
   * Writes `df` to the configured table, honouring `mode`, then returns the read relation.
   *
   * The JDBC connection opened here is always closed before the relation is created.
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val url = options.url
    val table = options.table
    val createTableOptions = options.createTableOptions
    val isTruncate = options.isTruncate

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      // Loads the DataFrame rows into the (already existing) table.
      def load(): Unit = saveTable(df, url, table, options)

      // Creates the table from the DataFrame schema, then loads the rows.
      def createThenLoad(): Unit = {
        createTable(df.schema, url, table, createTableOptions, conn)
        load()
      }

      if (!JdbcUtils.tableExists(conn, url, table)) {
        createThenLoad()
      } else {
        mode match {
          case SaveMode.Overwrite if isTruncate && isCascadingTruncateTable(url).contains(false) =>
            // In this case, we should truncate table and then load.
            truncateTable(conn, table)
            load()

          case SaveMode.Overwrite =>
            // Otherwise, do not truncate the table, instead drop and recreate it.
            dropTable(conn, table)
            createThenLoad()

          case SaveMode.Append =>
            load()

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore`, an existing table must be left untouched: nothing is
            // saved and the existing data is not changed. The relation is returned below.
            ()
        }
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 2
Source File: DefaultSource.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.github.traviscrawford.spark.dynamodb

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.sources.RelationProvider
import org.apache.spark.sql.sources.SchemaRelationProvider
import org.apache.spark.sql.types.StructType

/** Relation provider that builds a `DynamoDBRelation` from data source options. */
private[dynamodb] class DefaultSource
  extends RelationProvider with SchemaRelationProvider {

  /** Creates a relation without a user-supplied schema. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String])
    : BaseRelation =
    getDynamoDBRelation(sqlContext, parameters)

  /** Creates a relation using the schema supplied by the caller. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType)
    : BaseRelation =
    getDynamoDBRelation(sqlContext, parameters, Some(schema))

  /**
   * Reads the options and constructs the relation.
   *
   * @throws IllegalArgumentException if the `table` option is absent
   */
  private def getDynamoDBRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      maybeSchema: Option[StructType] = None)
    : DynamoDBRelation = {

    val table = parameters.get("table") match {
      case Some(name) => name
      case None =>
        throw new IllegalArgumentException("Required parameter 'table' was unspecified.")
    }

    // The only option parsed here; a malformed number raises NumberFormatException.
    val rateLimit = parameters.get("rate_limit_per_segment").map(_.toInt)

    DynamoDBRelation(
      tableName = table,
      maybeFilterExpression = parameters.get("filter_expression"),
      maybePageSize = parameters.get("page_size"),
      maybeRegion = parameters.get("region"),
      maybeSegments = parameters.get("segments"),
      maybeRateLimit = rateLimit,
      maybeSchema = maybeSchema,
      maybeCredentials = parameters.get("aws_credentials_provider"),
      maybeEndpoint = parameters.get("endpoint"))(sqlContext)
  }
}
Example 3
Source File: DefaultSource.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.sql

import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode }
import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider }
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.util.Logging
import com.actian.spark_vector.vector.VectorJDBC

/** Data source provider registered under the short name "vector". */
class DefaultSource extends DataSourceRegister
  with RelationProvider
  with SchemaRelationProvider
  with CreatableRelationProvider
  with Logging {

  override def shortName(): String = "vector"

  /** Creates a relation for the table referenced by `parameters`. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    VectorRelation(TableRef(parameters), sqlContext, parameters)

  /** Creates a relation for the referenced table using the supplied schema. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
    VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters)

  /**
   * Inserts `data` into the referenced table according to `mode` and returns the relation.
   *
   * `ErrorIfExists` and `Ignore` both insert only when the table is empty; `ErrorIfExists`
   * additionally throws `UnsupportedOperationException` when it is not.
   */
  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val tableRef = TableRef(parameters)
    val table = VectorRelation(tableRef, sqlContext, parameters)

    // A def, not a val: the table is probed only by the modes that need it.
    def targetIsEmpty: Boolean =
      VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }

    mode match {
      case SaveMode.Overwrite =>
        table.insert(data, true)
      case SaveMode.Append =>
        table.insert(data, false)
      case SaveMode.ErrorIfExists =>
        if (!targetIsEmpty) {
          throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.")
        }
        table.insert(data, false)
      case SaveMode.Ignore =>
        if (targetIsEmpty) {
          table.insert(data, false)
        }
    }

    table
  }
}
Example 4
Source File: HelloWorldDataSource.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources.helloworld

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{ BaseRelation, DataSourceRegister, RelationProvider, TableScan }
import org.apache.spark.sql.types.{ StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }

/**
 * Data source provider registered under the short name "helloworld".
 *
 * All instances compare equal to each other and share the same hash code.
 */
class HelloWorldDataSource extends RelationProvider with DataSourceRegister with Serializable {
  override def shortName(): String = "helloworld"

  override def hashCode(): Int = getClass.hashCode()

  override def equals(other: scala.Any): Boolean = other match {
    case _: HelloWorldDataSource => true
    case _ => false
  }

  override def toString: String = "HelloWorldDataSource"

  /**
   * Creates the relation for the `path` option.
   *
   * @throws IllegalArgumentException if no `path` option is present
   */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    parameters
      .get("path")
      .map(p => new HelloWorldRelationProvider(sqlContext, p, parameters))
      .getOrElse(throw new IllegalArgumentException("Path is required for Tickets datasets"))
}

/** Table-scan relation producing four fixed key/value rows derived from the options. */
class HelloWorldRelationProvider(val sqlContext: SQLContext, path: String, parameters: Map[String, String]) extends BaseRelation with TableScan {
  import sqlContext.implicits._

  /** Two string columns: a non-nullable `key` and a nullable `value`. */
  override def schema: StructType = {
    val keyField = StructField("key", StringType, nullable = false)
    val valueField = StructField("value", StringType, nullable = true)
    StructType(Array(keyField, valueField))
  }

  /** Builds the rows from the path and the optional `message` / `name` options. */
  override def buildScan(): RDD[Row] = {
    val message = parameters.getOrElse("message", "")
    val greeting = s"Hello ${parameters.getOrElse("name", "")}"
    val pairs = Seq(
      ("path", path),
      ("message", message),
      ("name", greeting),
      ("hello_world", "Hello World!")
    )
    pairs.toDF.rdd
  }
}
Example 5
Source File: DefaultSource.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and the `java.io.File` import were missing from this copy
// (`package import import ...`). The package is restored from the upstream project layout and
// must end in `spreadsheets` to satisfy the `private[spreadsheets]` qualifiers below — confirm.
package com.github.potix2.spark.google.spreadsheets

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Relation provider for worksheets addressed as `<spreadsheet>/<worksheet>` via the
 * `path` option.
 */
class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  /** Creates a relation without a user-supplied schema. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): SpreadsheetRelation = {
    // `null` selects the "no schema" branch of the StructType overload.
    createRelation(sqlContext, parameters, null)
  }

  /**
   * Splits the `path` option into (spreadsheet name, worksheet name).
   *
   * Fails when `path` is missing or has fewer than two `/`-separated elements;
   * any elements after the second are ignored.
   */
  private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    path.split('/') match {
      case Array(spreadsheet, worksheet, _*) => (spreadsheet, worksheet)
      case _ => throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")
    }
  }

  /** Creates a relation for the addressed worksheet; `schema` may be `null`. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): SpreadsheetRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }

  /**
   * Adds a worksheet holding the collected rows of `data` to the spreadsheet, then returns
   * a relation over it. `mode` is not consulted.
   *
   * @throws RuntimeException if the spreadsheet cannot be found
   */
  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService
      .findSpreadsheet(spreadsheetName)
      .getOrElse(throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName"))

    spreadsheet.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  /** Builds the service context from `serviceAccountId` and `credentialPath` (defaulted). */
  private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  /** Adapts a nullable schema to the `Option`-based overload. */
  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: StructType): SpreadsheetRelation =
    createRelation(sqlContext, context, spreadsheetName, worksheetName, Option(schema))

  /** Constructs the relation. */
  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
}
Example 6
Source File: DefaultSource.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

// NOTE(review): the class declaration was missing from this copy, which left an orphaned
// `override def` and an unbalanced closing brace. The header and the two-argument overload
// required by `RelationProvider` are restored from the imports and file name — confirm
// against the upstream file.
/** Relation provider that builds a `GDBRelation` from the `path` and `name` options. */
class DefaultSource extends RelationProvider with SchemaRelationProvider {

  /** Creates a relation without a user-supplied schema. */
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]
                             ): BaseRelation =
    createRelation(sqlContext, parameters, null)

  /**
   * Creates a relation from the options. `schema` is accepted but not used.
   *
   * `path` and `name` are required; `numPartitions` defaults to "8" and must parse
   * as an integer.
   */
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String],
                              schema: StructType
                             ): BaseRelation = {
    val path = parameters.getOrElse("path", sys.error("Parameter 'path' must be defined."))
    val name = parameters.getOrElse("name", sys.error("Parameter 'name' must be defined."))
    val numPartitions = parameters.getOrElse("numPartitions", "8").toInt
    GDBRelation(path, name, numPartitions)(sqlContext)
  }
}
Example 7
Source File: DefaultSource.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb

import com.stratio.datasource.mongodb.config.MongodbConfigBuilder
import com.stratio.datasource.mongodb.config.MongodbConfig._
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/** Relation provider that builds a `MongodbRelation` from data source options. */
class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {

  // Parses the raw options and builds the configuration shared by every overload.
  private def configFor(parameters: Map[String, String]) =
    MongodbConfigBuilder(parseParameters(parameters)).build()

  /** Creates a relation without a user-supplied schema. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation =
    new MongodbRelation(configFor(parameters))(sqlContext)

  /** Creates a relation using the supplied schema. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation =
    new MongodbRelation(configFor(parameters), Some(schema))(sqlContext)

  /**
   * Inserts `data` according to `mode` and returns the relation.
   *
   * `ErrorIfExists` and `Ignore` insert only into an empty collection; `ErrorIfExists`
   * throws `UnsupportedOperationException` when the collection is not empty.
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {

    val relation = new MongodbRelation(configFor(parameters), Some(data.schema))(sqlContext)

    mode match {
      case Overwrite =>
        relation.insert(data, overwrite = true)
      case Append =>
        relation.insert(data, overwrite = false)
      case ErrorIfExists =>
        if (!relation.isEmptyCollection) {
          throw new UnsupportedOperationException("Writing in a non-empty collection.")
        }
        relation.insert(data, overwrite = false)
      case Ignore =>
        if (relation.isEmptyCollection) {
          relation.insert(data, overwrite = false)
        }
    }

    relation
  }
}
Example 8
Source File: DefaultSource.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, DataSourceRegister}

/** Data source provider registered under the short name "jdbc". */
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds a `JDBCRelation` from the options.
   *
   * `url` and `dbtable` are required. When `partitionColumn` is given, `lowerBound`,
   * `upperBound` and `numPartitions` must be given too. Every option is also forwarded
   * as a connection property.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    // An absent key and a key mapped to null are both treated as "not specified".
    def optional(key: String): Option[String] = parameters.get(key).flatMap(v => Option(v))

    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val lowerBound = optional("lowerBound")
    val upperBound = optional("upperBound")
    val numPartitions = optional("numPartitions")

    // No partition column means no partitioning info (null is passed on).
    val partitionInfo = optional("partitionColumn").map { column =>
      val settings = for {
        lower <- lowerBound
        upper <- upperBound
        count <- numPartitions
      } yield (lower, upper, count)
      val (lower, upper, count) =
        settings.getOrElse(sys.error("Partitioning incompletely specified"))
      JDBCPartitioningInfo(column, lower.toLong, upper.toLong, count.toInt)
    }.orNull

    val parts = JDBCRelation.columnPartition(partitionInfo)

    // Additional properties that we will pass to getConnection
    val properties = new Properties()
    parameters.foreach { case (key, value) => properties.setProperty(key, value) }

    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 9
Source File: DefaultSource.scala From mimir with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and the `java.io.File` import were missing from this copy
// (`package import import ...`). The package is restored from the upstream project layout and
// must end in `spreadsheet` to satisfy the `private[spreadsheet]` qualifiers below — confirm.
package mimir.exec.spark.datasource.google.spreadsheet

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Relation provider for worksheets addressed as `<spreadsheet>/<worksheet>` via the
 * `path` option.
 */
class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  /** Creates a relation without a user-supplied schema. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): SpreadsheetRelation = {
    // `null` selects the "no schema" branch of the StructType overload.
    createRelation(sqlContext, parameters, null)
  }

  /**
   * Splits the `path` option into (spreadsheet name, worksheet name).
   *
   * Fails when `path` is missing or has fewer than two `/`-separated elements;
   * any elements after the second are ignored.
   */
  private[spreadsheet] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    path.split('/') match {
      case Array(spreadsheet, worksheet, _*) => (spreadsheet, worksheet)
      case _ => throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")
    }
  }

  /** Creates a relation for the addressed worksheet; `schema` may be `null`. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): SpreadsheetRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }

  /**
   * Adds a worksheet holding the collected rows of `data` to the spreadsheet, then returns
   * a relation over it. `mode` is not consulted.
   *
   * @throws RuntimeException if the spreadsheet cannot be found
   */
  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService
      .findSpreadsheet(spreadsheetName)
      .getOrElse(throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName"))

    spreadsheet.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  /** Builds the service context from `serviceAccountId` and `credentialPath` (defaulted). */
  private[spreadsheet] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  /** Adapts a nullable schema to the `Option`-based overload. */
  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                          context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                          spreadsheetName: String,
                                          worksheetName: String,
                                          schema: StructType): SpreadsheetRelation =
    createRelation(sqlContext, context, spreadsheetName, worksheetName, Option(schema))

  /** Constructs the relation. */
  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                          context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                          spreadsheetName: String,
                                          worksheetName: String,
                                          schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
}
Example 10
Source File: JdbcRelationProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

/**
 * Data source provider registered under the short name "jdbc".
 *
 * Builds a `JDBCRelation` for reads, and for writes stores a DataFrame into the
 * configured table according to the requested `SaveMode`.
 */
class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds the read relation from the supplied options.
   *
   * Asserts that the bounds are empty when no partition column is given, and that the
   * bounds and partition count are all present when one is.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    import JDBCOptions._

    val options = new JDBCOptions(parameters)
    val lowerBound = options.lowerBound
    val upperBound = options.upperBound
    val numPartitions = options.numPartitions

    val partitionInfo = options.partitionColumn match {
      case None =>
        assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " +
          s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty")
        // No partitioning requested: null is passed on to columnPartition.
        null
      case Some(column) =>
        assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty,
          s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " +
            s"'$JDBC_NUM_PARTITIONS' are also required")
        JDBCPartitioningInfo(column, lowerBound.get, upperBound.get, numPartitions.get)
    }

    JDBCRelation(JDBCRelation.columnPartition(partitionInfo), options)(sqlContext.sparkSession)
  }

  /**
   * Writes `df` to the configured table, honouring `mode`, then returns the read relation.
   *
   * The JDBC connection opened here is always closed before the relation is created.
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      // Creates the table from the DataFrame, then loads the rows using the DataFrame schema.
      def createThenLoad(): Unit = {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }

      // Loads the rows using whatever schema the existing table reports.
      def loadIntoExisting(): Unit = {
        val tableSchema = JdbcUtils.getSchemaOption(conn, options)
        saveTable(df, tableSchema, isCaseSensitive, options)
      }

      if (!JdbcUtils.tableExists(conn, options)) {
        createThenLoad()
      } else {
        mode match {
          case SaveMode.Overwrite
              if options.isTruncate && isCascadingTruncateTable(options.url).contains(false) =>
            // In this case, we should truncate table and then load.
            truncateTable(conn, options)
            loadIntoExisting()

          case SaveMode.Overwrite =>
            // Otherwise, do not truncate the table, instead drop and recreate it.
            dropTable(conn, options.table)
            createThenLoad()

          case SaveMode.Append =>
            loadIntoExisting()

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore`, an existing table must be left untouched: nothing is
            // saved and the existing data is not changed. The relation is returned below.
            ()
        }
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 11
Source File: DefaultSource.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCRelation, JDBCPartitioningInfo, DriverRegistry}
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}

/** Data source provider registered under the short name "jdbc". */
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds a `JDBCRelation` from the options.
   *
   * `url` and `dbtable` are required. A `driver` option, when present, is registered with
   * `DriverRegistry`. When `partitionColumn` is given, `lowerBound`, `upperBound` and
   * `numPartitions` must be given too. Every option is also forwarded as a connection
   * property.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    // An absent key and a key mapped to null are both treated as "not specified".
    def optional(key: String): Option[String] = parameters.get(key).flatMap(v => Option(v))

    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val lowerBound = optional("lowerBound")
    val upperBound = optional("upperBound")
    val numPartitions = optional("numPartitions")

    // Register the driver before validating the partitioning options.
    optional("driver").foreach(driver => DriverRegistry.register(driver))

    // No partition column means no partitioning info (null is passed on).
    val partitionInfo = optional("partitionColumn").map { column =>
      val settings = for {
        lower <- lowerBound
        upper <- upperBound
        count <- numPartitions
      } yield (lower, upper, count)
      val (lower, upper, count) =
        settings.getOrElse(sys.error("Partitioning incompletely specified"))
      JDBCPartitioningInfo(column, lower.toLong, upper.toLong, count.toInt)
    }.orNull

    val parts = JDBCRelation.columnPartition(partitionInfo)

    // Additional properties that we will pass to getConnection
    val properties = new Properties()
    parameters.foreach { case (key, value) => properties.setProperty(key, value) }

    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 12
Source File: DefaultSource.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, DataSourceRegister}

/** Data source provider registered under the short name "jdbc". */
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds a `JDBCRelation` from the options.
   *
   * `url` and `dbtable` are required. A `driver` option, when present, is registered with
   * `DriverRegistry`. When `partitionColumn` is given, `lowerBound`, `upperBound` and
   * `numPartitions` must be given too. Every option is also forwarded as a connection
   * property.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    // An absent key and a key mapped to null are both treated as "not specified".
    def lookup(key: String): Option[String] = parameters.get(key).flatMap(v => Option(v))

    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val lowerBound = lookup("lowerBound")
    val upperBound = lookup("upperBound")
    val numPartitions = lookup("numPartitions")

    // Register the driver before validating the partitioning options.
    lookup("driver").foreach(driver => DriverRegistry.register(driver))

    val partitionInfo = lookup("partitionColumn") match {
      case None =>
        // No partition column means no partitioning info (null is passed on).
        null
      case Some(column) =>
        (lowerBound, upperBound, numPartitions) match {
          case (Some(lower), Some(upper), Some(count)) =>
            JDBCPartitioningInfo(column, lower.toLong, upper.toLong, count.toInt)
          case _ =>
            sys.error("Partitioning incompletely specified")
        }
    }

    val parts = JDBCRelation.columnPartition(partitionInfo)

    // Additional properties that we will pass to getConnection
    val properties = new Properties()
    for ((key, value) <- parameters) properties.setProperty(key, value)

    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 13
Source File: JdbcRelationProvider.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

/**
 * Data source provider registered under the short name "jdbc".
 *
 * Reads go through a `JDBCRelation`; writes store a DataFrame into the configured table
 * according to the requested `SaveMode`.
 */
class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /**
   * Builds the read relation from the supplied options.
   *
   * Partitioning info is only constructed when a partition column is set;
   * otherwise `null` is passed to `JDBCRelation.columnPartition`.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val opts = new JDBCOptions(parameters)
    val column = opts.partitionColumn
    val lower = opts.lowerBound
    val upper = opts.upperBound
    val count = opts.numPartitions

    // Option(...) turns a null column into None, so the bounds are parsed only when needed.
    val info = Option(column)
      .map(c => JDBCPartitioningInfo(c, lower.toLong, upper.toLong, count.toInt))
      .orNull

    val parts = JDBCRelation.columnPartition(info)
    JDBCRelation(parts, opts)(sqlContext.sparkSession)
  }

  /**
   * Writes `df` to the configured table, honouring `mode`, then returns the read relation.
   *
   * The JDBC connection opened here is always closed before the relation is created.
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val opts = new JDBCOptions(parameters)
    val url = opts.url
    val table = opts.table
    val createTableOptions = opts.createTableOptions
    val isTruncate = opts.isTruncate

    val conn = JdbcUtils.createConnectionFactory(opts)()
    try {
      // Loads the DataFrame rows into the (already existing) table.
      def write(): Unit = saveTable(df, url, table, opts)

      // Creates the table from the DataFrame schema, then loads the rows.
      def createAndWrite(): Unit = {
        createTable(df.schema, url, table, createTableOptions, conn)
        write()
      }

      val exists = JdbcUtils.tableExists(conn, url, table)
      if (!exists) {
        createAndWrite()
      } else {
        mode match {
          case SaveMode.Append =>
            write()

          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url).contains(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              write()
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it.
              dropTable(conn, table)
              createAndWrite()
            }

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore`, an existing table must be left untouched: nothing is
            // saved and the existing data is not changed. The relation is returned below.
            ()
        }
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 14
Source File: DefaultSource.scala From spark-netezza with Apache License 2.0 | 5 votes |
// NOTE(review): the package name and the class declaration were missing from this copy
// (`package import java.util.Properties ...` followed by an orphaned method and a stray
// closing brace). Both are restored from the upstream project layout and the imports;
// the package name and the `shortName` value in particular need confirming.
package com.ibm.spark.netezza

import java.util.Properties

import org.apache.spark.sql.{SQLContext}
import org.apache.spark.sql.sources.{DataSourceRegister, BaseRelation, RelationProvider}

/** Relation provider that builds a `NetezzaRelation` from data source options. */
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "netezza"

  /**
   * Builds the relation from the options.
   *
   * `url` is required, together with either `dbtable` or `query` (`dbtable` wins when both
   * are present). A query is wrapped as a derived table aliased `src`. Every option is
   * also forwarded as a connection property. The connection used to compute the
   * partitions is always closed before the relation is built.
   *
   * @throws IllegalArgumentException if a query is read with more than one partition and
   *                                  no partition column
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {

    val url = parameters.getOrElse("url", sys.error("Option 'Netezza database url' not specified"))

    val (table, isQuery) = parameters.get("dbtable").map(name => (name, false))
      .orElse(parameters.get("query").map(q => (s"($q) as src", true)))
      .getOrElse(sys.error("Option 'dbtable/query' should be specified."))

    // TODO: Have to set it to the system default.
    // For a query the default is 1; when fetching from a table the default is 4. Data slices
    // can be used for partitioning when a table is specified.
    val numPartitions = parameters.getOrElse("numPartitions", if (isQuery) "1" else "4").toInt

    val partitionCol = parameters.get("partitioncol")
    val lowerBound = parameters.get("lowerbound")
    val upperBound = parameters.get("upperbound")

    // Additional properties that we will pass to getConnection
    val properties = new Properties()
    parameters.foreach { case (k, v) => properties.setProperty(k, v) }

    val conn = NetezzaJdbcUtils.getConnector(url, properties)()
    val parts = try {
      if (partitionCol.isDefined || isQuery) {
        if (isQuery && numPartitions > 1 && partitionCol.isEmpty) {
          throw new IllegalArgumentException("Partition column should be specified or" +
            " number of partitions should be set to 1 with the query option.")
        }
        val partnInfo = PartitioningInfo(partitionCol, lowerBound, upperBound, numPartitions)
        NetezzaInputFormat.getColumnPartitions(conn, table, partnInfo)
      } else {
        // Partitions based on the data slices.
        NetezzaInputFormat.getDataSlicePartition(conn, numPartitions)
      }
    } finally {
      conn.close()
    }

    NetezzaRelation(url, table, parts, properties, numPartitions)(sqlContext)
  }
}
Example 15
Source File: DefaultSource.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery

// NOTE(review): one import was missing from this copy (`import import ...`). The body uses
// `BigQueryStrings`, which no visible import provides; restored from the upstream
// project — confirm.
import com.google.cloud.hadoop.io.bigquery.BigQueryStrings
import com.samelamin.spark.bigquery.converters.SchemaConverters
import com.samelamin.spark.bigquery.streaming.{BigQuerySink, BigQuerySource}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.RelationProvider

/** Provider for BigQuery streaming sinks, streaming sources and batch relations. */
class DefaultSource extends StreamSinkProvider with StreamSourceProvider with RelationProvider {

  // Option naming the source table; shared by schema lookup and relation creation.
  private val TableReferenceSourceKey = "tableReferenceSource"

  // Fetches the table reference option, failing with a message that names the option
  // instead of the bare "None.get" the previous `.get` produced. The exception type is
  // unchanged (NoSuchElementException).
  private def requiredTableReference(options: Map[String, String]): String =
    options.getOrElse(
      TableReferenceSourceKey,
      throw new NoSuchElementException(s"Required option '$TableReferenceSourceKey' was not specified"))

  /** Creates a sink; the `transaction_log` option defaults to "transaction_log". */
  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink = {
    val path = parameters.getOrElse("transaction_log", "transaction_log")
    new BigQuerySink(sqlContext.sparkSession, path, parameters)
  }

  /**
   * Looks up the schema of the table named by `tableReferenceSource` and converts it to a
   * Spark SQL schema.
   *
   * @throws NoSuchElementException if the `tableReferenceSource` option is absent
   */
  def getConvertedSchema(sqlContext: SQLContext, options: Map[String, String]): StructType = {
    val bigqueryClient = BigQueryClient.getInstance(sqlContext)
    val tableReference = BigQueryStrings.parseTableReference(requiredTableReference(options))
    SchemaConverters.BQToSQLSchema(bigqueryClient.getTableSchema(tableReference))
  }

  /**
   * Returns the source name "bigquery" with the user schema, or the converted table schema
   * when none was given. The table schema is fetched in either case.
   */
  override def sourceSchema(sqlContext: SQLContext,
                            schema: Option[StructType],
                            providerName: String,
                            options: Map[String, String]): (String, StructType) = {
    val convertedSchema = getConvertedSchema(sqlContext, options)
    ("bigquery", schema.getOrElse(convertedSchema))
  }

  /** Creates a streaming source; `metadataPath` and `providerName` are not used. */
  override def createSource(sqlContext: SQLContext,
                            metadataPath: String,
                            schema: Option[StructType],
                            providerName: String,
                            parameters: Map[String, String]): Source = {
    new BigQuerySource(sqlContext, schema, parameters)
  }

  /**
   * Creates a batch relation over the table named by `tableReferenceSource`.
   *
   * @throws NoSuchElementException if the `tableReferenceSource` option is absent
   */
  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BigQueryRelation = {
    val tableName = requiredTableReference(parameters)
    new BigQueryRelation(tableName)(sqlContext)
  }
}
Example 16
Source File: JdbcRelationProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

/**
 * JDBC data source provider: builds relations for reading, and writes a DataFrame to a JDBC
 * table according to the requested save mode.
 */
class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  /** Short name under which this source is registered. */
  override def shortName(): String = "jdbc"

  /**
   * Builds a relation for reading from the options in `parameters`.
   *
   * Partitioning info is only built when a partition column is configured; otherwise `null` is
   * handed to `JDBCRelation.columnPartition`.
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val opts = new JDBCOptions(parameters)
    // All four partitioning options are read up front, in this order.
    val column = opts.partitionColumn
    val lower = opts.lowerBound
    val upper = opts.upperBound
    val count = opts.numPartitions
    val partitioning =
      if (column != null) JDBCPartitioningInfo(column, lower.toLong, upper.toLong, count.toInt)
      else null
    JDBCRelation(JDBCRelation.columnPartition(partitioning), opts)(sqlContext.sparkSession)
  }

  /**
   * Writes `df` to the configured table according to `mode`, then returns a relation over it.
   *
   * The connection opened for the write is always closed, including when the write throws.
   *
   * @throws AnalysisException if the table exists and `mode` is `SaveMode.ErrorIfExists`
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val opts = new JDBCOptions(parameters)
    val url = opts.url
    val table = opts.table
    val createTableOptions = opts.createTableOptions
    val isTruncate = opts.isTruncate

    val conn = JdbcUtils.createConnectionFactory(opts)()
    try {
      if (!JdbcUtils.tableExists(conn, url, table)) {
        // No table yet: create it from the DataFrame schema, then load.
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, opts)
      } else {
        mode match {
          case SaveMode.Append =>
            saveTable(df, url, table, opts)

          case SaveMode.Overwrite if isTruncate && isCascadingTruncateTable(url).contains(false) =>
            // Truncate in place, then load.
            truncateTable(conn, table)
            saveTable(df, url, table, opts)

          case SaveMode.Overwrite =>
            // Truncation was not selected: drop and recreate the table before loading.
            dropTable(conn, table)
            createTable(df.schema, url, table, createTableOptions, conn)
            saveTable(df, url, table, opts)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // The table already exists, so nothing is written and existing data is untouched;
            // the relation is still returned below.
            ()
        }
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 17
Source File: DefaultSource.scala From magellan with Apache License 2.0 | 5 votes |
package magellan

import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider, RelationProvider}

/**
 * Data source entry point that picks a relation implementation from the `type` option.
 */
class DefaultSource extends RelationProvider with SchemaRelationProvider {

  /** Builds a relation without a user-specified schema (delegates with a `null` schema). */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    createRelation(sqlContext, parameters, null)

  /**
   * Builds the relation selected by the `type` option, which defaults to `"shapefile"`.
   *
   * The `schema` argument is not consulted by this implementation.
   *
   * @throws RuntimeException if `path` is missing or `type` is not a supported value
   */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for Shapefiles."))
    parameters.getOrElse("type", "shapefile") match {
      case "shapefile" => new ShapeFileRelation(path, parameters)(sqlContext)
      case "geojson" => new GeoJSONRelation(path, parameters)(sqlContext)
      case "osm" => new OsmFileRelation(path, parameters)(sqlContext)
      // Report the offending value instead of `???`, which raised a message-less
      // NotImplementedError.
      case other =>
        sys.error(s"Unsupported 'type' option '$other'; expected one of: shapefile, geojson, osm.")
    }
  }
}
Example 18
Source File: JdbcRelationProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

/**
 * JDBC data source provider: resolves the table schema and partitions for reading, and writes a
 * DataFrame to a JDBC table according to the requested save mode.
 */
class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  /** Short name under which this source is registered. */
  override def shortName(): String = "jdbc"

  /** Builds a relation for reading: resolves the schema first, then the column partitions. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val readOptions = new JDBCOptions(parameters)
    val nameResolver = sqlContext.conf.resolver
    val sessionTimeZone = sqlContext.conf.sessionLocalTimeZone
    val tableSchema = JDBCRelation.getSchema(nameResolver, readOptions)
    val partitions =
      JDBCRelation.columnPartition(tableSchema, nameResolver, sessionTimeZone, readOptions)
    JDBCRelation(tableSchema, partitions, readOptions)(sqlContext.sparkSession)
  }

  /**
   * Writes `df` to the configured table according to `mode`, then returns a relation over it.
   *
   * The connection opened for the write is always closed, including when the write throws.
   *
   * @throws AnalysisException if the table exists and `mode` is `SaveMode.ErrorIfExists`
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val writeOptions = new JdbcOptionsInWrite(parameters)
    val caseSensitive = sqlContext.conf.caseSensitiveAnalysis
    val conn = JdbcUtils.createConnectionFactory(writeOptions)()

    // Load into a table that is kept: look up its schema option first, then save.
    def saveIntoExisting(): Unit = {
      val existingSchema = JdbcUtils.getSchemaOption(conn, writeOptions)
      saveTable(df, existingSchema, caseSensitive, writeOptions)
    }

    // Create the table from the DataFrame, then load using the DataFrame's own schema.
    def createAndSave(): Unit = {
      createTable(conn, df, writeOptions)
      saveTable(df, Some(df.schema), caseSensitive, writeOptions)
    }

    try {
      if (!JdbcUtils.tableExists(conn, writeOptions)) {
        createAndSave()
      } else {
        mode match {
          case SaveMode.Append =>
            saveIntoExisting()

          case SaveMode.Overwrite =>
            val truncateInPlace =
              writeOptions.isTruncate && isCascadingTruncateTable(writeOptions.url) == Some(false)
            if (truncateInPlace) {
              // Truncate in place, then load.
              truncateTable(conn, writeOptions)
              saveIntoExisting()
            } else {
              // Truncation was not selected: drop and recreate the table before loading.
              dropTable(conn, writeOptions.table, writeOptions)
              createAndSave()
            }

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${writeOptions.table}' already exists. " +
                s"SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // The table already exists, so nothing is written and existing data is untouched;
            // the relation is still returned below.
            ()
        }
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 19
Source File: DefaultSource.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis

import org.apache.spark.sql.SaveMode.{Append, ErrorIfExists, Ignore, Overwrite}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

/**
 * Data source entry point for Redis: creates relations for reading (with or without a
 * user-specified schema) and for writing a DataFrame under a given save mode.
 */
class DefaultSource extends RelationProvider with SchemaRelationProvider
  with CreatableRelationProvider {

  /** Builds a relation with no user-specified schema. */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    relationFor(sqlContext, parameters, None)

  /**
   * Inserts `data` into the relation according to `mode` and returns the relation.
   *
   * @throws IllegalStateException if `mode` is `ErrorIfExists` and the relation is non-empty
   */
  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    val target = relationFor(sqlContext, parameters, None)
    mode match {
      case Overwrite =>
        target.insert(data, overwrite = true)
      case Append =>
        target.insert(data, overwrite = false)
      case ErrorIfExists if target.nonEmpty =>
        throw new IllegalStateException("SaveMode is set to ErrorIfExists and dataframe " +
          "already exists in Redis and contains data.")
      case ErrorIfExists =>
        target.insert(data, overwrite = false)
      case Ignore if target.isEmpty =>
        target.insert(data, overwrite = false)
      case Ignore =>
        // The relation already holds data: nothing is written.
        ()
    }
    target
  }

  /** Builds a relation that uses the caller's `schema`. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation =
    relationFor(sqlContext, parameters, Some(schema))

  /** Single construction point for the relation shared by all three entry points. */
  private def relationFor(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: Option[StructType]): RedisSourceRelation =
    new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = schema)
}
Example 20
Source File: SelectParquetSource.scala From spark-select with Apache License 2.0 | 5 votes |
// NOTE(review): the package name was missing after `package` in the source; it is presumably
// `io.minio.spark.select` (the spark-select project layout) — confirm against the upstream file.
package io.minio.spark.select

// Java standard libraries
// NOTE(review): the import target was missing in the source; `java.io.File` is a guess and
// nothing in this class references it — confirm against the upstream file.
import java.io.File

// Spark internal libraries
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.DataSourceRegister

/**
 * Data source registered as `minioSelectParquet`; it requires a user-specified schema and a
 * `path` option.
 */
class SelectParquetSource extends SchemaRelationProvider with DataSourceRegister {

  /**
   * Returns the mandatory `path` option.
   *
   * @throws RuntimeException if `path` is not set
   */
  private def checkPath(parameters: Map[String, String]): String =
    parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data."))

  /** Short name under which this source is registered. */
  override def shortName(): String = "minioSelectParquet"

  /** Builds the Parquet select relation for `path` with the caller's `schema`. */
  override def createRelation(
      sqlContext: SQLContext,
      params: Map[String, String],
      schema: StructType): SelectParquetRelation = {
    val path = checkPath(params)
    SelectParquetRelation(Some(path), params, schema)(sqlContext)
  }
}
Example 21
Source File: SelectCSVSource.scala From spark-select with Apache License 2.0 | 5 votes |
// NOTE(review): the package name was missing after `package` in the source; it is presumably
// `io.minio.spark.select` (the spark-select project layout) — confirm against the upstream file.
package io.minio.spark.select

// Java standard libraries
// NOTE(review): the import target was missing in the source; `java.io.File` is a guess and
// nothing in this class references it — confirm against the upstream file.
import java.io.File

// Spark internal libraries
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.DataSourceRegister

/**
 * Data source registered as `minioSelectCSV`; it requires a user-specified schema and a `path`
 * option.
 */
class SelectCSVSource extends SchemaRelationProvider with DataSourceRegister {

  /**
   * Returns the mandatory `path` option.
   *
   * @throws RuntimeException if `path` is not set
   */
  private def checkPath(parameters: Map[String, String]): String =
    parameters.getOrElse("path", sys.error("'path' must be specified for CSV data."))

  /** Short name under which this source is registered. */
  override def shortName(): String = "minioSelectCSV"

  /** Builds the CSV select relation for `path` with the caller's `schema`. */
  override def createRelation(
      sqlContext: SQLContext,
      params: Map[String, String],
      schema: StructType): SelectCSVRelation = {
    val path = checkPath(params)
    SelectCSVRelation(Some(path), params, schema)(sqlContext)
  }
}
Example 22
Source File: SelectJSONSource.scala From spark-select with Apache License 2.0 | 5 votes |
// NOTE(review): the package name was missing after `package` in the source; it is presumably
// `io.minio.spark.select` (the spark-select project layout) — confirm against the upstream file.
package io.minio.spark.select

// Java standard libraries
// NOTE(review): the import target was missing in the source; `java.io.File` is a guess and
// nothing in this class references it — confirm against the upstream file.
import java.io.File

// Spark internal libraries
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.sources.DataSourceRegister

/**
 * Data source registered as `minioSelectJSON`; it requires a user-specified schema and a `path`
 * option.
 */
class SelectJSONSource extends SchemaRelationProvider with DataSourceRegister {

  /**
   * Returns the mandatory `path` option.
   *
   * @throws RuntimeException if `path` is not set
   */
  private def checkPath(parameters: Map[String, String]): String =
    parameters.getOrElse("path", sys.error("'path' must be specified for JSON data."))

  /** Short name under which this source is registered. */
  override def shortName(): String = "minioSelectJSON"

  /** Builds the JSON select relation for `path` with the caller's `schema`. */
  override def createRelation(
      sqlContext: SQLContext,
      params: Map[String, String],
      schema: StructType): SelectJSONRelation = {
    val path = checkPath(params)
    SelectJSONRelation(Some(path), params, schema)(sqlContext)
  }
}