org.apache.spark.sql.sources.BaseRelation Scala Examples
The following examples show how to use org.apache.spark.sql.sources.BaseRelation.
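Before diving into the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the contract these examples implement: a RelationProvider builds a relation from user-supplied options, and the relation exposes a schema plus, via TableScan, an RDD[Row]. All class, package, and option names here are illustrative only.

package example.datasource

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, TableScan}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}

// Relation: describes the schema and how to produce rows.
class RangeRelation(val sqlContext: SQLContext, n: Int) extends BaseRelation with TableScan {
  override def schema: StructType =
    StructType(Seq(StructField("id", IntegerType, nullable = false)))

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(0 until n).map(Row(_))
}

// Provider: Spark instantiates this when the format is used with spark.read.
class DefaultSource extends RelationProvider {
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    new RangeRelation(sqlContext, parameters.getOrElse("n", "10").toInt)
}

Spark turns the returned relation into a DataFrame, e.g. spark.read.format("example.datasource").option("n", "5").load().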
Example 1
Source File: JdbcRelationProvider.scala from drizzle-spark (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
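As a rough illustration of how such a provider is exercised (a sketch, assuming an active SparkSession `spark` and an illustrative connection URL): the read path resolves to the two-argument createRelation above, while the write path resolves to the four-argument one with the chosen SaveMode.

// Read: produces a BaseRelation and wraps it in a DataFrame.
val df = spark.read.format("jdbc")
  .option("url", "jdbc:postgresql://localhost/db")   // illustrative values
  .option("dbtable", "public.events")
  .load()

// Write: the provider decides whether to truncate, drop/recreate, append, fail, or ignore.
df.write.format("jdbc")
  .option("url", "jdbc:postgresql://localhost/db")
  .option("dbtable", "public.events_copy")
  .mode("append")
  .save()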
Example 2
Source File: console.scala from Spark-2.3.1 (Apache License 2.0)
package org.apache.spark.sql.execution.streaming

import org.apache.spark.sql._
import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister}
import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport}
import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType

case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame)
  extends BaseRelation {
  override def schema: StructType = data.schema
}

class ConsoleSinkProvider extends DataSourceV2
  with StreamWriteSupport
  with DataSourceRegister
  with CreatableRelationProvider {

  override def createStreamWriter(
      queryId: String,
      schema: StructType,
      mode: OutputMode,
      options: DataSourceOptions): StreamWriter = {
    new ConsoleWriter(schema, options)
  }

  def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    // Number of rows to display, by default 20 rows
    val numRowsToShow = parameters.get("numRows").map(_.toInt).getOrElse(20)

    // Truncate the displayed data if it is too long, by default it is true
    val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true)
    data.show(numRowsToShow, isTruncated)

    ConsoleRelation(sqlContext, data)
  }

  def shortName(): String = "console"
}
Example 3
Source File: DefaultSource.scala from spark-netezza (Apache License 2.0)
package com.ibm.spark.netezza

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}

class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "netezza"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'Netezza database url' not specified"))
    val (table, isQuery) = parameters.get("dbtable").map(table => (table, false)).orElse {
      parameters.get("query")
        .map(q => (s"($q) as src", true))
        .orElse(sys.error("Option 'dbtable/query' should be specified."))
    }.get

    // TODO: Have to set it to the system default.
    // For a query the default is 1; when fetching from a table the default is 4.
    // Data slices can be used for partitioning when a table is specified.
    val numPartitions = parameters.getOrElse("numPartitions", if (isQuery) "1" else "4").toInt

    val partitionCol = parameters.get("partitioncol")
    val lowerBound = parameters.get("lowerbound")
    val upperBound = parameters.get("upperbound")

    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach { case (k, v) => properties.setProperty(k, v) }

    val conn = NetezzaJdbcUtils.getConnector(url, properties)()
    val parts = try {
      if (partitionCol.isDefined || isQuery) {
        if (isQuery && numPartitions > 1 && !partitionCol.isDefined) {
          throw new IllegalArgumentException("Partition column should be specified or" +
            " number of partitions should be set to 1 with the query option.")
        }
        val partnInfo = PartitioningInfo(partitionCol, lowerBound, upperBound, numPartitions)
        NetezzaInputFormat.getColumnPartitions(conn, table, partnInfo)
      } else {
        // Partitions based on the data slices.
        NetezzaInputFormat.getDataSlicePartition(conn, numPartitions)
      }
    } finally {
      conn.close()
    }

    NetezzaRelation(url, table, parts, properties, numPartitions)(sqlContext)
  }
}
Example 4
Source File: HadoopFsRelation.scala from Spark-2.3.1 (Apache License 2.0)
package org.apache.spark.sql.execution.datasources

import java.util.Locale

import scala.collection.mutable

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.{StructField, StructType}

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  private def getColName(f: StructField): String = {
    if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
      f.name
    } else {
      f.name.toLowerCase(Locale.ROOT)
    }
  }

  val overlappedPartCols = mutable.Map.empty[String, StructField]
  partitionSchema.foreach { partitionField =>
    if (dataSchema.exists(getColName(_) == getColName(partitionField))) {
      overlappedPartCols += getColName(partitionField) -> partitionField
    }
  }

  // When data and partition schemas have overlapping columns, the output
  // schema respects the order of the data schema for the overlapping columns, and it
  // respects the data types of the partition schema.
  val schema: StructType = {
    StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++
      partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f))))
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sqlContext.conf.fileCompressionFactor
    (location.sizeInBytes * compressionFactor).toLong
  }

  override def inputFiles: Array[String] = location.inputFiles
}
Example 5
Source File: JdbcRelationProvider.scala from Spark-2.3.1 (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    import JDBCOptions._

    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn.isEmpty) {
      assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " +
        s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty")
      null
    } else {
      assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty,
        s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " +
          s"'$JDBC_NUM_PARTITIONS' are also required")
      JDBCPartitioningInfo(
        partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JDBCOptions(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 6
Source File: LogicalRelation.scala from Spark-2.3.1 (Apache License 2.0)
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

case class LogicalRelation(
    relation: BaseRelation,
    output: Seq[AttributeReference],
    catalogTable: Option[CatalogTable],
    override val isStreaming: Boolean)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 7
Source File: DatasetRelation.scala from spark-sftp (Apache License 2.0)
package com.springml.spark.sftp

import com.databricks.spark.avro._
import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType

case class DatasetRelation(
    fileLocation: String,
    fileType: String,
    inferSchema: String,
    header: String,
    delimiter: String,
    quote: String,
    escape: String,
    multiLine: String,
    rowTag: String,
    customSchema: StructType,
    sqlContext: SQLContext) extends BaseRelation with TableScan {

  private val logger = Logger.getLogger(classOf[DatasetRelation])

  val df = read()

  private def read(): DataFrame = {
    var dataframeReader = sqlContext.read
    if (customSchema != null) {
      dataframeReader = dataframeReader.schema(customSchema)
    }

    var df: DataFrame = null

    df = fileType match {
      case "avro" => dataframeReader.avro(fileLocation)
      case "txt" => dataframeReader.format("text").load(fileLocation)
      case "xml" => dataframeReader.format(constants.xmlClass)
        .option(constants.xmlRowTag, rowTag)
        .load(fileLocation)
      case "csv" => dataframeReader.
        option("header", header).
        option("delimiter", delimiter).
        option("quote", quote).
        option("escape", escape).
        option("multiLine", multiLine).
        option("inferSchema", inferSchema).
        csv(fileLocation)
      case _ => dataframeReader.format(fileType).load(fileLocation)
    }

    df
  }

  override def schema: StructType = {
    df.schema
  }

  override def buildScan(): RDD[Row] = {
    df.rdd
  }
}
Example 8
Source File: DefaultSource.scala from spark1.52 (Apache License 2.0)
package org.apache.spark.sql.execution.datasources

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.datasources.jdbc.{JDBCRelation, JDBCPartitioningInfo, DriverRegistry}
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}

class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val driver = parameters.getOrElse("driver", null)
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val partitionColumn = parameters.getOrElse("partitionColumn", null)
    val lowerBound = parameters.getOrElse("lowerBound", null)
    val upperBound = parameters.getOrElse("upperBound", null)
    val numPartitions = parameters.getOrElse("numPartitions", null)

    if (driver != null) DriverRegistry.register(driver)

    if (partitionColumn != null
      && (lowerBound == null || upperBound == null || numPartitions == null)) {
      sys.error("Partitioning incompletely specified")
    }

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn,
        lowerBound.toLong,
        upperBound.toLong,
        numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach(kv => properties.setProperty(kv._1, kv._2))
    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 9
Source File: DefaultSource.scala from spark1.52 (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, DataSourceRegister}

class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val driver = parameters.getOrElse("driver", null)
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val partitionColumn = parameters.getOrElse("partitionColumn", null)
    val lowerBound = parameters.getOrElse("lowerBound", null)
    val upperBound = parameters.getOrElse("upperBound", null)
    val numPartitions = parameters.getOrElse("numPartitions", null)

    if (driver != null) DriverRegistry.register(driver)

    if (partitionColumn != null
      && (lowerBound == null || upperBound == null || numPartitions == null)) {
      sys.error("Partitioning incompletely specified")
    }

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn,
        lowerBound.toLong,
        upperBound.toLong,
        numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach(kv => properties.setProperty(kv._1, kv._2))
    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 10
Source File: AddSourceToAttributes.scala from jgit-spark-connector (Apache License 2.0)
package tech.sourced.engine.rule

import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.MetadataBuilder
import tech.sourced.engine.{GitRelation, MetadataRelation, Sources}
import tech.sourced.engine.compat

object AddSourceToAttributes extends Rule[LogicalPlan] {

  def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
    case compat.LogicalRelation(rel @ GitRelation(_, _, _, schemaSource), out, catalogTable) =>
      withMetadata(rel, schemaSource, out, catalogTable)

    case compat.LogicalRelation(
        rel @ MetadataRelation(_, _, _, _, schemaSource), out, catalogTable) =>
      withMetadata(rel, schemaSource, out, catalogTable)
  }

  private def withMetadata(relation: BaseRelation,
                           schemaSource: Option[String],
                           out: Seq[AttributeReference],
                           catalogTable: Option[CatalogTable]): LogicalRelation = {
    val processedOut = schemaSource match {
      case Some(table) => out.map(
        _.withMetadata(new MetadataBuilder().putString(SOURCE, table).build()
        ).asInstanceOf[AttributeReference]
      )
      case None => out
    }

    compat.LogicalRelation(relation, processedOut, catalogTable)
  }
}
Example 11
Source File: compat.scala from jgit-spark-connector (Apache License 2.0)
package tech.sourced.engine.compat

import org.apache.spark.SPARK_VERSION
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.execution.datasources.{LogicalRelation => SparkLogicalRelation}
import org.apache.spark.sql.sources.BaseRelation

import scala.reflect.runtime.{universe => ru}

private[compat] object Compat {

  def apply[T](s22: T, s23: T): T = SPARK_VERSION match {
    case s if s.startsWith("2.2.") => s22
    case s if s.startsWith("2.3.") => s23
    case _ =>
      throw new RuntimeException(s"Unsupported SPARK_VERSION: $SPARK_VERSION")
  }

  lazy val ClassMirror = ru.runtimeMirror(Compat.getClass.getClassLoader)
}

private[engine] object LogicalRelation {

  def apply(rel: BaseRelation,
            out: Seq[AttributeReference],
            catalog: Option[CatalogTable]): SparkLogicalRelation =
    applyImpl(rel, out, catalog)

  private lazy val applyImpl =
    Compat(applySpark22(_, _, _), applySpark23(_, _, _))

  private lazy val typ = ru.typeOf[SparkLogicalRelation]
  private lazy val classSymbol =
    Compat.ClassMirror.reflectClass(typ.typeSymbol.asClass)
  private lazy val ctor =
    classSymbol.reflectConstructor(typ.decl(ru.termNames.CONSTRUCTOR).asMethod)

  def applySpark22(rel: BaseRelation,
                   out: Seq[AttributeReference],
                   catalog: Option[CatalogTable]): SparkLogicalRelation =
    ctor(rel, out, catalog).asInstanceOf[SparkLogicalRelation]

  def applySpark23(rel: BaseRelation,
                   out: Seq[AttributeReference],
                   catalog: Option[CatalogTable]): SparkLogicalRelation =
    ctor(rel, out, catalog, false).asInstanceOf[SparkLogicalRelation]

  def unapply(arg: SparkLogicalRelation)
    : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] =
    unapplyImpl(arg)

  private lazy val unapplyImpl = Compat(unapplySpark22(_), unapplySpark23(_))

  def unapplySpark22(arg: SparkLogicalRelation)
    : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] =
    Some((arg.relation, arg.output, arg.catalogTable))

  def unapplySpark23(arg: SparkLogicalRelation)
    : Option[(BaseRelation, Seq[AttributeReference], Option[CatalogTable])] = {
    val isStreaming = Compat.ClassMirror
      .reflect(arg)
      .reflectField(typ.decl(ru.TermName("isStreaming")).asTerm)
      .get
      .asInstanceOf[Boolean]

    if (isStreaming) {
      None
    } else {
      Some((arg.relation, arg.output, arg.catalogTable))
    }
  }
}
Example 12
Source File: DefaultSource.scala from spark-dynamodb (Apache License 2.0)
package com.github.traviscrawford.spark.dynamodb

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.sources.RelationProvider
import org.apache.spark.sql.sources.SchemaRelationProvider
import org.apache.spark.sql.types.StructType

private[dynamodb] class DefaultSource
  extends RelationProvider with SchemaRelationProvider {

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String])
    : BaseRelation = getDynamoDBRelation(sqlContext, parameters)

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType)
    : BaseRelation = getDynamoDBRelation(sqlContext, parameters, Some(schema))

  private def getDynamoDBRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      maybeSchema: Option[StructType] = None)
    : DynamoDBRelation = {

    val tableName = parameters.getOrElse("table",
      throw new IllegalArgumentException("Required parameter 'table' was unspecified.")
    )

    DynamoDBRelation(
      tableName = tableName,
      maybeFilterExpression = parameters.get("filter_expression"),
      maybePageSize = parameters.get("page_size"),
      maybeRegion = parameters.get("region"),
      maybeSegments = parameters.get("segments"),
      maybeRateLimit = parameters.get("rate_limit_per_segment").map(Integer.parseInt),
      maybeSchema = maybeSchema,
      maybeCredentials = parameters.get("aws_credentials_provider"),
      maybeEndpoint = parameters.get("endpoint"))(sqlContext)
  }
}
Example 13
Source File: HadoopFsRelation.scala from multi-tenancy-spark (Apache License 2.0)
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.StructType

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  val schema: StructType = {
    val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
    StructType(dataSchema ++ partitionSchema.filterNot { column =>
      dataSchemaColumnNames.contains(column.name.toLowerCase)
    })
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = location.sizeInBytes

  override def inputFiles: Array[String] = location.inputFiles
}
Example 14
Source File: JdbcRelationProvider.scala from multi-tenancy-spark (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 15
Source File: LogicalRelation.scala from multi-tenancy-spark (Apache License 2.0)
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

case class LogicalRelation(
    relation: BaseRelation,
    expectedOutputAttributes: Option[Seq[Attribute]] = None,
    catalogTable: Option[CatalogTable] = None)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 16
Source File: DruidRelation.scala from spark-druid-olap (Apache License 2.0)
package org.sparklinedata.druid

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId}
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.sql.{Row, SQLContext}
import org.joda.time.Interval
import org.sparklinedata.druid.metadata.DruidRelationInfo

case class DruidOperatorAttribute(exprId: ExprId, name: String, dataType: DataType,
                                  tf: String = null)

case class DruidRelation(info: DruidRelationInfo,
                         dQuery: Option[DruidQuery])(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan {

  override val needConversion: Boolean = false

  override def schema: StructType =
    dQuery.map(_.schema(info)).getOrElse(info.sourceDF(sqlContext).schema)

  def buildInternalScan: RDD[InternalRow] =
    dQuery.map(new DruidRDD(sqlContext, info, _)).getOrElse(
      info.sourceDF(sqlContext).queryExecution.toRdd
    )

  override def buildScan(): RDD[Row] =
    buildInternalScan.asInstanceOf[RDD[Row]]

  override def toString: String = {
    if (dQuery.isDefined) {
      s"DruidQuery(${System.identityHashCode(dQuery)}): ${Utils.queryToString(dQuery.get)}"
    } else {
      info.toString
    }
  }
}
Example 17
Source File: DefaultSource.scala from spark-vector (Apache License 2.0)
package com.actian.spark_vector.sql

import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode }
import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider }
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.util.Logging
import com.actian.spark_vector.vector.VectorJDBC

class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging {

  override def shortName(): String = "vector"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    VectorRelation(TableRef(parameters), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
    VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val tableRef = TableRef(parameters)
    val table = VectorRelation(tableRef, sqlContext, parameters)

    mode match {
      case SaveMode.Overwrite =>
        table.insert(data, true)
      case SaveMode.ErrorIfExists =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        } else {
          throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.")
        }
      case SaveMode.Append =>
        table.insert(data, false)
      case SaveMode.Ignore =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        }
    }

    table
  }
}
Example 18
Source File: SpreadsheetRelation.scala from mimir (Apache License 2.0)
package mimir.exec.spark.datasource.google.spreadsheet

import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService.SparkSpreadsheetContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
    context: SparkSpreadsheetContext,
    spreadsheetName: String,
    worksheetName: String,
    userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import mimir.exec.spark.datasource.google.spreadsheet.SparkSpreadsheetService._

  private val fieldMap = scala.collection.mutable.Map[String, String]()

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheet] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    val schemaMap = fieldMap.toMap
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while (index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else if (schemaMap.contains(field.name) && m.contains(schemaMap(field.name))) {
            TypeCast.castTo(m(schemaMap(field.name)), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) => w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) => throw e
    }
  }

  def sanitizeColumnName(name: String): String = {
    name
      .replaceAll("[^a-zA-Z0-9]+", "_") // Replace sequences of non-alphanumeric characters with underscores
      .replaceAll("_+$", "")            // Strip trailing underscores
      .replaceAll("^[0-9_]+", "")       // Strip leading underscores and digits
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName =>
      val sanitizedName = sanitizeColumnName(fieldName)
      fieldMap.put(sanitizedName, fieldName)
      StructField(sanitizedName, StringType, true)
    })
}
Example 19
Source File: DefaultSource.scala from mimir (Apache License 2.0)
package mimir.exec.spark.datasource.google.spreadsheet

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
    createRelation(sqlContext, parameters, null)
  }

  private[spreadsheet] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    val elems = path.split('/')
    if (elems.length < 2)
      throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")

    (elems(0), elems(1))
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName)
    if (!spreadsheet.isDefined)
      throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName")

    spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  private[spreadsheet] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                          context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                          spreadsheetName: String,
                                          worksheetName: String,
                                          schema: StructType): SpreadsheetRelation =
    if (schema == null) {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, None)
    } else {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema))
    }

  private[spreadsheet] def createRelation(sqlContext: SQLContext,
                                          context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                          spreadsheetName: String,
                                          worksheetName: String,
                                          schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
}
Example 20
Source File: DefaultSource.scala from BigDatalog (Apache License 2.0)
package org.apache.spark.sql.execution.datasources.jdbc

import java.util.Properties

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, DataSourceRegister}

class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val partitionColumn = parameters.getOrElse("partitionColumn", null)
    val lowerBound = parameters.getOrElse("lowerBound", null)
    val upperBound = parameters.getOrElse("upperBound", null)
    val numPartitions = parameters.getOrElse("numPartitions", null)

    if (partitionColumn != null
      && (lowerBound == null || upperBound == null || numPartitions == null)) {
      sys.error("Partitioning incompletely specified")
    }

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn,
        lowerBound.toLong,
        upperBound.toLong,
        numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach(kv => properties.setProperty(kv._1, kv._2))
    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
Example 21
Source File: ExistingRDD.scala from BigDatalog (Apache License 2.0)
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericMutableRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }

        mutableRow
      }
    }
  }
}

private[sql] case class PhysicalRDD(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String,
    override val metadata: Map[String, String] = Map.empty,
    override val outputsUnsafeRows: Boolean = false)
  extends LeafNode {

  protected override def doExecute(): RDD[InternalRow] = rdd

  override def simpleString: String = {
    val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value"
    s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}"
  }
}

private[sql] object PhysicalRDD {
  // Metadata keys
  val INPUT_PATHS = "InputPaths"
  val PUSHED_FILTERS = "PushedFilters"

  def createFromDataSource(
      output: Seq[Attribute],
      rdd: RDD[InternalRow],
      relation: BaseRelation,
      metadata: Map[String, String] = Map.empty): PhysicalRDD = {
    // All HadoopFsRelations output UnsafeRows
    val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation]
    PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows)
  }
}
Example 22
Source File: DefaultSource.scala from Spark-MongoDB (Apache License 2.0)
package com.stratio.datasource.mongodb

import com.stratio.datasource.mongodb.config.MongodbConfigBuilder
import com.stratio.datasource.mongodb.config.MongodbConfig._
import org.apache.spark.sql.SaveMode._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build())(sqlContext)
  }

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation = {
    new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build(), Some(schema))(sqlContext)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {

    val mongodbRelation = new MongodbRelation(
      MongodbConfigBuilder(parseParameters(parameters)).build(), Some(data.schema))(sqlContext)

    mode match {
      case Append => mongodbRelation.insert(data, overwrite = false)
      case Overwrite => mongodbRelation.insert(data, overwrite = true)
      case ErrorIfExists =>
        if (mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false)
        else throw new UnsupportedOperationException("Writing in a non-empty collection.")
      case Ignore =>
        if (mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false)
    }

    mongodbRelation
  }
}
Example 23
Source File: BEDRelation.scala from bdg-sequila (Apache License 2.0)
package org.biodatageeks.sequila.datasources.BED

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoders, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, PrunedFilteredScan}
import org.biodatageeks.sequila.utils.{Columns, DataQualityFuncs}

class BEDRelation(path: String)(@transient val sqlContext: SQLContext)
  extends BaseRelation
    with PrunedFilteredScan
    with Serializable {

  @transient val logger = Logger.getLogger(this.getClass.getCanonicalName)

  override def schema: org.apache.spark.sql.types.StructType =
    Encoders.product[org.biodatageeks.formats.BrowserExtensibleData].schema

  private def getValueFromColumn(colName: String, r: Array[String]): Any = {
    colName match {
      case Columns.CONTIG       => DataQualityFuncs.cleanContig(r(0))
      case Columns.START        => r(1).toInt + 1 // Convert interval to 1-based
      case Columns.END          => r(2).toInt
      case Columns.NAME         => if (r.length > 3) Some(r(3)) else None
      case Columns.SCORE        => if (r.length > 4) Some(r(4).toInt) else None
      case Columns.STRAND       => if (r.length > 5) Some(r(5)) else None
      case Columns.THICK_START  => if (r.length > 6) Some(r(6).toInt) else None
      case Columns.THICK_END    => if (r.length > 7) Some(r(7).toInt) else None
      case Columns.ITEM_RGB     => if (r.length > 8) Some(r(8).split(",").map(_.toInt)) else None
      case Columns.BLOCK_COUNT  => if (r.length > 9) Some(r(9).toInt) else None
      case Columns.BLOCK_SIZES  => if (r.length > 10) Some(r(10).split(",").map(_.toInt)) else None
      case Columns.BLOCK_STARTS => if (r.length > 11) Some(r(11).split(",").map(_.toInt)) else None
      case _ => throw new Exception(s"Unknown column found: ${colName}")
    }
  }

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    sqlContext
      .sparkContext
      .textFile(path)
      .filter(!_.toLowerCase.startsWith("track"))
      .filter(!_.toLowerCase.startsWith("browser"))
      .map(_.split("\t"))
      .map(r => {
        val record = new Array[Any](requiredColumns.length)
        for (i <- 0 to requiredColumns.length - 1) {
          record(i) = getValueFromColumn(requiredColumns(i), r)
        }
        Row.fromSeq(record)
      })
  }
}
Example 24
Source File: PulsarRelation.scala from pulsar-spark (Apache License 2.0)
package org.apache.spark.sql.pulsar

import java.{util => ju}

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.catalyst.json.JSONOptionsInRead
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType

private[pulsar] class PulsarRelation(
    override val sqlContext: SQLContext,
    override val schema: StructType,
    schemaInfo: SchemaInfoSerializable,
    adminUrl: String,
    clientConf: ju.Map[String, Object],
    readerConf: ju.Map[String, Object],
    startingOffset: SpecificPulsarOffset,
    endingOffset: SpecificPulsarOffset,
    pollTimeoutMs: Int,
    failOnDataLoss: Boolean,
    subscriptionNamePrefix: String,
    jsonOptions: JSONOptionsInRead)
  extends BaseRelation
    with TableScan
    with Logging {

  import PulsarSourceUtils._

  val reportDataLoss = reportDataLossFunc(failOnDataLoss)

  override def buildScan(): RDD[Row] = {
    val fromTopicOffsets = startingOffset.topicOffsets
    val endTopicOffsets = endingOffset.topicOffsets

    if (fromTopicOffsets.keySet != endTopicOffsets.keySet) {
      val fromTopics = fromTopicOffsets.keySet.toList.sorted.mkString(",")
      val endTopics = endTopicOffsets.keySet.toList.sorted.mkString(",")
      throw new IllegalStateException(
        "different topics " +
          s"for starting offsets topics[${fromTopics}] and " +
          s"ending offsets topics[${endTopics}]")
    }

    val offsetRanges = endTopicOffsets.keySet
      .map { tp =>
        val fromOffset = fromTopicOffsets.getOrElse(tp, {
          // this shouldn't happen since we had checked it
          throw new IllegalStateException(s"$tp doesn't have a from offset")
        })
        val untilOffset = endTopicOffsets(tp)
        PulsarOffsetRange(tp, fromOffset, untilOffset, None)
      }
      .filter { range =>
        if (range.untilOffset.compareTo(range.fromOffset) < 0) {
          reportDataLoss(
            s"${range.topic}'s offset was changed " +
              s"from ${range.fromOffset} to ${range.untilOffset}, " +
              "some data might has been missed")
          false
        } else {
          true
        }
      }
      .toSeq

    val rdd = new PulsarSourceRDD4Batch(
      sqlContext.sparkContext,
      schemaInfo,
      adminUrl,
      clientConf,
      readerConf,
      offsetRanges,
      pollTimeoutMs,
      failOnDataLoss,
      subscriptionNamePrefix,
      jsonOptions
    )
    sqlContext.internalCreateDataFrame(rdd.setName("pulsar"), schema).rdd
  }
}
Example 25
Source File: HiveAcidRelation.scala from spark-acid (Apache License 2.0)
package com.qubole.spark.hiveacid.datasource

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession}
import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan}
import org.apache.spark.sql.types._
import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf}
import com.qubole.spark.hiveacid.hive.HiveAcidMetadata
import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert}
import org.apache.spark.sql.catalyst.AliasIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan

import collection.JavaConversions._

case class HiveAcidRelation(sparkSession: SparkSession,
                            fullyQualifiedTableName: String,
                            parameters: Map[String, String])
  extends BaseRelation
    with InsertableRelation
    with PrunedFilteredScan
    with Logging {

  private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession(
    sparkSession,
    fullyQualifiedTableName
  )
  private val hiveAcidTable: HiveAcidTable =
    new HiveAcidTable(sparkSession, hiveAcidMetadata, parameters)

  private val readOptions = SparkAcidConf(sparkSession, parameters)

  override def sqlContext: SQLContext = sparkSession.sqlContext

  override val schema: StructType = if (readOptions.includeRowIds) {
    hiveAcidMetadata.tableSchemaWithRowId
  } else {
    hiveAcidMetadata.tableSchema
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    // sql insert into and overwrite
    if (overwrite) {
      hiveAcidTable.insertOverwrite(data)
    } else {
      hiveAcidTable.insertInto(data)
    }
  }

  def update(condition: Option[Column], newValues: Map[String, Column]): Unit = {
    hiveAcidTable.update(condition, newValues)
  }

  def delete(condition: Column): Unit = {
    hiveAcidTable.delete(condition)
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor
    (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong
  }

  def merge(sourceDf: DataFrame,
            mergeExpression: Expression,
            matchedClause: Seq[MergeWhenClause],
            notMatched: Option[MergeWhenNotInsert],
            sourceAlias: Option[AliasIdentifier],
            targetAlias: Option[AliasIdentifier]): Unit = {
    hiveAcidTable.merge(sourceDf, mergeExpression,
      matchedClause, notMatched, sourceAlias, targetAlias)
  }

  def getHiveAcidTable(): HiveAcidTable = {
    hiveAcidTable
  }

  // FIXME: should it be true / false. Recommendation seems to
  //  be to leave it as true
  override val needConversion: Boolean = false

  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    val readOptions = SparkAcidConf(sparkSession, parameters)
    // sql "select *"
    hiveAcidTable.getRdd(requiredColumns, filters, readOptions)
  }
}
Example 26
Source File: DefaultSource.scala from spark-gdb (Apache License 2.0)
package com.esri.gdb

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

class DefaultSource extends RelationProvider with SchemaRelationProvider {

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String],
                              schema: StructType
                             ): BaseRelation = {
    val path = parameters.getOrElse("path", sys.error("Parameter 'path' must be defined."))
    val name = parameters.getOrElse("name", sys.error("Parameter 'name' must be defined."))
    val numPartitions = parameters.getOrElse("numPartitions", "8").toInt
    GDBRelation(path, name, numPartitions)(sqlContext)
  }
}
Example 27
Source File: GDBRelation.scala from spark-gdb (Apache License 2.0)
package com.esri.gdb

import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}

case class GDBRelation(gdbPath: String, gdbName: String, numPartition: Int)
                      (@transient val sqlContext: SQLContext)
  extends BaseRelation with Logging with TableScan {

  override val schema = inferSchema()

  private def inferSchema() = {
    val sc = sqlContext.sparkContext
    GDBTable.findTable(gdbPath, gdbName, sc.hadoopConfiguration) match {
      case Some(catTab) => {
        val table = GDBTable(gdbPath, catTab.hexName, sc.hadoopConfiguration)
        try {
          table.schema()
        } finally {
          table.close()
        }
      }
      case _ => {
        log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty schema !")
        StructType(Seq.empty[StructField])
      }
    }
  }

  override def buildScan(): RDD[Row] = {
    GDBRDD(sqlContext.sparkContext, gdbPath, gdbName, numPartition)
  }
}
Example 28
Source File: RedshiftReaderM.scala from SqlShift (MIT License)
package com.databricks.spark.redshift

import com.amazonaws.auth.AWSCredentials
import com.amazonaws.services.s3.AmazonS3Client
import org.apache.spark.SparkContext
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.{DataFrame, SQLContext}

object RedshiftReaderM {

  val endpoint = "s3.ap-south-1.amazonaws.com"

  def getS3Client(provider: AWSCredentials): AmazonS3Client = {
    val client = new AmazonS3Client(provider)
    client.setEndpoint(endpoint)
    client
  }

  def getDataFrameForConfig(configs: Map[String, String],
                            sparkContext: SparkContext,
                            sqlContext: SQLContext): DataFrame = {
    val source: DefaultSource = new DefaultSource(new JDBCWrapper(), getS3Client)
    val br: BaseRelation = source.createRelation(sqlContext, configs)
    sqlContext.baseRelationToDataFrame(br)
  }
}
Example 29
Source File: SpreadsheetRelation.scala from spark-google-spreadsheets (Apache License 2.0)
package com.github.potix2.spark.google.spreadsheets

import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext
import com.github.potix2.spark.google.spreadsheets.util._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

case class SpreadsheetRelation protected[spark] (
    context: SparkSpreadsheetContext,
    spreadsheetName: String,
    worksheetName: String,
    userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext)
  extends BaseRelation with TableScan with InsertableRelation {

  import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._

  override def schema: StructType = userSchema.getOrElse(inferSchema())

  private lazy val aWorksheet: SparkWorksheet =
    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(aWorksheet) => aWorksheet
      case Left(e) => throw e
    }

  private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows

  private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] =
    for {
      sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right
      worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right
    } yield worksheet

  override def buildScan(): RDD[Row] = {
    val aSchema = schema
    sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter =>
      iter.map { m =>
        var index = 0
        val rowArray = new Array[Any](aSchema.fields.length)
        while (index < aSchema.fields.length) {
          val field = aSchema.fields(index)
          rowArray(index) = if (m.contains(field.name)) {
            TypeCast.castTo(m(field.name), field.dataType, field.nullable)
          } else {
            null
          }
          index += 1
        }
        Row.fromSeq(rowArray)
      }
    }
  }

  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (!overwrite) {
      sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.")
    }

    findWorksheet(spreadsheetName, worksheetName)(context) match {
      case Right(w) => w.updateCells(data.schema, data.collect().toList, Util.toRowData)
      case Left(e) => throw e
    }
  }

  private def inferSchema(): StructType =
    StructType(aWorksheet.headers.toList.map { fieldName =>
      StructField(fieldName, StringType, nullable = true)
    })
}
Example 30
Source File: DefaultSource.scala from spark-google-spreadsheets (Apache License 2.0)
package com.github.potix2.spark.google.spreadsheets

import java.io.File

import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
  final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = {
    createRelation(sqlContext, parameters, null)
  }

  private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = {
    val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets."))
    val elems = path.split('/')
    if (elems.length < 2)
      throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'")

    (elems(0), elems(1))
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    val context = createSpreadsheetContext(parameters)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, schema)
  }

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val (spreadsheetName, worksheetName) = pathToSheetNames(parameters)
    implicit val context = createSpreadsheetContext(parameters)
    val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName)
    if (!spreadsheet.isDefined)
      throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName")

    spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData)
    createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema)
  }

  private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = {
    val serviceAccountIdOption = parameters.get("serviceAccountId")
    val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH)
    SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath))
  }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: StructType): SpreadsheetRelation =
    if (schema == null) {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, None)
    } else {
      createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema))
    }

  private[spreadsheets] def createRelation(sqlContext: SQLContext,
                                           context: SparkSpreadsheetService.SparkSpreadsheetContext,
                                           spreadsheetName: String,
                                           worksheetName: String,
                                           schema: Option[StructType]): SpreadsheetRelation =
    SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext)
}
Example 31
Source File: HelloWorldDataSource.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources.helloworld import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{ BaseRelation, DataSourceRegister, RelationProvider, TableScan } import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ Row, SQLContext } class HelloWorldDataSource extends RelationProvider with DataSourceRegister with Serializable { override def shortName(): String = "helloworld" override def hashCode(): Int = getClass.hashCode() override def equals(other: scala.Any): Boolean = other.isInstanceOf[HelloWorldDataSource] override def toString: String = "HelloWorldDataSource" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val path = parameters.get("path") path match { case Some(p) => new HelloWorldRelationProvider(sqlContext, p, parameters) case _ => throw new IllegalArgumentException("Path is required for Tickets datasets") } } } class HelloWorldRelationProvider(val sqlContext: SQLContext, path: String, parameters: Map[String, String]) extends BaseRelation with TableScan { import sqlContext.implicits._ override def schema: StructType = StructType(Array( StructField("key", StringType, nullable = false), StructField("value", StringType, nullable = true) )) override def buildScan(): RDD[Row] = Seq( "path" -> path, "message" -> parameters.getOrElse("message", ""), "name" -> s"Hello ${parameters.getOrElse("name", "")}", "hello_world" -> "Hello World!" ).toDF.rdd }
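A usage sketch for the relation above, assuming a SparkSession named spark and that the data source is registered (otherwise the fully qualified class name can be used as the format). The "message" and "name" options and the path end up as rows of the (key, value) schema defined in HelloWorldRelationProvider.

val df = spark.read
  .format("helloworld")
  .option("message", "testing the data source")
  .option("name", "Spark")
  .load("/tmp/some-path")   // becomes the "path" row

df.show(20, false)
// rows: path, message, name ("Hello Spark"), hello_world ("Hello World!")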
Example 32
Source File: TensorflowRelation.scala From ecosystem with Apache License 2.0 | 5 votes |
package org.tensorflow.spark.datasources.tfrecords import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext, SparkSession} import org.tensorflow.example.{SequenceExample, Example} import org.tensorflow.hadoop.io.TFRecordFileInputFormat import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder case class TensorflowRelation(options: Map[String, String], customSchema: Option[StructType]=None) (@transient val session: SparkSession) extends BaseRelation with TableScan { //Import TFRecords as DataFrame happens here lazy val (tfRdd, tfSchema) = { val rdd = session.sparkContext.newAPIHadoopFile(options("path"), classOf[TFRecordFileInputFormat], classOf[BytesWritable], classOf[NullWritable]) val recordType = options.getOrElse("recordType", "Example") recordType match { case "Example" => val exampleRdd = rdd.map{case (bytesWritable, nullWritable) => Example.parseFrom(bytesWritable.getBytes) } val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd)) val rowRdd = exampleRdd.map(example => DefaultTfRecordRowDecoder.decodeExample(example, finalSchema)) (rowRdd, finalSchema) case "SequenceExample" => val sequenceExampleRdd = rdd.map{case (bytesWritable, nullWritable) => SequenceExample.parseFrom(bytesWritable.getBytes) } val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd)) val rowRdd = sequenceExampleRdd.map(example => DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema)) (rowRdd, finalSchema) case _ => throw new IllegalArgumentException(s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample") } } override def sqlContext: SQLContext = session.sqlContext override def schema: StructType = tfSchema override def buildScan(): RDD[Row] = tfRdd }
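A hedged usage sketch, assuming a SparkSession named spark and that the connector's DefaultSource lives in the package shown above (so the package name can be used as the format). The "recordType" values mirror the match in the relation; the file path is a placeholder.

val df = spark.read
  .format("org.tensorflow.spark.datasources.tfrecords")
  .option("recordType", "Example")           // or "SequenceExample"
  .load("hdfs:///data/records.tfrecord")     // placeholder path

df.printSchema()   // inferred by TensorFlowInferSchema unless a schema is supplied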
Example 33
Source File: LogicalRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Excerpt from the LogicalRelation case class; the class declaration and the
// remaining members are elided in this listing. The methods below belong to the
// plan node that wraps a BaseRelation as a leaf of the logical plan.
  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 34
Source File: RemoveNestedAliasesSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import com.sap.spark.PlanTest import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Alias import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class RemoveNestedAliasesSuite extends FunSuite with MockitoSugar with PlanTest { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("name", StringType), StructField("age", IntegerType) )) } val lr1 = LogicalRelation(br1) val nameAtt = lr1.output.find(_.name == "name").get val ageAtt = lr1.output.find(_.name == "age").get test("Replace alias into aliases") { val avgExpr = avg(ageAtt) val avgAlias = avgExpr as 'avgAlias val aliasAlias = avgAlias as 'aliasAlias val aliasAliasAlias = aliasAlias as 'aliasAliasAlias val copiedAlias = Alias(avgExpr, aliasAlias.name)( exprId = aliasAlias.exprId ) val copiedAlias2 = Alias(avgExpr, aliasAliasAlias.name)( exprId = aliasAliasAlias.exprId ) assertResult( lr1.groupBy(avgAlias.toAttribute)(avgAlias) )(RemoveNestedAliases(lr1.groupBy(avgAlias.toAttribute)(avgAlias))) assertResult( lr1.groupBy(copiedAlias.toAttribute)(copiedAlias) )(RemoveNestedAliases(lr1.groupBy(aliasAlias.toAttribute)(aliasAlias))) assertResult( lr1.groupBy(copiedAlias2.toAttribute)(copiedAlias2) )(RemoveNestedAliases(lr1.groupBy(aliasAliasAlias.toAttribute)(aliasAliasAlias))) } test("Replace alias into expressions") { val ageAlias = ageAtt as 'ageAlias val avgExpr = avg(ageAlias) as 'avgAlias val correctedAvgExpr = avg(ageAtt) as 'avgAlias comparePlans( lr1.groupBy(correctedAvgExpr.toAttribute)(correctedAvgExpr), RemoveNestedAliases(lr1.groupBy(avgExpr.toAttribute)(avgExpr)) ) } }
Example 35
Source File: HadoopFsRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} import org.apache.spark.sql.types.StructType case class HadoopFsRelation( location: FileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String])(val sparkSession: SparkSession) extends BaseRelation with FileRelation { override def sqlContext: SQLContext = sparkSession.sqlContext val schema: StructType = { val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet StructType(dataSchema ++ partitionSchema.filterNot { column => dataSchemaColumnNames.contains(column.name.toLowerCase) }) } def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() case _ => "HadoopFiles" } } override def sizeInBytes: Long = location.sizeInBytes override def inputFiles: Array[String] = location.inputFiles }
Example 36
Source File: RiakRelation.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.riak

import com.basho.riak.spark._
import scala.reflect._
import com.basho.riak.spark.rdd.connector.{RiakConnectorConf, RiakConnector}
import com.basho.riak.spark.rdd.{ReadConf, RiakTSRDD}
import com.basho.riak.spark.util.TSConversionUtil
import com.basho.riak.spark.writer.WriteConf
import com.basho.riak.spark.writer.mapper.SqlDataMapper
import org.apache.spark.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources.{InsertableRelation, BaseRelation, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import scala.collection.convert.decorateAsScala._
import com.basho.riak.spark.query.QueryBucketDef

// Only the companion object is shown in this excerpt; the RiakRelation class
// itself (a BaseRelation with scan and insert support, per the imports above)
// is elided in this listing.
object RiakRelation {
  def apply(bucket: String,
            sqlContext: SQLContext,
            schema: Option[StructType] = None,
            connector: Option[RiakConnector] = None,
            readConf: ReadConf,
            writeConf: WriteConf): RiakRelation = {
    new RiakRelation(bucket, connector.getOrElse(RiakConnector(sqlContext.sparkContext.getConf)),
      readConf, writeConf, sqlContext, schema)
  }

  def apply(sqlContext: SQLContext, parameters: Map[String, String],
            schema: Option[StructType]): RiakRelation = {
    val existingConf = sqlContext.sparkContext.getConf
    val bucketDef = BucketDef(parameters(DefaultSource.RiakBucketProperty), None)
    val riakConnector = new RiakConnector(RiakConnectorConf(existingConf, parameters))
    val readConf = ReadConf(existingConf, parameters)
    val writeConf = WriteConf(existingConf, parameters)
    RiakRelation(bucketDef.bucket, sqlContext, schema, Some(riakConnector), readConf, writeConf)
  }
}
Example 37
Source File: SelectJSONSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectJSONSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for JSON data.")) } override def shortName(): String = "minioSelectJSON" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectJSONRelation = { val path = checkPath(params) SelectJSONRelation(Some(path), params, schema)(sqlContext) } }
Example 38
Source File: SelectCSVSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectCSVSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for CSV data.")) } override def shortName(): String = "minioSelectCSV" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectCSVRelation = { val path = checkPath(params) SelectCSVRelation(Some(path), params, schema)(sqlContext) } }
Example 39
Source File: SelectParquetSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectParquetSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data.")) } override def shortName(): String = "minioSelectParquet" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectParquetRelation = { val path = checkPath(params) SelectParquetRelation(Some(path), params, schema)(sqlContext) } }
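The three spark-select providers above (JSON, CSV, Parquet) are SchemaRelationProviders only, so a user-supplied schema is mandatory. A hedged usage sketch for the CSV variant, assuming a SparkSession named spark; the bucket path and column names are placeholders.

import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)))

val df = spark.read
  .format("minioSelectCSV")            // or "minioSelectJSON" / "minioSelectParquet"
  .schema(schema)                      // required: SchemaRelationProvider only
  .load("s3://bucket/people.csv")      // placeholder object path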
Example 40
Source File: PointCloudRelation.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.datasource import geotrellis.pointcloud.spark.store.hadoop._ import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions} import geotrellis.pointcloud.util.Filesystem import geotrellis.proj4.CRS import geotrellis.store.hadoop.util.HdfsUtils import geotrellis.vector.Extent import cats.implicits._ import io.pdal._ import io.circe.syntax._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} import java.io.File import scala.collection.JavaConverters._ // This class has to be serializable since it is shipped over the network. class PointCloudRelation( val sqlContext: SQLContext, path: String, options: HadoopOptions ) extends BaseRelation with TableScan with Serializable { @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext // TODO: switch between HadoopPointCloudRDD and S3PointcCloudRDD lazy val isS3: Boolean = path.startsWith("s3") override def schema: StructType = { lazy val (local, fixedPath) = if(path.startsWith("s3") || path.startsWith("hdfs")) { val tmpDir = Filesystem.createDirectory() val remotePath = new Path(path) // copy remote file into local tmp dir val localPath = new File(tmpDir, remotePath.getName) HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration) (true, localPath.toString) } else (false, path) val localPipeline = options.pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => fixedPath.asJson) .top.fold(options.pipeline)(identity) val pl = Pipeline(localPipeline.noSpaces) if (pl.validate()) pl.execute() val pointCloud = try { pl.getPointViews().next().getPointCloud(0) } finally { pl.close() if(local) println(new File(fixedPath).delete) } val rdd = HadoopPointCloudRDD(new Path(path), options) val md: (Option[Extent], Option[CRS]) = rdd .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) } .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) } val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build pointCloud.deriveSchema(metadata) } override def buildScan(): RDD[Row] = { val rdd = HadoopPointCloudRDD(new Path(path), options) rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } } } }
Example 41
Source File: DefaultSource.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis import org.apache.spark.sql.SaveMode.{Append, ErrorIfExists, Ignore, Overwrite} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val relation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None) mode match { case Append => relation.insert(data, overwrite = false) case Overwrite => relation.insert(data, overwrite = true) case ErrorIfExists => if (relation.nonEmpty) { throw new IllegalStateException("SaveMode is set to ErrorIfExists and dataframe " + "already exists in Redis and contains data.") } relation.insert(data, overwrite = false) case Ignore => if (relation.isEmpty) { relation.insert(data, overwrite = false) } } relation } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = Some(schema)) }
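A hedged write sketch, assuming a DataFrame named df and that the package above is used as the format name; the "table" option key is an assumption, since RedisSourceRelation's parameters are not shown in this excerpt. The comments note how each SaveMode maps onto the match in createRelation.

import org.apache.spark.sql.SaveMode

df.write
  .format("org.apache.spark.sql.redis")
  .option("table", "people")        // assumed parameter key
  .mode(SaveMode.Append)            // Append    -> insert(data, overwrite = false)
  .save()                           // Overwrite -> insert(data, overwrite = true)
                                    // ErrorIfExists fails when the relation is non-empty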
Example 42
Source File: XmlRelation.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import java.io.IOException import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.sources.{PrunedScan, InsertableRelation, BaseRelation, TableScan} import org.apache.spark.sql.types._ import com.databricks.spark.xml.util.{InferSchema, XmlFile} import com.databricks.spark.xml.parsers.StaxXmlParser case class XmlRelation protected[spark] ( baseRDD: () => RDD[String], location: Option[String], parameters: Map[String, String], userSchema: StructType = null)(@transient val sqlContext: SQLContext) extends BaseRelation with InsertableRelation with PrunedScan { private val options = XmlOptions(parameters) override val schema: StructType = { Option(userSchema).getOrElse { InferSchema.infer( baseRDD(), options) } } override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val requiredFields = requiredColumns.map(schema(_)) val requestedSchema = StructType(requiredFields) StaxXmlParser.parse( baseRDD(), requestedSchema, options) } // The function below was borrowed from JSONRelation override def insert(data: DataFrame, overwrite: Boolean): Unit = { val filesystemPath = location match { case Some(p) => new Path(p) case None => throw new IOException(s"Cannot INSERT into table with no path defined") } val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) if (overwrite) { try { fs.delete(filesystemPath, true) } catch { case e: IOException => throw new IOException( s"Unable to clear output directory ${filesystemPath.toString} prior" + s" to INSERT OVERWRITE a XML table:\n${e.toString}") } // Write the data. We assume that schema isn't changed, and we won't update it. XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters) } else { throw new IllegalArgumentException("XML tables only support INSERT OVERWRITE for now.") } } }
Example 43
Source File: DefaultSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, SchemaRelationProvider} import org.apache.spark.sql.types.StructType class CustomedDefaultSource extends DefaultSource with DataSourceRegister with SchemaRelationProvider { override def shortName(): String = "hbase" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { new CustomedHBaseRelation(parameters, Option(schema))(sqlContext) } }
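A hedged read sketch, assuming a SparkSession named spark; "hbase" is the shortName registered above and a user-defined schema is required because the provider is a SchemaRelationProvider. The parameter keys understood by CustomedHBaseRelation are not shown in this excerpt, so the option below is purely illustrative.

import org.apache.spark.sql.types._

val hbaseSchema = StructType(Seq(
  StructField("rowkey", StringType),
  StructField("cf_value", StringType)))

val df = spark.read
  .format("hbase")
  .schema(hbaseSchema)
  .option("hbase.table", "my_table")   // assumed parameter key
  .load()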
Example 44
Source File: LogicalRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Excerpt from the LogicalRelation case class; the class declaration and the
// remaining members are elided in this listing.
  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 45
Source File: JdbcRelationProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val resolver = sqlContext.conf.resolver val timeZoneId = sqlContext.conf.sessionLocalTimeZone val schema = JDBCRelation.getSchema(resolver, jdbcOptions) val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions) JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val options = new JdbcOptionsInWrite(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() try { val tableExists = JdbcUtils.tableExists(conn, options) if (tableExists) { mode match { case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, options.table, options) createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } case SaveMode.Append => val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '${options.table}' already exists. " + s"SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
Example 46
Source File: HadoopFsRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.util.Locale import scala.collection.mutable import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} import org.apache.spark.sql.types.{StructField, StructType} case class HadoopFsRelation( location: FileIndex, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String])(val sparkSession: SparkSession) extends BaseRelation with FileRelation { override def sqlContext: SQLContext = sparkSession.sqlContext private def getColName(f: StructField): String = { if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { f.name } else { f.name.toLowerCase(Locale.ROOT) } } val overlappedPartCols = mutable.Map.empty[String, StructField] partitionSchema.foreach { partitionField => if (dataSchema.exists(getColName(_) == getColName(partitionField))) { overlappedPartCols += getColName(partitionField) -> partitionField } } // When data and partition schemas have overlapping columns, the output // schema respects the order of the data schema for the overlapping columns, and it // respects the data types of the partition schema. val schema: StructType = { StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) } def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() case _ => "HadoopFiles" } } override def sizeInBytes: Long = { val compressionFactor = sqlContext.conf.fileCompressionFactor (location.sizeInBytes * compressionFactor).toLong } override def inputFiles: Array[String] = location.inputFiles }
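A small standalone illustration (not connector code) of the schema-merging rule implemented above: columns present in both the data schema and the partition schema keep the data schema's position but take the partition schema's data type, and the remaining partition columns are appended. Column names and types below are made up for the example; matching is case-insensitive here, as in the default configuration.

import org.apache.spark.sql.types._

val dataSchema = StructType(Seq(
  StructField("id", LongType),
  StructField("date", StringType)))        // also appears as a partition column
val partitionSchema = StructType(Seq(
  StructField("date", DateType)))

val dataNames = dataSchema.map(_.name.toLowerCase).toSet
val overlapped = partitionSchema
  .filter(f => dataNames.contains(f.name.toLowerCase))
  .map(f => f.name.toLowerCase -> f).toMap

val merged = StructType(
  dataSchema.map(f => overlapped.getOrElse(f.name.toLowerCase, f)) ++
  partitionSchema.filterNot(f => overlapped.contains(f.name.toLowerCase)))
// merged: id LongType, date DateType (data-schema order, partition-schema type)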
Example 47
Source File: console.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.sql._ import org.apache.spark.sql.execution.streaming.sources.ConsoleWriter import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType case class ConsoleRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class ConsoleSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter( queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new ConsoleWriter(schema, options) } def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { // Number of rows to display, by default 20 rows val numRowsToShow = parameters.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true val isTruncated = parameters.get("truncate").map(_.toBoolean).getOrElse(true) data.show(numRowsToShow, isTruncated) ConsoleRelation(sqlContext, data) } def shortName(): String = "console" }
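A batch usage sketch for the CreatableRelationProvider path above, assuming a DataFrame named df; "console" is the registered shortName, and numRows/truncate map directly onto the parameters read in createRelation. (Streaming queries use writeStream.format("console") and go through createStreamWriter instead.)

df.write
  .format("console")
  .option("numRows", "50")
  .option("truncate", "false")
  .save()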
Example 48
Source File: CollapseExpandSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.analysis.CollapseExpandSuite.SqlLikeCatalystSourceRelation import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.util.PlanComparisonUtils._ import org.apache.spark.sql.{GlobalSapSQLContext, Row} import org.mockito.Matchers._ import org.mockito.Mockito._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class CollapseExpandSuite extends FunSuite with MockitoSugar with GlobalSapSQLContext { case object Leaf extends LeafNode { override def output: Seq[Attribute] = Seq.empty } test("Expansion with a single sequence of projections is correctly collapsed") { val expand = Expand( Seq(Seq('a.string, Literal(1))), Seq('a.string, 'gid.int), Leaf) val collapsed = CollapseExpand(expand) assertResult(normalizeExprIds(Project(Seq('a.string, Literal(1) as "gid"), Leaf)))( normalizeExprIds(collapsed)) } test("Expansion with multiple projections is correctly collapsed") { val expand = Expand( Seq( Seq('a.string, Literal(1)), Seq('b.string, Literal(1))), Seq('a.string, 'gid1.int, 'b.string, 'gid2.int), Leaf) val collapsed = CollapseExpand(expand) assertResult( normalizeExprIds( Project(Seq( 'a.string, Literal(1) as "gid1", 'b.string, Literal(1) as "gid2"), Leaf)))(normalizeExprIds(collapsed)) } test("Expand pushdown integration") { val relation = mock[SqlLikeCatalystSourceRelation] when(relation.supportsLogicalPlan(any[Expand])) .thenReturn(true) when(relation.isMultiplePartitionExecution(any[Seq[CatalystSource]])) .thenReturn(true) when(relation.schema) .thenReturn(StructType(StructField("foo", StringType) :: Nil)) when(relation.relationName) .thenReturn("t") when(relation.logicalPlanToRDD(any[LogicalPlan])) .thenReturn(sc.parallelize(Seq(Row("a", 1), Row("b", 1), Row("a", 1)))) sqlc.baseRelationToDataFrame(relation).registerTempTable("t") val dataFrame = sqlc.sql("SELECT COUNT(DISTINCT foo) FROM t") val Seq(Row(ct)) = dataFrame.collect().toSeq assertResult(2)(ct) } } object CollapseExpandSuite { abstract class SqlLikeCatalystSourceRelation extends BaseRelation with Table with SqlLikeRelation with CatalystSource }
Example 49
Source File: UseAliasesForAggregationsInGroupingsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class UseAliasesForAggregationsInGroupingsSuite extends FunSuite with MockitoSugar { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("name", StringType), StructField("age", IntegerType) )) } val lr1 = LogicalRelation(br1) val nameAtt = lr1.output.find(_.name == "name").get val ageAtt = lr1.output.find(_.name == "age").get test("replace functions in group by") { val avgExpr = avg(ageAtt) val avgAlias = avgExpr as 'avgAlias assertResult( lr1.groupBy(avgAlias.toAttribute)(avgAlias) )(UseAliasesForFunctionsInGroupings( lr1.groupBy(avgExpr)(avgAlias)) ) assertResult( lr1.select(ageAtt) )(UseAliasesForFunctionsInGroupings( lr1.select(ageAtt)) ) intercept[RuntimeException]( UseAliasesForFunctionsInGroupings(Aggregate(Seq(avgExpr), Seq(ageAtt), lr1)) ) } }
Example 50
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
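A hedged sketch of using the sink in Structured Streaming, assuming an existing streaming DataFrame named streamingDF. "mqtt" is the shortName registered above and the two SparkConf settings are the retry knobs read by MQTTDataWriter; the topic option key and passing the broker URL as the path are assumptions, since MQTTUtils.parseConfigParams is not shown in this excerpt.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("mqtt-sink-sketch")
  .config("spark.mqtt.client.publish.attempts", "5")   // retries on MqttException
  .config("spark.mqtt.client.publish.backoff", "10s")  // backoff between retries
  .getOrCreate()

val query = streamingDF.writeStream
  .format("mqtt")
  .option("topic", "sensors/out")                      // assumed option key
  .option("checkpointLocation", "/tmp/mqtt-ckpt")
  .start("tcp://localhost:1883")                       // assumed: broker URL as path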
Example 51
Source File: ResolveHierarchySuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.{Attribute, EqualTo} import org.apache.spark.sql.catalyst.plans.logical.{AdjacencyListHierarchySpec, Hierarchy} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class ResolveHierarchySuite extends FunSuite with MockitoSugar { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("id", IntegerType), StructField("parent", IntegerType) )) } val lr1 = LogicalRelation(br1) val idAtt = lr1.output.find(_.name == "id").get val parentAtt = lr1.output.find(_.name == "parent").get test("Check parenthood expression has no conflicting expression IDs and qualifiers") { val source = SimpleAnalyzer.execute(lr1.select('id, 'parent).subquery('u)) assert(source.resolved) val hierarchy = Hierarchy( AdjacencyListHierarchySpec(source, "v", UnresolvedAttribute("u" :: "id" :: Nil) === UnresolvedAttribute("v" :: "id" :: Nil), Some('id.isNull), Nil), 'node ) val resolveHierarchy = ResolveHierarchy(SimpleAnalyzer) val resolveReferences = ResolveReferencesWithHierarchies(SimpleAnalyzer) val resolvedHierarchy = (0 to 10).foldLeft(hierarchy: Hierarchy) { (h, _) => SimpleAnalyzer.ResolveReferences( resolveReferences(resolveHierarchy(h)) ).asInstanceOf[Hierarchy] } assert(resolvedHierarchy.node.resolved) val resolvedSpec = resolvedHierarchy.spec.asInstanceOf[AdjacencyListHierarchySpec] assert(resolvedSpec.parenthoodExp.resolved) assert(resolvedSpec.startWhere.forall(_.resolved)) assert(resolvedHierarchy.childrenResolved) assert(resolvedHierarchy.resolved) val parenthoodExpression = resolvedSpec.parenthoodExp.asInstanceOf[EqualTo] assertResult("u" :: Nil)(parenthoodExpression.left.asInstanceOf[Attribute].qualifiers) assertResult("v" :: Nil)(parenthoodExpression.right.asInstanceOf[Attribute].qualifiers) assert(parenthoodExpression.right.asInstanceOf[Attribute].exprId != source.output.find(_.name == "id").get.exprId) } }
Example 52
Source File: ResolveAnnotationsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Project import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar import org.apache.spark.sql.catalyst.dsl.plans._ class ResolveAnnotationsSuite extends FunSuite with MockitoSugar { // scalastyle:off magic.number val annotatedRel1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("id1.1", IntegerType, metadata = new MetadataBuilder().putLong("key1.1", 11L).build()), StructField("id1.2", IntegerType, metadata = new MetadataBuilder() .putLong("key1.2", 12L) .putLong("key1.3", 13).build())) ) } val lr1 = LogicalRelation(annotatedRel1) val id11Att = lr1.output.find(_.name == "id1.1").get val id12Att = lr1.output.find(_.name == "id1.2").get val id11AnnotatedAtt = AnnotatedAttribute(id11Att)( Map("key1.1" -> Literal.create(100L, LongType), // override the old key "newkey" -> Literal.create(200L, LongType))) // define a new key val simpleAnnotatedSelect = lr1.select(id11AnnotatedAtt) }
Example 53
Source File: ResolveCountDistinctStarSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.scalatest.FunSuite import org.scalatest.Inside._ import org.scalatest.mock.MockitoSugar import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count} import scala.collection.mutable.ArrayBuffer class ResolveCountDistinctStarSuite extends FunSuite with MockitoSugar { val persons = new LogicalRelation(new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("age", IntegerType), StructField("name", StringType) )) }) test("Count distinct star is resolved correctly") { val projection = persons.select(UnresolvedAlias( AggregateExpression(Count(UnresolvedStar(None) :: Nil), Complete, true))) val stillNotCompletelyResolvedAggregate = SimpleAnalyzer.execute(projection) val resolvedAggregate = ResolveCountDistinctStar(SimpleAnalyzer) .apply(stillNotCompletelyResolvedAggregate) inside(resolvedAggregate) { case Aggregate(Nil, ArrayBuffer(Alias(AggregateExpression(Count(expressions), Complete, true), _)), _) => assert(expressions.collect { case a:AttributeReference => a.name }.toSet == Set("name", "age")) } assert(resolvedAggregate.resolved) } }
Example 54
Source File: testRelations.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.dstest import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.commands.WithOrigin import org.apache.spark.sql.sources.{BaseRelation, DropRelation, Table} import org.apache.spark.sql.types._ case class DummyRelationWithTempFlag( sqlContext: SQLContext, tableName: Seq[String], schema: StructType, temporary: Boolean) extends BaseRelation with Table with DropRelation with WithOrigin { override val provider: String = "com.sap.spark.dstest" override def isTemporary: Boolean = temporary override def dropTable(): Unit = {} } case class DummyRelationWithoutTempFlag( sqlContext: SQLContext, schema: StructType) extends BaseRelation with DropRelation with Table with WithOrigin { override def isTemporary: Boolean = false override val provider: String = "com.sap.spark.dstest" override def dropTable(): Unit = {} }
Example 55
Source File: TestUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.util

import java.util.Locale

import scala.io.Source

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.{Row, SQLContext, SapSQLContext}
import org.apache.spark.sql.hive.SapHiveContext
import org.apache.spark.sql.sources.sql.SqlLikeRelation
import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table}
import org.apache.spark.sql.types.StructType
import org.mockito.Matchers._
import org.mockito.Mockito._

import scala.tools.nsc.io.Directory
import scala.util.{Failure, Success}

// The methods below are members of a test-utilities object whose declaration and
// remaining helpers (such as getFileFromClassPath, referenced here) are elided in
// this listing.
  def parsePTestFile(fileName: String): List[(String, String, String)] = {
    val filePath = getFileFromClassPath(fileName)
    val fileContents = Source.fromFile(filePath).getLines
      .map(p => p.stripMargin.trim)
      .filter(p => !p.isEmpty && !p.startsWith("//")) // filter empty rows and comments
      .mkString("\n")
    val p = new PTestFileParser

    // strip semicolons from query and parsed
    p(fileContents) match {
      case Success(lines) => lines.map {
        case (query, parsed, expect) =>
          (stripSemicolon(query).trim, stripSemicolon(parsed).trim, expect.trim)
      }
      case Failure(ex) => throw ex
    }
  }

  private def stripSemicolon(sql: String): String =
    if (sql.endsWith(";")) {
      sql.substring(0, sql.length - 1)
    } else {
      sql
    }

  def withTempDirectory[A](f: Directory => A): A = {
    val dir = Directory.makeTemp()
    try {
      f(dir)
    } finally {
      dir.deleteIfExists()
    }
  }
}
Example 56
Source File: DefaultSource.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} import scala.concurrent._ import scala.concurrent.ExecutionContext.Implicits._ import scala.concurrent.duration.Duration class DefaultSource extends CreatableRelationProvider with PowerBISink { override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val conf = ClientConf.fromSparkConf(sqlContext.sparkContext.getConf) implicit val client = new Client(conf) val dataset = parameters.getOrElse("dataset", sys.error("'dataset' must be specified")) val table = parameters.getOrElse("table", sys.error("'table' must be specified")) val batchSize = parameters.getOrElse("batchSize", conf.batchSize.toString).toInt val group = parameters.get("group") val step = for { groupId <- getGroupId(group) ds <- getOrCreateDataset(mode, groupId, dataset, table, data.schema) } yield (groupId, ds) val result = step map { case (groupId, ds) => val fields = data.schema.fieldNames.zipWithIndex val _conf = conf val _token = Some(client.currentToken) val _table = table val _batchSize = batchSize val coalesced = data.rdd.partitions.size > _conf.maxPartitions match { case true => data.coalesce(_conf.maxPartitions) case false => data } coalesced foreachPartition { p => val rows = p map { r => fields map { case(name, index) => (name -> r(index)) } toMap } toSeq val _client = new Client(_conf, _token) val submit = rows. sliding(_batchSize, _batchSize). foldLeft(future()) { (fAccum, batch) => fAccum flatMap { _ => _client.addRows(ds.id, _table, batch, groupId) } } submit.onComplete { _ => _client.shutdown() } Await.result(submit, _conf.timeout) } } result.onComplete { _ => client.shutdown() } Await.result(result, Duration.Inf) new BaseRelation { val sqlContext = data.sqlContext val schema = data.schema } } }
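A hedged write sketch, assuming a DataFrame named df and using the provider's package as the format name. "dataset" and "table" are required by createRelation above; "group" and "batchSize" are optional. The dataset and table names are placeholders.

import org.apache.spark.sql.SaveMode

df.write
  .format("com.granturing.spark.powerbi")
  .option("dataset", "SparkDataset")   // placeholder
  .option("table", "events")           // placeholder
  .option("batchSize", "1000")
  .mode(SaveMode.Append)
  .save()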
Example 57
Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel

import scala.collection.JavaConversions._

import org.apache.spark.sql.sources.{ BaseRelation, TableScan }
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.ArrayType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql._
import org.apache.spark.rdd.RDD
import org.apache.hadoop.conf._
import org.apache.hadoop.mapreduce._
import org.apache.commons.logging.LogFactory
import org.apache.commons.logging.Log
import org.zuinnote.hadoop.office.format.common.dao._
import org.zuinnote.hadoop.office.format.mapreduce._
import org.zuinnote.spark.office.excel.util.ExcelFile

// Excerpt from the ExcelRelation class (a BaseRelation with TableScan); the class
// declaration and the members it defines (sqlContext, location, hadoopParams, schema)
// are elided in this listing.
  override def buildScan: RDD[Row] = {
    // read ExcelRows
    val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams)
    // map to schema
    val schemaFields = schema.fields
    excelRowsRDD.flatMap(excelKeyValueTuple => {
      // map the Excel row data structure to a Spark SQL schema
      val rowArray = new Array[Any](excelKeyValueTuple._2.get.length)
      var i = 0
      for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO
        val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length)
        val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] =
          excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]]
        spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue
        spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment
        spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula
        spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress
        spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName
        // add row representing one Excel row
        rowArray(i) = spreadSheetCellDAOStructArray
        i += 1
      }
      Some(Row.fromSeq(rowArray))
    })
  }
}
Example 58
Source File: EventHubsRelation.scala From azure-event-hubs-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.eventhubs import org.apache.spark.eventhubs.rdd.{ EventHubsRDD, OffsetRange } import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ Row, SQLContext } import org.apache.spark.sql.sources.{ BaseRelation, TableScan } import org.apache.spark.sql.types.StructType import scala.language.postfixOps private[eventhubs] class EventHubsRelation(override val sqlContext: SQLContext, parameters: Map[String, String]) extends BaseRelation with TableScan with Logging { import org.apache.spark.eventhubs._ private val ehConf = EventHubsConf.toConf(parameters) private val eventHubClient = EventHubsSourceProvider.clientFactory(parameters)(ehConf) override def schema: StructType = EventHubsSourceProvider.eventHubsSchema override def buildScan(): RDD[Row] = { val partitionCount: Int = eventHubClient.partitionCount val fromSeqNos = eventHubClient.translate(ehConf, partitionCount) val untilSeqNos = eventHubClient.translate(ehConf, partitionCount, useStart = false) require(fromSeqNos.forall(f => f._2 >= 0L), "Currently only sequence numbers can be passed in your starting positions.") require(untilSeqNos.forall(u => u._2 >= 0L), "Currently only sequence numbers can be passed in your ending positions.") val offsetRanges = untilSeqNos.keySet.map { p => val fromSeqNo = fromSeqNos .getOrElse(p, throw new IllegalStateException(s"$p doesn't have a fromSeqNo")) val untilSeqNo = untilSeqNos(p) OffsetRange(ehConf.name, p, fromSeqNo, untilSeqNo, None) }.toArray eventHubClient.close() logInfo( "GetBatch generating RDD of with offsetRanges: " + offsetRanges.sortBy(_.nameAndPartition.toString).mkString(", ")) val rdd = EventHubsSourceProvider.toInternalRow( new EventHubsRDD(sqlContext.sparkContext, ehConf.trimmed, offsetRanges)) sqlContext.internalCreateDataFrame(rdd, schema, isStreaming = false).rdd } }
Example 59
Source File: DefaultSource.scala From magellan with Apache License 2.0 | 5 votes |
package magellan import org.apache.spark.sql.types.StructType import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider, RelationProvider} class DefaultSource extends RelationProvider with SchemaRelationProvider { override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = createRelation(sqlContext, parameters, null) override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { val path = parameters.getOrElse("path", sys.error("'path' must be specified for Shapefiles.")) val t = parameters.getOrElse("type", "shapefile") t match { case "shapefile" => new ShapeFileRelation(path, parameters)(sqlContext) case "geojson" => new GeoJSONRelation(path, parameters)(sqlContext) case "osm" => new OsmFileRelation(path, parameters)(sqlContext) case _ => ??? } } }
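A usage sketch for the provider above, assuming a SparkSession named spark; "magellan" resolves to magellan.DefaultSource by Spark's usual provider lookup, and the "type" option selects the reader exactly as in the match above (shapefile is the default). The input paths are placeholders.

val geojson = spark.read
  .format("magellan")
  .option("type", "geojson")
  .load("/data/boundaries/")     // placeholder path

val shapes = spark.read
  .format("magellan")
  .load("/data/shapefiles/")     // defaults to type=shapefile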
Example 60
Source File: LogicalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Excerpt from the LogicalRelation case class; the class declaration and the
// remaining members are elided in this listing.
  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ => // Do nothing.
  }

  override def simpleString: String = s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 61
Source File: JdbcRelationProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
Example 62
Source File: HadoopFsRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} import org.apache.spark.sql.types.StructType case class HadoopFsRelation( location: FileIndex, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String])(val sparkSession: SparkSession) extends BaseRelation with FileRelation { override def sqlContext: SQLContext = sparkSession.sqlContext val schema: StructType = { val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet StructType(dataSchema ++ partitionSchema.filterNot { column => dataSchemaColumnNames.contains(column.name.toLowerCase) }) } def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() case _ => "HadoopFiles" } } override def sizeInBytes: Long = location.sizeInBytes override def inputFiles: Array[String] = location.inputFiles }
Example 63
Source File: CarbonFileIndexReplaceRule.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources

import scala.collection.mutable.ArrayBuffer

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.{FileIndex, HadoopFsRelation, InMemoryFileIndex, InsertIntoHadoopFsRelationCommand, LogicalRelation}
import org.apache.spark.sql.sources.BaseRelation

import org.apache.carbondata.core.datastore.filesystem.CarbonFile
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath

// Only this helper is shown in the excerpt; the enclosing CarbonFileIndexReplaceRule
// (a Rule[LogicalPlan], per the imports) and its apply method are elided in this
// listing.
  private def getDataFolders(
      tableFolder: CarbonFile,
      dataFolders: ArrayBuffer[CarbonFile]): Unit = {
    val files = tableFolder.listFiles()
    files.foreach { f =>
      if (f.isDirectory) {
        val files = f.listFiles()
        if (files.nonEmpty && !files(0).isDirectory) {
          dataFolders += f
        } else {
          getDataFolders(f, dataFolders)
        }
      }
    }
  }
}
Example 64
Source File: SocketTextSource.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider} import org.apache.spark.sql.streaming.StreamPlan import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.dstream.DStream class SocketTextSource extends SchemaRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { require(parameters.contains("host") && parameters.contains("port") && parameters.contains("messageToRow")) val messageToRow = { try { val clz = Class.forName(parameters("messageToRow")) clz.newInstance().asInstanceOf[MessageToRowConverter] } catch { case e: Exception => sys.error(s"Failed to load class : ${e.toString}") } } new SocketTextRelation( parameters("host"), parameters("port").toInt, messageToRow, schema, sqlContext) } } case class SocketTextRelation( host: String, port: Int, messageToRowConverter: MessageToRowConverter, val schema: StructType, @transient val sqlContext: SQLContext) extends StreamBaseRelation with StreamPlan { // Currently only support Kafka with String messages @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream( host, port) @transient val stream: DStream[InternalRow] = socketStream.map(messageToRowConverter.toRow(_, schema)) }
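A hedged sketch that exercises the provider above directly, assuming an existing SQLContext (or spark-cep StreamSQLContext) named sqlContext and a concrete MessageToRowConverter implementation on the classpath; the converter class name below is hypothetical. All three parameters are required by createRelation, and the schema is mandatory because the provider is a SchemaRelationProvider.

import org.apache.spark.sql.types.{StringType, StructField, StructType}

val lineSchema = StructType(Seq(StructField("line", StringType)))

val relation = new SocketTextSource().createRelation(
  sqlContext,
  Map(
    "host" -> "localhost",
    "port" -> "9999",
    "messageToRow" -> "com.example.LineToRowConverter"),  // hypothetical converter class
  lineSchema)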