org.apache.spark.sql.catalyst.catalog.BucketSpec Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.catalog.BucketSpec.
You can go to the original project or source file by following the links above each example.
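Before diving into the examples, here is a minimal standalone sketch (not taken from any of the projects below) of what a BucketSpec carries and how the same metadata is normally produced through the DataFrameWriter API. The table and column names are invented for illustration.

// Sketch: building a BucketSpec by hand, and requesting the equivalent
// bucketing through DataFrameWriter (which records a BucketSpec in the
// catalog entry of the saved table). Names are hypothetical.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.catalog.BucketSpec

object BucketSpecSketch {
  def main(args: Array[String]): Unit = {
    // Catalog metadata: 8 buckets on "user_id", each bucket sorted by "event_time".
    val spec = BucketSpec(
      numBuckets = 8,
      bucketColumnNames = Seq("user_id"),
      sortColumnNames = Seq("event_time"))
    println(spec)

    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("bucket-spec-sketch")
      .getOrCreate()
    import spark.implicits._

    val events = Seq((1L, "2023-01-01"), (2L, "2023-01-02")).toDF("user_id", "event_time")
    events.write
      .bucketBy(8, "user_id")
      .sortBy("event_time")
      .saveAsTable("bucketed_events") // hypothetical table name

    spark.stop()
  }
}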
Example 1
Source File: InsertIntoHadoopFsRelationCommand.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.IOException

import org.apache.hadoop.fs.Path

import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand

// The case class declaration and the beginning of run() are elided in this listing;
// the snippet resumes inside the (mode, pathExists) match that decides doInsertion.
          )) {
          throw new IOException(s"Unable to clear output " +
            s"directory $qualifiedOutputPath prior to writing to it")
        }
        true
      case (SaveMode.Append, _) | (SaveMode.Overwrite, _) | (SaveMode.ErrorIfExists, false) =>
        true
      case (SaveMode.Ignore, exists) =>
        !exists
      case (s, exists) =>
        throw new IllegalStateException(s"unsupported save mode $s ($exists)")
    }
    // If we are appending data to an existing dir.
    val isAppend = pathExists && (mode == SaveMode.Append)

    if (doInsertion) {
      WriteOutput.write(
        sparkSession,
        query,
        fileFormat,
        qualifiedOutputPath,
        hadoopConf,
        partitionColumns,
        bucketSpec,
        refreshFunction,
        options,
        isAppend)
    } else {
      logInfo("Skipping insertion into a relation that already exists.")
    }

    Seq.empty[Row]
  }
}
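For context, a short usage sketch follows (not part of drizzle-spark; the path is invented) showing how the SaveMode branches matched above are selected from the user-facing writer API: Overwrite clears the existing directory, Ignore skips the write when the path already exists, and ErrorIfExists fails on an existing path.

// Sketch: exercising the SaveMode cases handled by the command above.
import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("save-mode-sketch")
      .getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    df.write.mode(SaveMode.Overwrite).parquet("/tmp/bucketspec-demo") // replaces existing output
    df.write.mode(SaveMode.Append).parquet("/tmp/bucketspec-demo")    // appends to existing output
    df.write.mode(SaveMode.Ignore).parquet("/tmp/bucketspec-demo")    // no-op because the path exists

    spark.stop()
  }
}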
Example 2
Source File: HadoopFsRelation.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.StructType

case class HadoopFsRelation(
    location: FileCatalog,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  val schema: StructType = {
    val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
    StructType(dataSchema ++ partitionSchema.filterNot { column =>
      dataSchemaColumnNames.contains(column.name.toLowerCase)
    })
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = location.sizeInBytes

  override def inputFiles: Array[String] = location.inputFiles
}
Example 3
Source File: HadoopFsRelation.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.util.Locale

import scala.collection.mutable

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.{StructField, StructType}

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  private def getColName(f: StructField): String = {
    if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
      f.name
    } else {
      f.name.toLowerCase(Locale.ROOT)
    }
  }

  val overlappedPartCols = mutable.Map.empty[String, StructField]
  partitionSchema.foreach { partitionField =>
    if (dataSchema.exists(getColName(_) == getColName(partitionField))) {
      overlappedPartCols += getColName(partitionField) -> partitionField
    }
  }

  // When data and partition schemas have overlapping columns, the output
  // schema respects the order of the data schema for the overlapping columns, and it
  // respects the data types of the partition schema.
  val schema: StructType = {
    StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++
      partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f))))
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sqlContext.conf.fileCompressionFactor
    (location.sizeInBytes * compressionFactor).toLong
  }

  override def inputFiles: Array[String] = location.inputFiles
}
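The overlap handling above is easy to miss in the full listing, so here is a small self-contained sketch (not part of XSQL) that replays the same rule with plain StructTypes: a column that appears in both schemas keeps its position from the data schema but takes its data type from the partition schema. The column names are invented and case-insensitive analysis is assumed.

// Sketch: reproducing the schema-merge rule from HadoopFsRelation.schema.
import java.util.Locale

import org.apache.spark.sql.types._

object OverlapRuleSketch {
  def main(args: Array[String]): Unit = {
    // Case-insensitive analysis assumed, mirroring getColName above.
    def colName(f: StructField): String = f.name.toLowerCase(Locale.ROOT)

    val dataSchema = StructType(Seq(
      StructField("id", LongType),
      StructField("DT", StringType)))      // overlaps with the partition column "dt"
    val partitionSchema = StructType(Seq(
      StructField("dt", DateType)))

    val overlapped = partitionSchema
      .filter(p => dataSchema.exists(colName(_) == colName(p)))
      .map(p => colName(p) -> p)
      .toMap

    val merged = StructType(
      dataSchema.map(f => overlapped.getOrElse(colName(f), f)) ++
        partitionSchema.filterNot(f => overlapped.contains(colName(f))))

    merged.fields.foreach(f => println(s"${f.name}: ${f.dataType}"))
    // id: LongType
    // dt: DateType   <- data-schema position, partition-schema type
  }
}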
Example 4
Source File: HadoopFsRelation.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.StructType

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  val schema: StructType = {
    val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
    StructType(dataSchema ++ partitionSchema.filterNot { column =>
      dataSchemaColumnNames.contains(column.name.toLowerCase)
    })
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = location.sizeInBytes

  override def inputFiles: Array[String] = location.inputFiles
}
Example 5
Source File: HadoopFsRelation.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.StructType

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  val schema: StructType = {
    val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet
    StructType(dataSchema ++ partitionSchema.filterNot { column =>
      dataSchemaColumnNames.contains(column.name.toLowerCase)
    })
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = location.sizeInBytes

  override def inputFiles: Array[String] = location.inputFiles
}
Example 6
Source File: HadoopFsRelation.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.util.Locale

import scala.collection.mutable

import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.catalyst.catalog.BucketSpec
import org.apache.spark.sql.execution.FileRelation
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister}
import org.apache.spark.sql.types.{StructField, StructType}

case class HadoopFsRelation(
    location: FileIndex,
    partitionSchema: StructType,
    dataSchema: StructType,
    bucketSpec: Option[BucketSpec],
    fileFormat: FileFormat,
    options: Map[String, String])(val sparkSession: SparkSession)
  extends BaseRelation with FileRelation {

  override def sqlContext: SQLContext = sparkSession.sqlContext

  private def getColName(f: StructField): String = {
    if (sparkSession.sessionState.conf.caseSensitiveAnalysis) {
      f.name
    } else {
      f.name.toLowerCase(Locale.ROOT)
    }
  }

  val overlappedPartCols = mutable.Map.empty[String, StructField]
  partitionSchema.foreach { partitionField =>
    if (dataSchema.exists(getColName(_) == getColName(partitionField))) {
      overlappedPartCols += getColName(partitionField) -> partitionField
    }
  }

  // When data and partition schemas have overlapping columns, the output
  // schema respects the order of the data schema for the overlapping columns, and it
  // respects the data types of the partition schema.
  val schema: StructType = {
    StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++
      partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f))))
  }

  def partitionSchemaOption: Option[StructType] =
    if (partitionSchema.isEmpty) None else Some(partitionSchema)

  override def toString: String = {
    fileFormat match {
      case source: DataSourceRegister => source.shortName()
      case _ => "HadoopFiles"
    }
  }

  override def sizeInBytes: Long = {
    val compressionFactor = sqlContext.conf.fileCompressionFactor
    (location.sizeInBytes * compressionFactor).toLong
  }

  override def inputFiles: Array[String] = location.inputFiles
}
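Compared with the earlier variants, this version scales the raw size reported by the FileIndex by a configurable compression factor, so compressed columnar data is not under-estimated by the optimizer (for example in broadcast-join decisions). The sketch below is not part of the Spark-2.3.1 sources and uses an invented path; in this Spark line the factor is exposed as the spark.sql.sources.fileCompressionFactor conf.

// Sketch: tuning the compression factor that feeds sizeInBytes above.
import org.apache.spark.sql.SparkSession

object CompressionFactorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("compression-factor-sketch")
      // Assume Parquet data decompresses to roughly 3x its on-disk size.
      .config("spark.sql.sources.fileCompressionFactor", "3.0")
      .getOrCreate()

    val df = spark.read.parquet("/tmp/some_parquet_table") // hypothetical path
    // The inflated estimate flows into plan statistics used by the optimizer.
    println(df.queryExecution.optimizedPlan.stats.sizeInBytes)

    spark.stop()
  }
}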