org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation.
Each example links back to the original project and source file noted in its header.
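All of the examples below follow the same basic contract: a relation that may appear more than once in a single logical plan (for example, on both sides of a self-join) mixes in MultiInstanceRelation and implements newInstance(), which returns a copy of the relation whose output attributes carry fresh expression IDs. Here is a minimal sketch of that contract; the class name MyCustomRelation and its handle field are hypothetical and not taken from any of the projects below.

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode

// Hypothetical leaf relation over some opaque data handle; the only thing
// MultiInstanceRelation requires is the newInstance() override.
case class MyCustomRelation(output: Seq[Attribute], handle: String)
  extends LeafNode with MultiInstanceRelation {

  // Return a copy whose output attributes carry fresh expression IDs, so the
  // analyzer can distinguish two occurrences of this relation in one plan.
  override def newInstance(): MyCustomRelation =
    copy(output = output.map(_.newInstance()))
}

Each example below is a real-world variant of this pattern, usually adding statistics, partitioning, or index construction on top.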
Example 1
Source File: ExistingRDD.scala From iolap with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

private[sql] case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext)
  extends LogicalPlan with MultiInstanceRelation {

  override def children: Seq[LogicalPlan] = Nil

  override def newInstance(): this.type =
    LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type]

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LogicalRDD(_, otherRDD) => rows == rows
    case _ => false
  }

  @transient override lazy val statistics: Statistics = Statistics(
    // TODO: Improve the statistics estimation.
    // This is made small enough so it can be broadcasted.
    sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1
  )
}
Example 2
Source File: RTreeIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.ShapeType
import org.apache.spark.sql.simba.partitioner.STRPartition
import org.apache.spark.sql.simba.util.ShapeUtils
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class RTreeIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var global_rtree: RTree = null)
  extends IndexedRelation with MultiInstanceRelation {

  var isPoint = false

  private def checkKeys: Boolean = {
    if (column_keys.length > 1) {
      for (i <- column_keys.indices)
        if (!column_keys(i).dataType.isInstanceOf[NumericType]) {
          return false
        }
      true
    } else { // length = 1; we do not support one dimension R-tree
      column_keys.head.dataType match {
        case t: ShapeType =>
          isPoint = true
          true
        case _ => false
      }
    }
  }
  require(checkKeys)

  val dimension =
    ShapeUtils.getPointFromRow(child.execute().first(), column_keys, child, isPoint).coord.length

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions
    val maxEntriesPerNode = simbaSession.sessionState.simbaConf.maxEntriesPerNode
    val sampleRate = simbaSession.sessionState.simbaConf.sampleRate
    val transferThreshold = simbaSession.sessionState.simbaConf.transferThreshold
    val dataRDD = child.execute().map(row => {
      (ShapeUtils.getPointFromRow(row, column_keys, child, isPoint), row)
    })

    val max_entries_per_node = maxEntriesPerNode
    val (partitionedRDD, mbr_bounds) = STRPartition(dataRDD, dimension, numShufflePartitions,
      sampleRate, transferThreshold, max_entries_per_node)

    val indexed = partitionedRDD.mapPartitions { iter =>
      val data = iter.toArray
      var index: RTree = null
      if (data.length > 0) index = RTree(data.map(_._1).zipWithIndex, max_entries_per_node)
      Array(IPartition(data.map(_._2), index)).iterator
    }.persist(StorageLevel.MEMORY_AND_DISK_SER)

    val partitionSize = indexed.mapPartitions(iter => iter.map(_.data.length)).collect()

    global_rtree = RTree(mbr_bounds.zip(partitionSize)
      .map(x => (x._1._1, x._1._2, x._2)), max_entries_per_node)
    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    RTreeIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    RTreeIndexedRelation(new_output, child, table_name, column_keys, index_name)(
      _indexedRDD, global_rtree)
  }
}
Example 3
Source File: QuadTreeIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DoubleType, IntegerType}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.sql.simba.partitioner.QuadTreePartitioner
import org.apache.spark.sql.simba.spatial.Point

private[simba] case class QuadTreeIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var global_index: QuadTree = null)
  extends IndexedRelation with MultiInstanceRelation {

  private def checkKeys: Boolean = {
    for (i <- column_keys.indices)
      if (!(column_keys(i).dataType.isInstanceOf[DoubleType] ||
        column_keys(i).dataType.isInstanceOf[IntegerType])) {
        return false
      }
    true
  }
  require(checkKeys)

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions
    val sampleRate = simbaSession.sessionState.simbaConf.sampleRate
    val tranferThreshold = simbaSession.sessionState.simbaConf.transferThreshold

    val dataRDD = child.execute().map(row => {
      val now = column_keys.map(x =>
        BindReferences.bindReference(x, child.output).eval(row).asInstanceOf[Number].doubleValue()
      ).toArray
      (new Point(now), row)
    })

    val dimension = column_keys.length
    val (partitionedRDD, _, global_qtree) = QuadTreePartitioner(dataRDD, dimension,
      numShufflePartitions, sampleRate, tranferThreshold)

    val indexed = partitionedRDD.mapPartitions { iter =>
      val data = iter.toArray
      val index: QuadTree =
        if (data.length > 0) QuadTree(data.map(_._1).zipWithIndex)
        else null
      Array(IPartition(data.map(_._2), index)).iterator
    }.persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(name => s"$name $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
    global_index = global_qtree
  }

  override def newInstance(): IndexedRelation = {
    new QuadTreeIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    new QuadTreeIndexedRelation(new_output, child, table_name, column_keys, index_name)(
      _indexedRDD, global_index)
  }
}
Example 4
Source File: TreeMapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.RangePartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class TreeMapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var range_bounds: Array[Double] = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
        .asInstanceOf[Double]
      (eval_key, row)
    })

    val (partitionedRDD, tmp_bounds) = RangePartition.rowPartition(dataRDD, numShufflePartitions)
    range_bounds = tmp_bounds

    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = TreeMapIndex(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    TreeMapIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    TreeMapIndexedRelation(new_output, child, table_name, column_keys, index_name)(
      _indexedRDD, range_bounds)
  }
}
Example 5
Source File: TreapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.RangePartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class TreapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null, var range_bounds: Array[Double] = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
        .asInstanceOf[Double]
      (eval_key, row)
    })

    val (partitionedRDD, tmp_bounds) = RangePartition.rowPartition(dataRDD, numShufflePartitions)
    range_bounds = tmp_bounds

    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = Treap(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    TreapIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    TreapIndexedRelation(new_output, child, table_name, column_keys, index_name)(
      _indexedRDD, range_bounds)
  }
}
Example 6
Source File: HashMapIndexedRelation.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.index

import org.apache.spark.sql.simba.partitioner.HashPartition
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, BindReferences}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.NumericType
import org.apache.spark.storage.StorageLevel

private[simba] case class HashMapIndexedRelation(output: Seq[Attribute], child: SparkPlan,
    table_name: Option[String], column_keys: List[Attribute], index_name: String)
    (var _indexedRDD: IndexedRDD = null)
  extends IndexedRelation with MultiInstanceRelation {

  require(column_keys.length == 1)
  require(column_keys.head.dataType.isInstanceOf[NumericType])

  if (_indexedRDD == null) {
    buildIndex()
  }

  private[simba] def buildIndex(): Unit = {
    val numShufflePartitions = simbaSession.sessionState.simbaConf.indexPartitions

    val dataRDD = child.execute().map(row => {
      val eval_key = BindReferences.bindReference(column_keys.head, child.output).eval(row)
      (eval_key, row)
    })

    val partitionedRDD = HashPartition(dataRDD, numShufflePartitions)

    val indexed = partitionedRDD.mapPartitions(iter => {
      val data = iter.toArray
      val index = HashMapIndex(data)
      Array(IPartition(data.map(_._2), index)).iterator
    }).persist(StorageLevel.MEMORY_AND_DISK_SER)

    indexed.setName(table_name.map(n => s"$n $index_name").getOrElse(child.toString))
    _indexedRDD = indexed
  }

  override def newInstance(): IndexedRelation = {
    HashMapIndexedRelation(output.map(_.newInstance()), child, table_name,
      column_keys, index_name)(_indexedRDD).asInstanceOf[this.type]
  }

  override def withOutput(new_output: Seq[Attribute]): IndexedRelation = {
    HashMapIndexedRelation(new_output, child, table_name, column_keys, index_name)(_indexedRDD)
  }
}
Example 7
Source File: GenomicInterval.scala From bdg-sequila with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Range, Statistics}
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types._
import org.biodatageeks.sequila.utils.Columns

case class GenomicInterval(
    contig: String,
    start: Int,
    end: Int,
    output: Seq[Attribute])
  extends LeafNode with MultiInstanceRelation with Serializable {

  override def newInstance(): GenomicInterval = copy(output = output.map(_.newInstance()))

  def computeStats(conf: SQLConf): Statistics = {
    val sizeInBytes = IntegerType.defaultSize * 2 // FIXME: Add contigName size
    Statistics(sizeInBytes = sizeInBytes)
  }

  override def simpleString: String = {
    s"GenomicInterval ($contig, $start, $end)"
  }
}

object GenomicInterval {
  def apply(contig: String, start: Int, end: Int): GenomicInterval = {
    val output = StructType(Seq(
      StructField(s"${Columns.CONTIG}", StringType, nullable = false),
      StructField(s"${Columns.START}", IntegerType, nullable = false),
      StructField(s"${Columns.END}", IntegerType, nullable = false))
    ).toAttributes
    new GenomicInterval(contig, start, end, output)
  }
}
Example 8
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericMutableRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
}

private[sql] case class PhysicalRDD(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String,
    override val metadata: Map[String, String] = Map.empty,
    override val outputsUnsafeRows: Boolean = false)
  extends LeafNode {

  protected override def doExecute(): RDD[InternalRow] = rdd

  override def simpleString: String = {
    val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value"
    s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}"
  }
}

private[sql] object PhysicalRDD {
  // Metadata keys
  val INPUT_PATHS = "InputPaths"
  val PUSHED_FILTERS = "PushedFilters"

  def createFromDataSource(
      output: Seq[Attribute],
      rdd: RDD[InternalRow],
      relation: BaseRelation,
      metadata: Map[String, String] = Map.empty): PhysicalRDD = {
    // All HadoopFsRelations output UnsafeRows
    val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation]
    PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows)
  }
}
Example 9
Source File: DataSourceV2Relation.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, Statistics}
import org.apache.spark.sql.sources.v2.reader._

case class DataSourceV2Relation(
    output: Seq[AttributeReference],
    reader: DataSourceReader)
  extends LeafNode with MultiInstanceRelation with DataSourceReaderHolder {

  override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2Relation]

  override def computeStats(): Statistics = reader match {
    case r: SupportsReportStatistics =>
      Statistics(sizeInBytes = r.getStatistics.sizeInBytes().orElse(conf.defaultSizeInBytes))
    case _ =>
      Statistics(sizeInBytes = conf.defaultSizeInBytes)
  }

  override def newInstance(): DataSourceV2Relation = {
    copy(output = output.map(_.newInstance()))
  }
}

class StreamingDataSourceV2Relation(
    output: Seq[AttributeReference],
    reader: DataSourceReader)
  extends DataSourceV2Relation(output, reader) {
  override def isStreaming: Boolean = true
}

object DataSourceV2Relation {
  def apply(reader: DataSourceReader): DataSourceV2Relation = {
    new DataSourceV2Relation(reader.readSchema().toAttributes, reader)
  }
}
Example 10
Source File: LogicalRelation.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Constructor reconstructed from the newInstance()/apply calls below; the rest
// of the class body is omitted in this excerpt.
case class LogicalRelation(
    relation: BaseRelation,
    output: Seq[AttributeReference],
    catalogTable: Option[CatalogTable],
    override val isStreaming: Boolean)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String =
    s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 11
Source File: LogicalRelation.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Constructor reconstructed from the newInstance() call below; the class body
// that defines output and other members is omitted in this excerpt.
case class LogicalRelation(
    relation: BaseRelation,
    expectedOutputAttributes: Option[Seq[Attribute]] = None,
    catalogTable: Option[CatalogTable] = None)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String =
    s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 12
Source File: LogicalRelation.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Constructor reconstructed from the newInstance() call below; the class body
// that defines output and other members is omitted in this excerpt.
case class LogicalRelation(
    relation: BaseRelation,
    expectedOutputAttributes: Option[Seq[Attribute]] = None,
    catalogTable: Option[CatalogTable] = None)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String =
    s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 13
Source File: DescribeDeltaHistoryCommand.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.commands

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier}
import org.apache.spark.sql.delta.actions.CommitInfo
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.command.RunnableCommand

case class DescribeDeltaHistoryCommand(
    path: Option[String],
    tableIdentifier: Option[TableIdentifier],
    limit: Option[Int],
    override val output: Seq[Attribute] = ExpressionEncoder[CommitInfo]().schema.toAttributes)
  extends RunnableCommand with DeltaLogging {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val basePath =
      if (path.nonEmpty) {
        new Path(path.get)
      } else if (tableIdentifier.nonEmpty) {
        val sessionCatalog = sparkSession.sessionState.catalog
        lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get)

        DeltaTableIdentifier(sparkSession, tableIdentifier.get) match {
          case Some(id) if id.path.nonEmpty =>
            new Path(id.path.get)
          case Some(id) if id.table.nonEmpty =>
            new Path(metadata.location)
          case _ =>
            if (metadata.tableType == CatalogTableType.VIEW) {
              throw DeltaErrors.describeViewHistory
            }
            throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
        }
      } else {
        throw DeltaErrors.missingTableIdentifierException("DESCRIBE HISTORY")
      }

    // Max array size
    if (limit.exists(_ > Int.MaxValue - 8)) {
      throw new IllegalArgumentException("Please use a limit less than Int.MaxValue - 8.")
    }

    val deltaLog = DeltaLog.forTable(sparkSession, basePath)
    recordDeltaOperation(deltaLog, "delta.ddl.describeHistory") {
      if (deltaLog.snapshot.version == -1) {
        throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
      }

      import sparkSession.implicits._
      deltaLog.history.getHistory(limit).toDF().collect().toSeq
    }
  }
}
Example 14
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0
package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] case class PhysicalDStream(output: Seq[Attribute],
    @transient stream: DStream[InternalRow])
  extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
}
Example 15
Source File: LogicalRelation.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Constructor reconstructed from the newInstance() call below; the class body
// that defines output and other members is omitted in this excerpt.
case class LogicalRelation(
    relation: BaseRelation,
    expectedOutputAttributes: Option[Seq[Attribute]] = None,
    catalogTable: Option[CatalogTable] = None)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): this.type = {
    LogicalRelation(
      relation,
      expectedOutputAttributes.map(_.map(_.newInstance())),
      catalogTable).asInstanceOf[this.type]
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String =
    s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}
Example 16
Source File: StreamingRelation.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.sources.v2.{ContinuousReadSupport, DataSourceV2}

object StreamingRelation {
  def apply(dataSource: DataSource): StreamingRelation = {
    StreamingRelation(
      dataSource, dataSource.sourceInfo.name, dataSource.sourceInfo.schema.toAttributes)
  }
}

case class StreamingRelationExec(sourceName: String, output: Seq[Attribute]) extends LeafExecNode {
  override def toString: String = sourceName
  override protected def doExecute(): RDD[InternalRow] = {
    throw new UnsupportedOperationException("StreamingRelationExec cannot be executed")
  }
}

object StreamingExecutionRelation {
  def apply(source: Source, session: SparkSession): StreamingExecutionRelation = {
    StreamingExecutionRelation(source, source.schema.toAttributes)(session)
  }
}
Example 17
Source File: ExistingRDD.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
}

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    name: String,
    override val outputPartitioning: Partitioning = UnknownPartitioning(0),
    override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode {

  private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("")

  override val nodeName: String = s"Scan $name$rddName"

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsWithIndexInternal { (index, iter) =>
      val proj = UnsafeProjection.create(schema)
      proj.initialize(index)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
}
Example 18
Source File: LogicalRelation.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.{AttributeMap, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.util.Utils

// Constructor reconstructed from the newInstance()/apply calls below; the rest
// of the class body is omitted in this excerpt.
case class LogicalRelation(
    relation: BaseRelation,
    output: Seq[AttributeReference],
    catalogTable: Option[CatalogTable],
    override val isStreaming: Boolean)
  extends LeafNode with MultiInstanceRelation {

  override def newInstance(): LogicalRelation = {
    this.copy(output = output.map(_.newInstance()))
  }

  override def refresh(): Unit = relation match {
    case fs: HadoopFsRelation => fs.location.refresh()
    case _ =>  // Do nothing.
  }

  override def simpleString: String =
    s"Relation[${Utils.truncatedString(output, ",")}] $relation"
}

object LogicalRelation {
  def apply(relation: BaseRelation, isStreaming: Boolean = false): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, None, isStreaming)

  def apply(relation: BaseRelation, table: CatalogTable): LogicalRelation =
    LogicalRelation(relation, relation.schema.toAttributes, Some(table), false)
}
Example 19
Source File: ExistingRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
}

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsInternal { iter =>
      val proj = UnsafeProjection.create(schema)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
}