org.apache.spark.sql.execution.metric.SQLMetric Scala Examples
The following examples show how to use org.apache.spark.sql.execution.metric.SQLMetric.
Each example notes the source file it was taken from, the project it comes from, and that project's license.
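Before the project examples, here is a minimal sketch (not taken from any of the projects below) of the basic SQLMetric life cycle: a metric is created on the driver with SQLMetrics.createMetric, incremented on executors like any other accumulator, and its merged value is read back on the driver. The object name, app name, and metric description are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.metric.SQLMetrics

object SQLMetricQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sqlmetric-sketch").getOrCreate()
    val sc = spark.sparkContext

    // SQLMetric is an AccumulatorV2[Long, Long]; createMetric registers it with the SparkContext.
    val rowCount = SQLMetrics.createMetric(sc, "number of processed rows")

    // Increment on executors; Spark merges the per-task values back on the driver.
    sc.parallelize(1 to 1000, numSlices = 4).foreach(_ => rowCount += 1)

    println(s"processed rows = ${rowCount.value}")  // expected: 1000
    spark.stop()
  }
}

The examples below show the same pattern embedded in real operators, writers, and sinks.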
Example 1
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.util.SerializableConfiguration

class BasicWriteJobStatsTracker(
    serializableHadoopConf: SerializableConfiguration,
    @transient val metrics: Map[String, SQLMetric])
  extends WriteJobStatsTracker {

  override def newTaskInstance(): WriteTaskStatsTracker = {
    new BasicWriteTaskStatsTracker(serializableHadoopConf.value)
  }

  override def processStats(stats: Seq[WriteTaskStats]): Unit = {
    val sparkContext = SparkContext.getActive.get
    var numPartitions: Long = 0L
    var numFiles: Long = 0L
    var totalNumBytes: Long = 0L
    var totalNumOutput: Long = 0L

    val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats])

    basicStats.foreach { summary =>
      numPartitions += summary.numPartitions
      numFiles += summary.numFiles
      totalNumBytes += summary.numBytes
      totalNumOutput += summary.numRows
    }

    metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput)
    metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions)

    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList)
  }
}

object BasicWriteJobStatsTracker {
  private val NUM_FILES_KEY = "numFiles"
  private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes"
  private val NUM_OUTPUT_ROWS_KEY = "numOutputRows"
  private val NUM_PARTS_KEY = "numParts"

  def metrics: Map[String, SQLMetric] = {
    val sparkContext = SparkContext.getActive.get
    Map(
      NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"),
      NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"),
      NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
      NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part")
    )
  }
}
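As a follow-up, the sketch below shows one way the tracker above could be constructed on the driver. The helper object and method are hypothetical (not part of the original file), and an active SparkContext is required because BasicWriteJobStatsTracker.metrics calls SparkContext.getActive.get.

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
import org.apache.spark.util.SerializableConfiguration

object BasicWriteJobStatsTrackerUsage {
  // Hypothetical helper: builds the job-level tracker with a fresh metrics map,
  // roughly the way a data-writing command does before kicking off a write job.
  def buildTracker(hadoopConf: Configuration): BasicWriteJobStatsTracker = {
    new BasicWriteJobStatsTracker(
      new SerializableConfiguration(hadoopConf),
      BasicWriteJobStatsTracker.metrics)
  }
}

In Spark itself, such trackers are handed to the file writer; processStats then aggregates the per-task stats and publishes them with SQLMetrics.postDriverMetricUpdates, as shown above.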
Example 2
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.command

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
import org.apache.spark.sql.execution.datasources.FileFormatWriter
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.util.SerializableConfiguration

// Only the companion object's helper is shown in this excerpt.
object DataWritingCommand {

  def logicalPlanOutputWithNames(
      query: LogicalPlan,
      names: Seq[String]): Seq[Attribute] = {
    // Save the output attributes to a variable to avoid duplicated function calls.
    val outputAttributes = query.output
    assert(outputAttributes.length == names.length,
      "The length of provided names doesn't match the length of output attributes.")
    outputAttributes.zip(names).map { case (attr, outputName) =>
      attr.withName(outputName)
    }
  }
}
Example 3
Source File: ShuffledColumnarBatchRDD.scala From OAP with Apache License 2.0
package org.apache.spark.sql.execution

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLShuffleReadMetricsReporter}
import org.apache.spark.sql.vectorized.ColumnarBatch

class ShuffledColumnarBatchRDD(
    var dependency: ShuffleDependency[Int, ColumnarBatch, ColumnarBatch],
    metrics: Map[String, SQLMetric],
    specifiedPartitionStartIndices: Option[Array[Int]] = None)
  extends RDD[ColumnarBatch](dependency.rdd.context, Nil) {

  private[this] val numPreShufflePartitions = dependency.partitioner.numPartitions

  private[this] val partitionStartIndices: Array[Int] = specifiedPartitionStartIndices match {
    case Some(indices) => indices
    case None =>
      // When specifiedPartitionStartIndices is not defined, every post-shuffle partition
      // corresponds to a pre-shuffle partition.
      (0 until numPreShufflePartitions).toArray
  }

  private[this] val part: Partitioner =
    new CoalescedPartitioner(dependency.partitioner, partitionStartIndices)

  override def getDependencies: Seq[Dependency[_]] = List(dependency)

  override val partitioner: Option[Partitioner] = Some(part)

  override def getPartitions: Array[Partition] = {
    assert(partitionStartIndices.length == part.numPartitions)
    Array.tabulate[Partition](partitionStartIndices.length) { i =>
      val startIndex = partitionStartIndices(i)
      val endIndex =
        if (i < partitionStartIndices.length - 1) {
          partitionStartIndices(i + 1)
        } else {
          numPreShufflePartitions
        }
      new ShuffledRDDPartition(i, startIndex, endIndex)
    }
  }

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
    val shuffledColumnarBatchPartition = split.asInstanceOf[ShuffledRDDPartition]
    val tempMetrics = context.taskMetrics().createTempShuffleReadMetrics()
    // `SQLShuffleReadMetricsReporter` will update its own metrics for SQL exchange operator,
    // as well as the `tempMetrics` for basic shuffle metrics.
    val sqlMetricsReporter = new SQLShuffleReadMetricsReporter(tempMetrics, metrics)
    // The range of pre-shuffle partitions that we are fetching at here is
    // [startPreShufflePartitionIndex, endPreShufflePartitionIndex - 1].
    val reader = SparkEnv.get.shuffleManager.getReader(
      dependency.shuffleHandle,
      shuffledColumnarBatchPartition.startPreShufflePartitionIndex,
      shuffledColumnarBatchPartition.endPreShufflePartitionIndex,
      context,
      sqlMetricsReporter)
    reader.read().asInstanceOf[Iterator[Product2[Int, ColumnarBatch]]].map(_._2)
  }

  override def clearDependencies() {
    super.clearDependencies()
    dependency = null
  }
}
Example 4
Source File: ColumnarDataSourceRDD.scala From OAP with Apache License 2.0
package com.intel.sparkColumnarPlugin.execution

import com.intel.sparkColumnarPlugin.vectorized._

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, PartitionReader}
import org.apache.spark.sql.execution.datasources.{FilePartition, PartitionedFile}
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}
import org.apache.spark.sql.execution.datasources.v2.VectorizedFilePartitionReaderHandler
import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetPartitionReaderFactory

class DataSourceRDDPartition(val index: Int, val inputPartition: InputPartition)
  extends Partition
  with Serializable

// TODO: we should have 2 RDDs: an RDD[InternalRow] for row-based scan, an `RDD[ColumnarBatch]` for
// columnar scan.
class ColumnarDataSourceRDD(
    sc: SparkContext,
    @transient private val inputPartitions: Seq[InputPartition],
    partitionReaderFactory: PartitionReaderFactory,
    columnarReads: Boolean,
    scanTime: SQLMetric)
  extends RDD[ColumnarBatch](sc, Nil) {

  override protected def getPartitions: Array[Partition] = {
    inputPartitions.zipWithIndex.map {
      case (inputPartition, index) => new DataSourceRDDPartition(index, inputPartition)
    }.toArray
  }

  private def castPartition(split: Partition): DataSourceRDDPartition = split match {
    case p: DataSourceRDDPartition => p
    case _ => throw new SparkException(s"[BUG] Not a DataSourceRDDPartition: $split")
  }

  override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
    val inputPartition = castPartition(split).inputPartition
    val reader = if (columnarReads) {
      partitionReaderFactory match {
        case factory: ParquetPartitionReaderFactory =>
          VectorizedFilePartitionReaderHandler.get(inputPartition, factory)
        case _ => partitionReaderFactory.createColumnarReader(inputPartition)
      }
    } else {
      partitionReaderFactory.createReader(inputPartition)
    }

    val rddId = this
    context.addTaskCompletionListener[Unit](_ => reader.close())
    val iter = new Iterator[Any] {
      private[this] var valuePrepared = false

      override def hasNext: Boolean = {
        if (!valuePrepared) {
          try {
            val beforeScan = System.nanoTime()
            valuePrepared = reader.next()
            scanTime += (System.nanoTime() - beforeScan) / (1000 * 1000)
          } catch {
            case e =>
              val errmsg = e.getStackTrace.mkString("\n")
              logError(s"hasNext got exception: $errmsg")
              valuePrepared = false
          }
        }
        valuePrepared
      }

      override def next(): Any = {
        if (!hasNext) {
          throw new java.util.NoSuchElementException("End of stream")
        }
        valuePrepared = false
        reader.get()
      }
    }
    val closeableColumnarBatchIterator =
      new CloseableColumnBatchIterator(iter.asInstanceOf[Iterator[ColumnarBatch]])
    // TODO: SPARK-25083 remove the type erasure hack in data source scan
    new InterruptibleIterator(context, closeableColumnarBatchIterator)
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    castPartition(split).inputPartition.preferredLocations()
  }
}
Example 5
Source File: ColumnarBatchScanExec.scala From OAP with Apache License 2.0
package com.intel.sparkColumnarPlugin.execution

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.datasources.v2.BatchScanExec
import org.apache.spark.sql.connector.read.{InputPartition, PartitionReaderFactory, Scan}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

class ColumnarBatchScanExec(
    output: Seq[AttributeReference],
    @transient scan: Scan)
  extends BatchScanExec(output, scan) {

  override def supportsColumnar(): Boolean = true

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "BatchScan elapse time"))

  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
    val numOutputRows = longMetric("numOutputRows")
    val scanTime = longMetric("scanTime")
    val inputColumnarRDD =
      new ColumnarDataSourceRDD(sparkContext, partitions, readerFactory, true, scanTime)
    inputColumnarRDD.map { r =>
      numOutputRows += r.numRows()
      r
    }
  }
}
Example 6
Source File: ColumnarSortExec.scala From OAP with Apache License 2.0
package com.intel.sparkColumnarPlugin.execution

import com.intel.sparkColumnarPlugin.expression._
import com.intel.sparkColumnarPlugin.vectorized._

import java.util.concurrent.TimeUnit._

import org.apache.spark.{SparkEnv, TaskContext, SparkContext}
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.sql.execution._
import org.apache.spark.sql.catalyst.expressions.SortOrder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.vectorized.{ColumnarBatch, ColumnVector}

class ColumnarSortExec(
    sortOrder: Seq[SortOrder],
    global: Boolean,
    child: SparkPlan,
    testSpillFrequency: Int = 0)
  extends SortExec(sortOrder, global, child, testSpillFrequency) {

  override def supportsColumnar = true

  // Disable code generation
  override def supportCodegen: Boolean = false

  override lazy val metrics = Map(
    "totalSortTime" -> SQLMetrics
      .createTimingMetric(sparkContext, "time in sort + shuffle process"),
    "sortTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in sort process"),
    "shuffleTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in shuffle process"),
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
    "numOutputBatches" -> SQLMetrics.createMetric(sparkContext, "number of output batches"))

  override def doExecuteColumnar(): RDD[ColumnarBatch] = {
    val elapse = longMetric("totalSortTime")
    val sortTime = longMetric("sortTime")
    val shuffleTime = longMetric("shuffleTime")
    val numOutputRows = longMetric("numOutputRows")
    val numOutputBatches = longMetric("numOutputBatches")

    child.executeColumnar().mapPartitions { iter =>
      val hasInput = iter.hasNext
      val res = if (!hasInput) {
        Iterator.empty
      } else {
        val sorter = ColumnarSorter.create(
          sortOrder,
          true,
          child.output,
          sortTime,
          numOutputBatches,
          numOutputRows,
          shuffleTime,
          elapse)
        TaskContext
          .get()
          .addTaskCompletionListener[Unit](_ => {
            sorter.close()
          })
        new CloseableColumnBatchIterator(sorter.createColumnarIterator(iter))
      }
      res
    }
  }
}
Example 7
Source File: DeltaSink.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.sources

import org.apache.spark.sql.delta._
import org.apache.spark.sql.delta.actions.SetTransaction
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.spark.sql.delta.schema.{ImplicitMetadataOperation, SchemaUtils}
import org.apache.hadoop.fs.Path

import org.apache.spark.SparkContext
import org.apache.spark.sql._
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.sql.execution.metric.SQLMetrics.createMetric
import org.apache.spark.sql.execution.streaming.{Sink, StreamExecution}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.NullType

class DeltaSink(
    sqlContext: SQLContext,
    path: Path,
    partitionColumns: Seq[String],
    outputMode: OutputMode,
    options: DeltaOptions)
  extends Sink
    with ImplicitMetadataOperation
    with DeltaLogging {

  private val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)

  private val sqlConf = sqlContext.sparkSession.sessionState.conf

  override protected val canOverwriteSchema: Boolean =
    outputMode == OutputMode.Complete() && options.canOverwriteSchema

  override protected val canMergeSchema: Boolean = options.canMergeSchema

  override def addBatch(batchId: Long, data: DataFrame): Unit = deltaLog.withNewTransaction { txn =>
    val sc = data.sparkSession.sparkContext
    val metrics = Map[String, SQLMetric](
      "numAddedFiles" -> createMetric(sc, "number of files added"),
      "numRemovedFiles" -> createMetric(sc, "number of files removed")
    )
    val queryId = sqlContext.sparkContext.getLocalProperty(StreamExecution.QUERY_ID_KEY)
    assert(queryId != null)

    if (SchemaUtils.typeExistsRecursively(data.schema)(_.isInstanceOf[NullType])) {
      throw DeltaErrors.streamWriteNullTypeException
    }

    // If the batch reads the same Delta table as this sink is going to write to, then this
    // write has dependencies. Then make sure that this commit set hasDependencies to true
    // by injecting a read on the whole table. This needs to be done explicitly because
    // MicroBatchExecution has already enforced all the data skipping (by forcing the generation
    // of the executed plan) even before the transaction was started.
    val selfScan = data.queryExecution.analyzed.collectFirst {
      case DeltaTable(index) if index.deltaLog.isSameLogAs(txn.deltaLog) => true
    }.nonEmpty
    if (selfScan) {
      txn.readWholeTable()
    }

    // Streaming sinks can't blindly overwrite schema. See Schema Management design doc for details
    updateMetadata(
      txn,
      data,
      partitionColumns,
      configuration = Map.empty,
      outputMode == OutputMode.Complete())

    val currentVersion = txn.txnVersion(queryId)
    if (currentVersion >= batchId) {
      logInfo(s"Skipping already complete epoch $batchId, in query $queryId")
      return
    }

    val deletedFiles = outputMode match {
      case o if o == OutputMode.Complete() =>
        deltaLog.assertRemovable()
        txn.filterFiles().map(_.remove)
      case _ => Nil
    }
    val newFiles = txn.writeFiles(data, Some(options))
    val setTxn = SetTransaction(queryId, batchId, Some(deltaLog.clock.getTimeMillis())) :: Nil
    val info = DeltaOperations.StreamingUpdate(outputMode, queryId, batchId, options.userMetadata)
    metrics("numRemovedFiles").set(deletedFiles.size)
    metrics("numAddedFiles").set(newFiles.size)
    txn.registerSQLMetrics(sqlContext.sparkSession, metrics)
    txn.commit(setTxn ++ newFiles ++ deletedFiles, info)
    // This is needed to make the SQL metrics visible in the Spark UI
    val executionId = sqlContext.sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(
      sqlContext.sparkContext, executionId, metrics.values.toSeq)
  }

  override def toString(): String = s"DeltaSink[$path]"
}
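Unlike the executor-side operators above, DeltaSink updates its metrics entirely on the driver: it sets their values and then posts them to the SQL UI under the current execution id. The following stripped-down sketch of that pattern is hypothetical (the object, method, and metric names are illustrative), and the posted values are only visible in the UI when the call happens inside a tracked SQL execution, i.e. when spark.sql.execution.id is set.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.SQLMetrics

object DriverSideMetricsSketch {
  // Hypothetical helper mirroring the tail of DeltaSink.addBatch: create metrics,
  // set their values on the driver, and post them under the current execution id.
  def report(spark: SparkSession, addedFiles: Long, removedFiles: Long): Unit = {
    val sc = spark.sparkContext
    val metrics = Map(
      "numAddedFiles" -> SQLMetrics.createMetric(sc, "number of files added"),
      "numRemovedFiles" -> SQLMetrics.createMetric(sc, "number of files removed"))

    metrics("numAddedFiles").set(addedFiles)
    metrics("numRemovedFiles").set(removedFiles)

    // Without an execution id on the local properties, this has no visible effect in the UI.
    val executionId = sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(sc, executionId, metrics.values.toSeq)
  }
}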