org.apache.spark.util.SerializableConfiguration Scala Examples
The following examples show how to use org.apache.spark.util.SerializableConfiguration.
Each example is taken from an open source project; the source file and the project it comes from are noted above the code.
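Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them follow: wrap the driver's Hadoop Configuration (which is not Java-serializable) in SerializableConfiguration, broadcast it, and unwrap it on the executors. The object name, package, and toy RDD below are illustrative and not taken from any of the projects listed; note also that in some Spark versions SerializableConfiguration is private[spark], which is why Example 12 places its code under the org.apache.spark package.

package org.apache.spark.example // sub-package of org.apache.spark, so private[spark] members stay accessible

import org.apache.spark.SparkContext
import org.apache.spark.util.SerializableConfiguration

// Illustrative driver program: broadcast a Hadoop Configuration and read it inside tasks.
object BroadcastHadoopConfExample {
  def main(args: Array[String]): Unit = {
    val sc = SparkContext.getOrCreate()

    // Hadoop's Configuration is not Serializable, so wrap it before shipping it to executors.
    val confBroadcast = sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val defaultFs = sc.parallelize(1 to 4, numSlices = 2).map { _ =>
      // The first .value unwraps the broadcast; the second .value yields the Hadoop Configuration.
      confBroadcast.value.value.get("fs.defaultFS", "unset")
    }.collect()

    defaultFs.foreach(println)
    sc.stop()
  }
}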
Example 1
Source File: TextFileFormat.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources.text

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileStatus, Path}
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.io.compress.GzipCodec
import org.apache.hadoop.mapreduce.{Job, RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.{FileOutputFormat, TextOutputFormat}
import org.apache.hadoop.util.ReflectionUtils

import org.apache.spark.TaskContext
import org.apache.spark.sql.{AnalysisException, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.util.CompressionCodecs
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.util.SerializableConfiguration

// Note: only the compression helper is shown here; the enclosing declaration is elided in this excerpt.
  def getCompressionExtension(context: TaskAttemptContext): String = {
    // Set the compression extension, similar to code in TextOutputFormat.getDefaultWorkFile
    if (FileOutputFormat.getCompressOutput(context)) {
      val codecClass = FileOutputFormat.getOutputCompressorClass(context, classOf[GzipCodec])
      ReflectionUtils.newInstance(codecClass, context.getConfiguration).getDefaultExtension
    } else {
      ""
    }
  }
}
Example 2
Source File: StateStoreRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    // confBroadcast.value is the SerializableConfiguration; its .value is the Hadoop Configuration.
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 3
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.datasources

import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.SQLExecution
import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics}
import org.apache.spark.util.SerializableConfiguration

class BasicWriteJobStatsTracker(
    serializableHadoopConf: SerializableConfiguration,
    @transient val metrics: Map[String, SQLMetric])
  extends WriteJobStatsTracker {

  override def newTaskInstance(): WriteTaskStatsTracker = {
    new BasicWriteTaskStatsTracker(serializableHadoopConf.value)
  }

  override def processStats(stats: Seq[WriteTaskStats]): Unit = {
    val sparkContext = SparkContext.getActive.get
    var numPartitions: Long = 0L
    var numFiles: Long = 0L
    var totalNumBytes: Long = 0L
    var totalNumOutput: Long = 0L

    val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats])

    basicStats.foreach { summary =>
      numPartitions += summary.numPartitions
      numFiles += summary.numFiles
      totalNumBytes += summary.numBytes
      totalNumOutput += summary.numRows
    }

    metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes)
    metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput)
    metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions)

    val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)
    SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList)
  }
}

object BasicWriteJobStatsTracker {
  private val NUM_FILES_KEY = "numFiles"
  private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes"
  private val NUM_OUTPUT_ROWS_KEY = "numOutputRows"
  private val NUM_PARTS_KEY = "numParts"

  def metrics: Map[String, SQLMetric] = {
    val sparkContext = SparkContext.getActive.get
    Map(
      NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"),
      NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"),
      NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"),
      NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part")
    )
  }
}
Example 4
Source File: DataWritingCommand.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.command

import org.apache.hadoop.conf.Configuration

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker
import org.apache.spark.sql.execution.datasources.FileFormatWriter
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.util.SerializableConfiguration

// Note: only one helper method is shown here; the enclosing declaration is elided in this excerpt.
  def logicalPlanOutputWithNames(
      query: LogicalPlan,
      names: Seq[String]): Seq[Attribute] = {
    // Save the output attributes to a variable to avoid duplicated function calls.
    val outputAttributes = query.output
    assert(outputAttributes.length == names.length,
      "The length of provided names doesn't match the length of output attributes.")
    outputAttributes.zip(names).map { case (attr, outputName) =>
      attr.withName(outputName)
    }
  }
}
Example 5
Source File: FileStreamSink.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import scala.util.control.NonFatal

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

import org.apache.spark.internal.Logging
import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter}
import org.apache.spark.util.SerializableConfiguration

object FileStreamSink extends Logging {
  // The name of the subdirectory that is used to store metadata about which files are valid.
  val metadataDir = "_spark_metadata"
  // ... (remaining members of the companion object are elided in this excerpt)
}

class FileStreamSink(
    sparkSession: SparkSession,
    path: String,
    fileFormat: FileFormat,
    partitionColumnNames: Seq[String],
    options: Map[String, String]) extends Sink with Logging {

  private val basePath = new Path(path)
  private val logPath = new Path(basePath, FileStreamSink.metadataDir)
  private val fileLog =
    new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString)
  private val hadoopConf = sparkSession.sessionState.newHadoopConf()

  private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = {
    val serializableHadoopConf = new SerializableConfiguration(hadoopConf)
    new BasicWriteJobStatsTracker(serializableHadoopConf, BasicWriteJobStatsTracker.metrics)
  }

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) {
      logInfo(s"Skipping already committed batch $batchId")
    } else {
      val committer = FileCommitProtocol.instantiate(
        className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass,
        jobId = batchId.toString,
        outputPath = path)

      committer match {
        case manifestCommitter: ManifestFileCommitProtocol =>
          manifestCommitter.setupManifestOptions(fileLog, batchId)
        case _ => // Do nothing
      }

      // Get the actual partition columns as attributes after matching them by name with
      // the given columns names.
      val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col =>
        val nameEquality = data.sparkSession.sessionState.conf.resolver
        data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse {
          throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}")
        }
      }
      val qe = data.queryExecution

      FileFormatWriter.write(
        sparkSession = sparkSession,
        plan = qe.executedPlan,
        fileFormat = fileFormat,
        committer = committer,
        outputSpec = FileFormatWriter.OutputSpec(path, Map.empty, qe.analyzed.output),
        hadoopConf = hadoopConf,
        partitionColumns = partitionColumns,
        bucketSpec = None,
        statsTrackers = Seq(basicWriteJobStatsTracker),
        options = options)
    }
  }

  override def toString: String = s"FileSink[$path]"
}
Example 6
Source File: StateStoreRDD.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.EpochTracker
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

// Note: the StateStoreRDD class declaration and constructor (which set up queryRunId, storeConf
// and the broadcast hadoopConfBroadcast) are elided in this excerpt; only two methods are shown.
  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)

    // If we're in continuous processing mode, we should get the store version for the current
    // epoch rather than the one at planning time.
    val isContinuous = Option(ctxt.getLocalProperty(StreamExecution.IS_CONTINUOUS_PROCESSING))
      .map(_.toBoolean).getOrElse(false)
    val currentVersion = if (isContinuous) {
      val epoch = EpochTracker.getCurrentEpoch
      assert(epoch.isDefined, "Current epoch must be defined for continuous processing streams.")
      epoch.get
    } else {
      storeVersion
    }

    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, currentVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 7
Source File: ArrowScan.scala From OAP with Apache License 2.0
package com.intel.oap.spark.sql.execution.datasources.v2.arrow

import scala.collection.JavaConverters._

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.connector.read.PartitionReaderFactory
import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
import org.apache.spark.sql.execution.datasources.v2.FileScan
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap
import org.apache.spark.util.SerializableConfiguration

case class ArrowScan(
    sparkSession: SparkSession,
    fileIndex: PartitioningAwareFileIndex,
    readDataSchema: StructType,
    readPartitionSchema: StructType,
    pushedFilters: Array[Filter],
    options: CaseInsensitiveStringMap,
    partitionFilters: Seq[Expression] = Seq.empty,
    dataFilters: Seq[Expression] = Seq.empty)
  extends FileScan {

  override def createReaderFactory(): PartitionReaderFactory = {
    val caseSensitiveMap = options.asCaseSensitiveMap().asScala.toMap
    val hconf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
    val broadcastedConf =
      sparkSession.sparkContext.broadcast(new SerializableConfiguration(hconf))
    ArrowPartitionReaderFactory(
      sparkSession.sessionState.conf,
      broadcastedConf,
      readDataSchema,
      readPartitionSchema,
      pushedFilters,
      new ArrowOptions(options.asScala.toMap))
  }

  override def withFilters(
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression]): FileScan =
    this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
}
Example 8
Source File: StateStoreRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 9
Source File: StateStoreRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

class StateStoreRDD[T: ClassTag, U: ClassTag](
    dataRDD: RDD[T],
    storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U],
    checkpointLocation: String,
    operatorId: Long,
    storeVersion: Long,
    keySchema: StructType,
    valueSchema: StructType,
    sessionState: SessionState,
    @transient private val storeCoordinator: Option[StateStoreCoordinatorRef])
  extends RDD[U](dataRDD) {

  private val storeConf = new StateStoreConf(sessionState.conf)

  // A Hadoop Configuration can be about 10 KB, which is pretty big, so broadcast it
  private val confBroadcast = dataRDD.context.broadcast(
    new SerializableConfiguration(sessionState.newHadoopConf()))

  override protected def getPartitions: Array[Partition] = dataRDD.partitions

  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    storeCoordinator.flatMap(_.getLocation(storeId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeId = StateStoreId(checkpointLocation, operatorId, partition.index)
    store = StateStore.get(
      storeId, keySchema, valueSchema, storeVersion, storeConf, confBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 10
Source File: StateStoreRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming.state

import java.util.UUID

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.internal.SessionState
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

// Note: the StateStoreRDD class declaration and constructor (which set up queryRunId, storeConf
// and the broadcast hadoopConfBroadcast) are elided in this excerpt; only two methods are shown.
  override def getPreferredLocations(partition: Partition): Seq[String] = {
    val stateStoreProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)
    storeCoordinator.flatMap(_.getLocation(stateStoreProviderId)).toSeq
  }

  override def compute(partition: Partition, ctxt: TaskContext): Iterator[U] = {
    var store: StateStore = null
    val storeProviderId = StateStoreProviderId(
      StateStoreId(checkpointLocation, operatorId, partition.index), queryRunId)
    store = StateStore.get(
      storeProviderId, keySchema, valueSchema, indexOrdinal, storeVersion,
      storeConf, hadoopConfBroadcast.value.value)
    val inputIter = dataRDD.iterator(partition, ctxt)
    storeUpdateFunction(store, inputIter)
  }
}
Example 11
Source File: HDFSMetaDataCommiterSuite.scala From kinesis-sql with Apache License 2.0
package org.apache.spark.sql.kinesis

import java.io.File

import scala.language.implicitConversions

import org.apache.hadoop.conf.Configuration

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.util.SerializableConfiguration

class HDFSMetaDataCommiterSuite extends SparkFunSuite with SharedSQLContext {

  val testConf: Configuration = new Configuration()
  val serializedConf = new SerializableConfiguration(testConf)

  test("Add and Get operation") {
    withTempDir { temp =>
      val dir = new File(temp, "commit")
      val metadataCommitter =
        new HDFSMetadataCommitter[String](dir.getAbsolutePath, serializedConf)
      assert(metadataCommitter.add(0, "Shard-000001", "foo"))
      assert(metadataCommitter.get(0) === Seq("foo"))

      assert(metadataCommitter.add(1, "Shard-000001", "one"))
      assert(metadataCommitter.add(1, "Shard-000002", "two"))
      assert(metadataCommitter.get(1).toSet === Set("one", "two"))

      // Adding the same batch over-writes the previous entry
      // This is required since re-attempt of a failed task will
      // update in same location
      assert(metadataCommitter.add(1, "Shard-000001", "updated-one"))
      assert(metadataCommitter.get(1).toSet === Set("updated-one", "two"))
    }
  }

  test("Purge operation") {
    withTempDir { temp =>
      val metadataCommitter = new HDFSMetadataCommitter[String](
        temp.getAbsolutePath, serializedConf)

      assert(metadataCommitter.add(0, "Shard-000001", "one"))
      assert(metadataCommitter.add(1, "Shard-000001", "two"))
      assert(metadataCommitter.add(2, "Shard-000001", "three"))

      assert(metadataCommitter.get(0).nonEmpty)
      assert(metadataCommitter.get(1).nonEmpty)
      assert(metadataCommitter.get(2).nonEmpty)

      metadataCommitter.purge(2)
      assertThrows[IllegalStateException](metadataCommitter.get(0))
      assertThrows[IllegalStateException](metadataCommitter.get(1))
      assert(metadataCommitter.get(2).nonEmpty)

      // There should be exactly one file, called "2", in the metadata directory.
      val allFiles = new File(metadataCommitter.metadataPath.toString).listFiles().toSeq
      assert(allFiles.size == 1)
      assert(allFiles.head.getName == "2")
    }
  }
}
Example 12
Source File: TextFileOverwrite.scala From spark_helper with Apache License 2.0
package org.apache.spark

import org.apache.spark.rdd.{HadoopRDD, RDD}
import org.apache.spark.util.SerializableConfiguration

import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.fs.Path

object TextFileOverwrite {

  def textFile(
      paths: Seq[String],
      minPartitions: Int,
      sc: SparkContext
  ): RDD[String] = {

    val confBroadcast =
      sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))

    val setInputPathsFunc =
      (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, paths.map(p => new Path(p)): _*)

    new HadoopRDD(
      sc,
      confBroadcast,
      Some(setInputPathsFunc),
      classOf[TextInputFormat],
      classOf[LongWritable],
      classOf[Text],
      minPartitions
    ).map(pair => pair._2.toString)
  }
}
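A possible call site for the helper above; the object name, input paths, and partition count below are made up for illustration and are not part of the spark_helper project.

package org.apache.spark.example

import org.apache.spark.{SparkContext, TextFileOverwrite}

// Hypothetical usage of TextFileOverwrite.textFile from a driver program.
object TextFileOverwriteUsage {
  def main(args: Array[String]): Unit = {
    val sc = SparkContext.getOrCreate()
    val lines = TextFileOverwrite.textFile(
      paths = Seq("hdfs:///data/a.txt", "hdfs:///data/b.txt"),
      minPartitions = 2,
      sc = sc)
    println(lines.count())
    sc.stop()
  }
}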
Example 13
Source File: ReliableRDDCheckpointData.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.fs.Path

import org.apache.spark._
import org.apache.spark.util.SerializableConfiguration

// Note: only the checkpoint-cleanup helper is shown; the ReliableRDDCheckpointData class and the
// checkpointPath helper it relies on are elided in this excerpt.
  def cleanCheckpoint(sc: SparkContext, rddId: Int): Unit = {
    checkpointPath(sc, rddId).foreach { path =>
      val fs = path.getFileSystem(sc.hadoopConfiguration)
      if (fs.exists(path)) {
        if (!fs.delete(path, true)) {
          logWarning(s"Error deleting ${path.toString()}")
        }
      }
    }
  }
}