org.apache.spark.internal.Logging Scala Examples
The following examples show how to use org.apache.spark.internal.Logging.
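Every example below follows the same basic pattern: mix the Logging trait into a class or object and call the log methods it provides (logInfo, logWarning, logError, logDebug, logTrace). The sketch below shows that pattern in isolation; the object name and its logic are made up for illustration, and because org.apache.spark.internal.Logging is declared private[spark] in many Spark versions, code like this has to live under an org.apache.spark subpackage.

package org.apache.spark.examples // required because the Logging trait is private[spark]

import org.apache.spark.internal.Logging

object WordCountJob extends Logging { // hypothetical job object
  def run(words: Seq[String]): Map[String, Int] = {
    logInfo(s"Counting ${words.size} words")
    val counts = words.groupBy(identity).mapValues(_.size).toMap
    if (counts.isEmpty) {
      logWarning("Input was empty; returning an empty result")
    }
    logDebug(s"Computed ${counts.size} distinct words")
    counts
  }
}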
Example 1
Source File: CommandUtils.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.deploy.worker

import java.io.{File, FileOutputStream, InputStream, IOException}

import scala.collection.JavaConverters._
import scala.collection.Map

import org.apache.spark.SecurityManager
import org.apache.spark.deploy.Command
import org.apache.spark.internal.Logging
import org.apache.spark.launcher.WorkerCommandBuilder
import org.apache.spark.util.Utils

private[deploy] object CommandUtils extends Logging {

  def redirectStream(in: InputStream, file: File) {
    val out = new FileOutputStream(file, true)
    // TODO: It would be nice to add a shutdown hook here that explains why the output is
    // terminating. Otherwise if the worker dies the executor logs will silently stop.
    new Thread("redirect output to " + file) {
      override def run() {
        try {
          Utils.copyStream(in, out, true)
        } catch {
          case e: IOException =>
            logInfo("Redirection to " + file + " closed: " + e.getMessage)
        }
      }
    }.start()
  }
}
Example 2
Source File: OrcFileOperator.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader}
import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.StructType

private[orc] object OrcFileOperator extends Logging {

  def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = {
    def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = {
      reader.getObjectInspector match {
        case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 =>
          logInfo(
            s"ORC file $path has empty schema, it probably contains no rows. " +
              "Trying to read another ORC file to figure out the schema.")
          false
        case _ => true
      }
    }

    val conf = config.getOrElse(new Configuration)
    val fs = {
      val hdfsPath = new Path(basePath)
      hdfsPath.getFileSystem(conf)
    }

    listOrcFiles(basePath, conf).iterator.map { path =>
      path -> OrcFile.createReader(fs, path)
    }.collectFirst {
      case (path, reader) if isWithNonEmptySchema(path, reader) => reader
    }
  }

  def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = {
    // Take the first file where we can open a valid reader if we can find one. Otherwise just
    // return None to indicate we can't infer the schema.
    paths.flatMap(getFileReader(_, conf)).headOption.map { reader =>
      val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector]
      val schema = readerInspector.getTypeName
      logDebug(s"Reading schema from file $paths, got Hive schema string: $schema")
      CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType]
    }
  }

  def getObjectInspector(
      path: String,
      conf: Option[Configuration]): Option[StructObjectInspector] = {
    getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector])
  }

  def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = {
    // TODO: Check if the paths coming in are already qualified and simplify.
    val origPath = new Path(pathStr)
    val fs = origPath.getFileSystem(conf)
    val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath)
      .filterNot(_.isDirectory)
      .map(_.getPath)
      .filterNot(_.getName.startsWith("_"))
      .filterNot(_.getName.startsWith("."))
    paths
  }
}
Example 3
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).map { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 4
Source File: CustomReceiver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.receiver.Receiver

class CustomReceiver(host: String, port: Int)
  extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  def onStart() {
    // Start a thread that connects and receives data until the receiver is stopped.
    new Thread("Socket Receiver") {
      override def run() { receive() }
    }.start()
  }

  def onStop() {
    // Nothing to do: the receiving thread stops on its own once isStopped() returns true.
  }

  private def receive() {
    var socket: Socket = null
    var userInput: String = null
    try {
      logInfo("Connecting to " + host + ":" + port)
      socket = new Socket(host, port)
      logInfo("Connected to " + host + ":" + port)
      val reader = new BufferedReader(
        new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8))
      userInput = reader.readLine()
      while (!isStopped && userInput != null) {
        store(userInput)
        userInput = reader.readLine()
      }
      reader.close()
      socket.close()
      logInfo("Stopped receiving")
      restart("Trying to connect again")
    } catch {
      case e: java.net.ConnectException =>
        restart("Error connecting to " + host + ":" + port, e)
      case t: Throwable =>
        restart("Error receiving data", t)
    }
  }
}
// scalastyle:on println
Example 5
Source File: StreamingExamples.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.streaming

import org.apache.log4j.{Level, Logger}

import org.apache.spark.internal.Logging

object StreamingExamples extends Logging {

  def setStreamingLogLevels() {
    val log4jInitialized = Logger.getRootLogger.getAllAppenders.hasMoreElements
    if (!log4jInitialized) {
      // We first log something to initialize Spark's default logging, then we override the
      // logging level.
      logInfo("Setting log level to [WARN] for streaming example." +
        " To override add a custom log4j.properties to the classpath.")
      Logger.getRootLogger.setLevel(Level.WARN)
    }
  }
}
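A short usage sketch of this helper, assuming a hypothetical driver object and an arbitrary socket source: it is normally called once, before the StreamingContext produces any output, so the WARN level applies to the whole run.

import org.apache.spark.SparkConf
import org.apache.spark.examples.streaming.StreamingExamples
import org.apache.spark.streaming.{Seconds, StreamingContext}

object MyStreamingApp { // hypothetical driver object
  def main(args: Array[String]): Unit = {
    // Quiet the default INFO logging before anything else starts logging.
    StreamingExamples.setStreamingLogLevels()

    val conf = new SparkConf().setAppName("MyStreamingApp").setMaster("local[2]")
    val ssc = new StreamingContext(conf, Seconds(1))

    ssc.socketTextStream("localhost", 9999).count().print()

    ssc.start()
    ssc.awaitTermination()
  }
}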
Example 6
Source File: MesosClusterDispatcher.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.mesos import java.util.concurrent.CountDownLatch import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.mesos.ui.MesosClusterUI import org.apache.spark.deploy.rest.mesos.MesosRestServer import org.apache.spark.internal.Logging import org.apache.spark.scheduler.cluster.mesos._ import org.apache.spark.util.{ShutdownHookManager, Utils} private[mesos] class MesosClusterDispatcher( args: MesosClusterDispatcherArguments, conf: SparkConf) extends Logging { private val publicAddress = Option(conf.getenv("SPARK_PUBLIC_DNS")).getOrElse(args.host) private val recoveryMode = conf.get("spark.deploy.recoveryMode", "NONE").toUpperCase() logInfo("Recovery mode in Mesos dispatcher set to: " + recoveryMode) private val engineFactory = recoveryMode match { case "NONE" => new BlackHoleMesosClusterPersistenceEngineFactory case "ZOOKEEPER" => new ZookeeperMesosClusterPersistenceEngineFactory(conf) case _ => throw new IllegalArgumentException("Unsupported recovery mode: " + recoveryMode) } private val scheduler = new MesosClusterScheduler(engineFactory, conf) private val server = new MesosRestServer(args.host, args.port, conf, scheduler) private val webUi = new MesosClusterUI( new SecurityManager(conf), args.webUiPort, conf, publicAddress, scheduler) private val shutdownLatch = new CountDownLatch(1) def start(): Unit = { webUi.bind() scheduler.frameworkUrl = conf.get("spark.mesos.dispatcher.webui.url", webUi.activeWebUiUrl) scheduler.start() server.start() } def awaitShutdown(): Unit = { shutdownLatch.await() } def stop(): Unit = { webUi.stop() server.stop() scheduler.stop() shutdownLatch.countDown() } } private[mesos] object MesosClusterDispatcher extends Logging { def main(args: Array[String]) { Utils.initDaemon(log) val conf = new SparkConf val dispatcherArgs = new MesosClusterDispatcherArguments(args, conf) conf.setMaster(dispatcherArgs.masterUrl) conf.setAppName(dispatcherArgs.name) dispatcherArgs.zookeeperUrl.foreach { z => conf.set("spark.deploy.recoveryMode", "ZOOKEEPER") conf.set("spark.deploy.zookeeper.url", z) } val dispatcher = new MesosClusterDispatcher(dispatcherArgs, conf) dispatcher.start() logDebug("Adding shutdown hook") // force eager creation of logger ShutdownHookManager.addShutdownHook { () => logInfo("Shutdown hook is shutting down dispatcher") dispatcher.stop() dispatcher.awaitShutdown() } dispatcher.awaitShutdown() } }
Example 7
Source File: MesosClusterPersistenceEngine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import scala.collection.JavaConverters._ import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.zookeeper.KeeperException.NoNodeException import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] class ZookeeperMesosClusterPersistenceEngine( baseDir: String, zk: CuratorFramework, conf: SparkConf) extends MesosClusterPersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir SparkCuratorUtil.mkdir(zk, WORKING_DIR) def path(name: String): String = { WORKING_DIR + "/" + name } override def expunge(name: String): Unit = { zk.delete().forPath(path(name)) } override def persist(name: String, obj: Object): Unit = { val serialized = Utils.serialize(obj) val zkPath = path(name) zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized) } override def fetch[T](name: String): Option[T] = { val zkPath = path(name) try { val fileData = zk.getData().forPath(zkPath) Some(Utils.deserialize[T](fileData)) } catch { case e: NoNodeException => None case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(zkPath) None } } override def fetchAll[T](): Iterable[T] = { zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T]) } }
Example 8
Source File: MesosTaskLaunchData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos

import java.nio.ByteBuffer

import org.apache.mesos.protobuf.ByteString

import org.apache.spark.internal.Logging

private[spark] case class MesosTaskLaunchData(
    serializedTask: ByteBuffer,
    attemptNumber: Int) extends Logging {

  def toByteString: ByteString = {
    val dataBuffer = ByteBuffer.allocate(4 + serializedTask.limit)
    dataBuffer.putInt(attemptNumber)
    dataBuffer.put(serializedTask)
    dataBuffer.rewind
    logDebug(s"ByteBuffer size: [${dataBuffer.remaining}]")
    ByteString.copyFrom(dataBuffer)
  }
}

private[spark] object MesosTaskLaunchData extends Logging {
  def fromByteString(byteString: ByteString): MesosTaskLaunchData = {
    val byteBuffer = byteString.asReadOnlyByteBuffer()
    logDebug(s"ByteBuffer size: [${byteBuffer.remaining}]")
    val attemptNumber = byteBuffer.getInt // updates the position by 4 bytes
    val serializedTask = byteBuffer.slice() // subsequence starting at the current position
    MesosTaskLaunchData(serializedTask, attemptNumber)
  }
}
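The two logDebug calls above bracket a simple encode/decode round trip. The sketch below illustrates it with a made-up payload and attempt number; since the class is private[spark], such code would itself have to live inside a Spark package and is shown purely for illustration.

import java.nio.ByteBuffer

val payload = ByteBuffer.wrap("serialized-task-bytes".getBytes("UTF-8"))
val launchData = MesosTaskLaunchData(payload, attemptNumber = 1)

// toByteString prepends the attempt number (4 bytes) to the task bytes.
val wire = launchData.toByteString
val decoded = MesosTaskLaunchData.fromByteString(wire)
assert(decoded.attemptNumber == 1)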
Example 9
Source File: GraphLoader.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx

import org.apache.spark.SparkContext
import org.apache.spark.graphx.impl.{EdgePartitionBuilder, GraphImpl}
import org.apache.spark.internal.Logging
import org.apache.spark.storage.StorageLevel

object GraphLoader extends Logging {

  def edgeListFile(
      sc: SparkContext,
      path: String,
      canonicalOrientation: Boolean = false,
      numEdgePartitions: Int = -1,
      edgeStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY,
      vertexStorageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): Graph[Int, Int] = {
    val startTime = System.currentTimeMillis

    // Parse the edge data table directly into edge partitions
    val lines =
      if (numEdgePartitions > 0) {
        sc.textFile(path, numEdgePartitions).coalesce(numEdgePartitions)
      } else {
        sc.textFile(path)
      }
    val edges = lines.mapPartitionsWithIndex { (pid, iter) =>
      val builder = new EdgePartitionBuilder[Int, Int]
      iter.foreach { line =>
        if (!line.isEmpty && line(0) != '#') {
          val lineArray = line.split("\\s+")
          if (lineArray.length < 2) {
            throw new IllegalArgumentException("Invalid line: " + line)
          }
          val srcId = lineArray(0).toLong
          val dstId = lineArray(1).toLong
          if (canonicalOrientation && srcId > dstId) {
            builder.add(dstId, srcId, 1)
          } else {
            builder.add(srcId, dstId, 1)
          }
        }
      }
      Iterator((pid, builder.toEdgePartition))
    }.persist(edgeStorageLevel).setName("GraphLoader.edgeListFile - edges (%s)".format(path))
    edges.count()

    logInfo("It took %d ms to load the edges".format(System.currentTimeMillis - startTime))

    GraphImpl.fromEdgePartitions(edges, defaultVertexAttr = 1, edgeStorageLevel = edgeStorageLevel,
      vertexStorageLevel = vertexStorageLevel)
  } // end of edgeListFile
}
Example 10
Source File: RWrapperUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r

import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.Dataset

object RWrapperUtils extends Logging {

  def checkDataColumns(rFormula: RFormula, data: Dataset[_]): Unit = {
    if (data.schema.fieldNames.contains(rFormula.getFeaturesCol)) {
      val newFeaturesName = s"${Identifiable.randomUID(rFormula.getFeaturesCol)}"
      logWarning(s"data containing ${rFormula.getFeaturesCol} column, " +
        s"using new name $newFeaturesName instead")
      rFormula.setFeaturesCol(newFeaturesName)
    }
  }
}
Example 11
Source File: Transformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

abstract class UnaryTransformer[IN, OUT, T <: UnaryTransformer[IN, OUT, T]]
  extends Transformer with HasInputCol with HasOutputCol with Logging {

  // Abstract members supplied by concrete transformers.
  protected def createTransformFunc: IN => OUT
  protected def outputDataType: DataType

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
}
Example 12
Source File: IterativelyReweightedLeastSquares.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg._ import org.apache.spark.rdd.RDD private[ml] class IterativelyReweightedLeastSquares( val initialModel: WeightedLeastSquaresModel, val reweightFunc: (Instance, WeightedLeastSquaresModel) => (Double, Double), val fitIntercept: Boolean, val regParam: Double, val maxIter: Int, val tol: Double) extends Logging with Serializable { def fit(instances: RDD[Instance]): IterativelyReweightedLeastSquaresModel = { var converged = false var iter = 0 var model: WeightedLeastSquaresModel = initialModel var oldModel: WeightedLeastSquaresModel = null while (iter < maxIter && !converged) { oldModel = model // Update offsets and weights using reweightFunc val newInstances = instances.map { instance => val (newOffset, newWeight) = reweightFunc(instance, oldModel) Instance(newOffset, newWeight, instance.features) } // Estimate new model model = new WeightedLeastSquares(fitIntercept, regParam, elasticNetParam = 0.0, standardizeFeatures = false, standardizeLabel = false).fit(newInstances) // Check convergence val oldCoefficients = oldModel.coefficients val coefficients = model.coefficients BLAS.axpy(-1.0, coefficients, oldCoefficients) val maxTolOfCoefficients = oldCoefficients.toArray.reduce { (x, y) => math.max(math.abs(x), math.abs(y)) } val maxTol = math.max(maxTolOfCoefficients, math.abs(oldModel.intercept - model.intercept)) if (maxTol < tol) { converged = true logInfo(s"IRLS converged in $iter iterations.") } logInfo(s"Iteration $iter : relative tolerance = $maxTol") iter = iter + 1 if (iter == maxIter) { logInfo(s"IRLS reached the max number of iterations: $maxIter.") } } new IterativelyReweightedLeastSquaresModel( model.coefficients, model.intercept, model.diagInvAtWA, iter) } }
Example 13
Source File: AssociationRules.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.fpm

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.spark.annotation.Since
import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.java.JavaSparkContext.fakeClassTag
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.fpm.AssociationRules.Rule
import org.apache.spark.mllib.fpm.FPGrowth.FreqItemset
import org.apache.spark.rdd.RDD

  // Excerpt from the Rule[Item] class nested inside the AssociationRules object.
  @Since("1.5.0")
  def javaConsequent: java.util.List[Item] = {
    consequent.toList.asJava
  }

  override def toString: String = {
    s"${antecedent.mkString("{", ",", "}")} => " +
      s"${consequent.mkString("{", ",", "}")}: ${confidence}"
  }
}
}
Example 14
Source File: PearsonCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.rdd.RDD def computeCorrelationMatrixFromCovariance(covarianceMatrix: Matrix): Matrix = { val cov = covarianceMatrix.asBreeze.asInstanceOf[BDM[Double]] val n = cov.cols // Compute the standard deviation on the diagonals first var i = 0 while (i < n) { // TODO remove once covariance numerical issue resolved. cov(i, i) = if (closeToZero(cov(i, i))) 0.0 else math.sqrt(cov(i, i)) i +=1 } // Loop through columns since cov is column major var j = 0 var sigma = 0.0 var containNaN = false while (j < n) { sigma = cov(j, j) i = 0 while (i < j) { val corr = if (sigma == 0.0 || cov(i, i) == 0.0) { containNaN = true Double.NaN } else { cov(i, j) / (sigma * cov(i, i)) } cov(i, j) = corr cov(j, i) = corr i += 1 } j += 1 } // put 1.0 on the diagonals i = 0 while (i < n) { cov(i, i) = 1.0 i +=1 } if (containNaN) { logWarning("Pearson correlation matrix contains NaN values.") } Matrices.fromBreeze(cov) } private def closeToZero(value: Double, threshold: Double = 1e-12): Boolean = { math.abs(value) <= threshold } }
Example 15
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 16
Source File: StreamingTestMethod.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test

import java.io.Serializable

import scala.language.implicitConversions
import scala.math.pow

import com.twitter.chill.MeatLocker
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.StatCounter

private[stat] object StreamingTestMethod {
  // Note: after new `StreamingTestMethod`s are implemented, please update this map.
  private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map(
    "welch" -> WelchTTest,
    "student" -> StudentTTest)

  def getTestMethodFromName(method: String): StreamingTestMethod =
    TEST_NAME_TO_OBJECT.get(method) match {
      case Some(test) => test
      case None =>
        throw new IllegalArgumentException(
          "Unrecognized method name. Supported streaming test methods: " +
            TEST_NAME_TO_OBJECT.keys.mkString(", "))
    }
}
Example 17
Source File: DataValidators.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
Example 18
Source File: GradientBoostedTreesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite} import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy} import org.apache.spark.mllib.tree.configuration.Algo._ import org.apache.spark.mllib.tree.impurity.Variance import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError} import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging { import testImplicits._ test("runWithValidation stops early and performs better on a validation dataset") { // Set numIterations large enough so that it stops early. val numIterations = 20 val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML) val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML) val trainDF = trainRdd.toDF() val validateDF = validateRdd.toDF() val algos = Array(Regression, Regression, Classification) val losses = Array(SquaredError, AbsoluteError, LogLoss) algos.zip(losses).foreach { case (algo, loss) => val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2, categoricalFeaturesInfo = Map.empty) val boostingStrategy = new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0) val (validateTrees, validateTreeWeights) = GradientBoostedTrees .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L) val numTrees = validateTrees.length assert(numTrees !== numIterations) // Test that it performs better on the validation dataset. val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L) val (errorWithoutValidation, errorWithValidation) = { if (algo == Classification) { val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features)) (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(remappedRdd, validateTrees, validateTreeWeights, loss)) } else { (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss), GradientBoostedTrees.computeError(validateRdd, validateTrees, validateTreeWeights, loss)) } } assert(errorWithValidation <= errorWithoutValidation) // Test that results from evaluateEachIteration comply with runWithValidation. // Note that convergenceTol is set to 0.0 val evaluationArray = GradientBoostedTrees .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo) assert(evaluationArray.length === numIterations) assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1)) var i = 1 while (i < numTrees) { assert(evaluationArray(i) <= evaluationArray(i - 1)) i += 1 } } } }
Example 19
Source File: FlumeInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.io.{Externalizable, ObjectInput, ObjectOutput} import java.net.InetSocketAddress import java.nio.ByteBuffer import java.util.concurrent.Executors import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.avro.ipc.NettyServer import org.apache.avro.ipc.specific.SpecificResponder import org.apache.flume.source.avro.{AvroFlumeEvent, AvroSourceProtocol, Status} import org.jboss.netty.channel.{ChannelPipeline, ChannelPipelineFactory, Channels} import org.jboss.netty.channel.socket.nio.NioServerSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream._ import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.Utils private[streaming] class FlumeInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, storageLevel: StorageLevel, enableDecompression: Boolean ) extends ReceiverInputDStream[SparkFlumeEvent](_ssc) { override def getReceiver(): Receiver[SparkFlumeEvent] = { new FlumeReceiver(host, port, storageLevel, enableDecompression) } } private[streaming] class CompressionChannelPipelineFactory extends ChannelPipelineFactory { def getPipeline(): ChannelPipeline = { val pipeline = Channels.pipeline() val encoder = new ZlibEncoder(6) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) pipeline } } }
Example 20
Source File: EventTransformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.io.{ObjectInput, ObjectOutput} import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[streaming] object EventTransformer extends Logging { def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence], Array[Byte]) = { val bodyLength = in.readInt() val bodyBuff = new Array[Byte](bodyLength) in.readFully(bodyBuff) val numHeaders = in.readInt() val headers = new java.util.HashMap[CharSequence, CharSequence] for (i <- 0 until numHeaders) { val keyLength = in.readInt() val keyBuff = new Array[Byte](keyLength) in.readFully(keyBuff) val key: String = Utils.deserialize(keyBuff) val valLength = in.readInt() val valBuff = new Array[Byte](valLength) in.readFully(valBuff) val value: String = Utils.deserialize(valBuff) headers.put(key, value) } (headers, bodyBuff) } def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence], body: Array[Byte]) { out.writeInt(body.length) out.write(body) val numHeaders = headers.size() out.writeInt(numHeaders) for ((k, v) <- headers.asScala) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) val valBuff = Utils.serialize(v.toString) out.writeInt(valBuff.length) out.write(valBuff) } } }
Example 21
Source File: FlumeStreamSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 22
Source File: CachedKafkaConsumer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010

import java.{util => ju}

import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer}
import org.apache.kafka.common.TopicPartition

import org.apache.spark.{SparkEnv, SparkException, TaskContext}
import org.apache.spark.internal.Logging

private[kafka010] object CachedKafkaConsumer extends Logging {

  // `CacheKey` and `cache` are defined elsewhere in this object in the full source file.
  def getOrCreate(
      topic: String,
      partition: Int,
      kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized {
    val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String]
    val topicPartition = new TopicPartition(topic, partition)
    val key = CacheKey(groupId, topicPartition)

    // If this is reattempt at running the task, then invalidate cache and start with
    // a new consumer
    if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) {
      cache.remove(key)
      new CachedKafkaConsumer(topicPartition, kafkaParams)
    } else {
      if (!cache.containsKey(key)) {
        cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams))
      }
      cache.get(key)
    }
  }
}
Example 23
Source File: Signaling.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.repl

import org.apache.spark.SparkContext
import org.apache.spark.internal.Logging
import org.apache.spark.util.SignalUtils

private[repl] object Signaling extends Logging {

  def cancelOnInterrupt(ctx: SparkContext): Unit = SignalUtils.register("INT") {
    if (!ctx.statusTracker.getActiveJobIds().isEmpty) {
      logWarning("Cancelling all active jobs, this can take a while. " +
        "Press Ctrl+C again to exit now.")
      ctx.cancelAllJobs()
      true
    } else {
      false
    }
  }
}
Example 24
Source File: FiltersSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ class FiltersSuite extends SparkFunSuite with Logging { private val shim = new Shim_v0_13 private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test") private val varCharCol = new FieldSchema() varCharCol.setName("varchar") varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME) testTable.setPartCols(Collections.singletonList(varCharCol)) filterTest("string filter", (a("stringcol", StringType) > Literal("test")) :: Nil, "stringcol > \"test\"") filterTest("string filter backwards", (Literal("test") > a("stringcol", StringType)) :: Nil, "\"test\" > stringcol") filterTest("int filter", (a("intcol", IntegerType) === Literal(1)) :: Nil, "intcol = 1") filterTest("int filter backwards", (Literal(1) === a("intcol", IntegerType)) :: Nil, "1 = intcol") filterTest("int and string filter", (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil, "1 = intcol and \"a\" = strcol") filterTest("skip varchar", (Literal("") === a("varchar", StringType)) :: Nil, "") private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { val converted = shim.convertFilters(testTable, filters) if (converted != result) { fail( s"Expected filters ${filters.mkString(",")} to convert to '$result' but got '$converted'") } } } private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() }
Example 25
Source File: SparkSQLDriver.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.QueryExecution private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.hiveResultString() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 26
Source File: SparkSQLOperationManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}
import java.util.concurrent.ConcurrentHashMap

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveSessionState
import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation}

private[thriftserver] class SparkSQLOperationManager()
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = new ConcurrentHashMap[SessionHandle, String]()
  val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val sqlContext = sessionToContexts.get(parentSession.getSessionHandle)
    require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" +
      s" initialized or had already closed.")
    val sessionState = sqlContext.sessionState.asInstanceOf[HiveSessionState]
    val runInBackground = async && sessionState.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(sqlContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 27
Source File: ThriftServerTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui

import org.apache.spark.{SparkContext, SparkException}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2
import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._
import org.apache.spark.ui.{SparkUI, SparkUITab}

private[thriftserver] class ThriftServerTab(sparkContext: SparkContext)
  extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging {

  override val name = "JDBC/ODBC Server"

  val parent = getSparkUI(sparkContext)
  val listener = HiveThriftServer2.listener

  attachPage(new ThriftServerPage(this))
  attachPage(new ThriftServerSessionPage(this))
  parent.attachTab(this)

  def detach() {
    getSparkUI(sparkContext).detachTab(this)
  }
}

private[thriftserver] object ThriftServerTab {
  def getSparkUI(sparkContext: SparkContext): SparkUI = {
    sparkContext.ui.getOrElse {
      throw new SparkException("Parent SparkUI to attach this tab to not found!")
    }
  }
}
Example 28
Source File: SparkSQLEnv.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.hive.{HiveSessionState, HiveUtils}
import org.apache.spark.util.Utils

private[hive] object SparkSQLEnv extends Logging {

  var sqlContext: SQLContext = _
  var sparkContext: SparkContext = _

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      sqlContext = null
    }
  }
}
Example 29
Source File: UDTRegistration.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types

import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

object UDTRegistration extends Serializable with Logging {

  // Maps user class names to their UserDefinedType class names. In the full source file this
  // map is pre-populated with Spark's built-in ML vector and matrix types.
  private lazy val udtMap: mutable.Map[String, String] = mutable.Map.empty

  def getUDTFor(userClass: String): Option[Class[_]] = {
    udtMap.get(userClass).map { udtClassName =>
      if (Utils.classIsLoadable(udtClassName)) {
        val udtClass = Utils.classForName(udtClassName)
        if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) {
          udtClass
        } else {
          throw new SparkException(
            s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " +
              s"an UserDefinedType for ${userClass}")
        }
      } else {
        throw new SparkException(
          s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.")
      }
    }
  }
}
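A brief usage sketch. The class names com.example.MyPoint and com.example.MyPointUDT are hypothetical stand-ins for a user type and its UserDefinedType implementation, and register(...) is the companion registration call from the same object (not shown in the excerpt above).

// Register the (hypothetical) UDT for a user class, then look it up.
UDTRegistration.register("com.example.MyPoint", "com.example.MyPointUDT")

// getUDTFor actually loads the UDT class, so this only resolves to Some(...) when the
// registered class is really on the classpath; otherwise it throws a SparkException.
UDTRegistration.getUDTFor("com.example.MyPoint") match {
  case Some(udtClass) => println(s"Resolved UDT: ${udtClass.getName}")
  case None => println("No UDT registered for com.example.MyPoint")
}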
Example 30
Source File: BoundAttribute.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression { override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) { val oev = ctx.currentVars(ordinal) ev.isNull = oev.isNull ev.value = oev.value val code = oev.code oev.code = "" ev.copy(code = code) } else if (nullable) { ev.copy(code = s""" boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); $javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value);""") } else { ev.copy(code = s"""$javaType ${ev.value} = $value;""", isNull = "false") } } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: AttributeSeq, allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexOf(a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.attrs.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, input(ordinal).nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
Example 31
Source File: RuleExecutor.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.rules import scala.collection.JavaConverters._ import com.google.common.util.concurrent.AtomicLongMap import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.util.Utils object RuleExecutor { protected val timeMap = AtomicLongMap.create[String]() def execute(plan: TreeType): TreeType = { var curPlan = plan batches.foreach { batch => val batchStartPlan = curPlan var iteration = 1 var lastPlan = curPlan var continue = true // Run until fix point (or the max number of iterations as specified in the strategy. while (continue) { curPlan = batch.rules.foldLeft(curPlan) { case (plan, rule) => val startTime = System.nanoTime() val result = rule(plan) val runTime = System.nanoTime() - startTime RuleExecutor.timeMap.addAndGet(rule.ruleName, runTime) if (!result.fastEquals(plan)) { logTrace( s""" |=== Applying Rule ${rule.ruleName} === |${sideBySide(plan.treeString, result.treeString).mkString("\n")} """.stripMargin) } result } iteration += 1 if (iteration > batch.strategy.maxIterations) { // Only log if this is a rule that is supposed to run more than once. if (iteration != 2) { val message = s"Max iterations (${iteration - 1}) reached for batch ${batch.name}" if (Utils.isTesting) { throw new TreeNodeException(curPlan, message, null) } else { logWarning(message) } } continue = false } if (curPlan.fastEquals(lastPlan)) { logTrace( s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.") continue = false } lastPlan = curPlan } if (!batchStartPlan.fastEquals(curPlan)) { logDebug( s""" |=== Result of Batch ${batch.name} === |${sideBySide(plan.treeString, curPlan.treeString).mkString("\n")} """.stripMargin) } else { logTrace(s"Batch ${batch.name} has no effect.") } } curPlan } }
Example 32
Source File: package.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import java.util.Collections import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.util.{AccumulatorV2, LongAccumulator} case class ColumnMetrics() { val elementTypes = new SetAccumulator[String] sparkContext.register(elementTypes) } val tupleCount: LongAccumulator = sparkContext.longAccumulator val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { debugPrint(s"== ${child.simpleString} ==") debugPrint(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case (attr, metric) => // This is called on driver. All accumulator updates have a fixed value. So it's safe to use // `asScala` which accesses the internal values using `java.util.Iterator`. val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}") debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount.add(1) var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes.add(value.getClass.getName) } i += 1 } currentRow } } } } override def outputPartitioning: Partitioning = child.outputPartitioning override def inputRDDs(): Seq[RDD[InternalRow]] = { child.asInstanceOf[CodegenSupport].inputRDDs() } override def doProduce(ctx: CodegenContext): String = { child.asInstanceOf[CodegenSupport].produce(ctx, this) } override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = { consume(ctx, input) } } }
Example 33
Source File: DriverRegistry.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.sql.{Driver, DriverManager}

import scala.collection.mutable

import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

object DriverRegistry extends Logging {

  private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty

  def register(className: String): Unit = {
    val cls = Utils.getContextOrSparkClassLoader.loadClass(className)
    if (cls.getClassLoader == null) {
      logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required")
    } else if (wrapperMap.get(className).isDefined) {
      logTrace(s"Wrapper for $className already exists")
    } else {
      synchronized {
        if (wrapperMap.get(className).isEmpty) {
          val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver])
          DriverManager.registerDriver(wrapper)
          wrapperMap(className) = wrapper
          logTrace(s"Wrapper for $className registered")
        }
      }
    }
  }
}
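A usage sketch of the register call. DriverRegistry is a Spark-internal object, and "org.postgresql.Driver" is used here only as a familiar example of a JDBC driver class that would have to be on the classpath; the snippet illustrates how the JDBC data source ensures a driver is registered before opening connections.

// Make sure the driver is registered with java.sql.DriverManager before connecting.
DriverRegistry.register("org.postgresql.Driver")

val connection = java.sql.DriverManager.getConnection(
  "jdbc:postgresql://localhost:5432/mydb", "user", "password")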
Example 34
Source File: CSVParser.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv

import java.io.{CharArrayWriter, StringReader}

import com.univocity.parsers.csv._

import org.apache.spark.internal.Logging

private[csv] class LineCsvWriter(params: CSVOptions, headers: Seq[String]) extends Logging {
  private val writerSettings = new CsvWriterSettings
  private val format = writerSettings.getFormat

  format.setDelimiter(params.delimiter)
  format.setQuote(params.quote)
  format.setQuoteEscape(params.escape)
  format.setComment(params.comment)

  writerSettings.setNullValue(params.nullValue)
  writerSettings.setEmptyValue(params.nullValue)
  writerSettings.setSkipEmptyLines(true)
  writerSettings.setQuoteAllFields(params.quoteAll)
  writerSettings.setHeaders(headers: _*)
  writerSettings.setQuoteEscapingEnabled(params.escapeQuotes)

  private val buffer = new CharArrayWriter()
  private val writer = new CsvWriter(buffer, writerSettings)

  def writeRow(row: Seq[String], includeHeader: Boolean): Unit = {
    if (includeHeader) {
      writer.writeHeaders()
    }
    writer.writeRow(row.toArray: _*)
  }

  def flush(): String = {
    writer.flush()
    val lines = buffer.toString.stripLineEnd
    buffer.reset()
    lines
  }

  def close(): Unit = {
    writer.close()
  }
}
Example 35
Source File: FrequentItems.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.aggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
Example 36
Source File: CompressibleColumnBuilder.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 37
Source File: console.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class ConsoleSink(options: Map[String, String]) extends Sink with Logging { // Number of rows to display, by default 20 rows private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true) // Track the batch id private var lastBatchId = -1L override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { val batchIdStr = if (batchId <= lastBatchId) { s"Rerun batch: $batchId" } else { lastBatchId = batchId s"Batch: $batchId" } // scalastyle:off println println("-------------------------------------------") println(batchIdStr) println("-------------------------------------------") // scalastyle:on println data.sparkSession.createDataFrame( data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) .show(numRowsToShow, isTruncated) } } class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new ConsoleSink(parameters) } def shortName(): String = "console" }
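ConsoleSinkProvider registers this sink under the short name "console", so it is normally reached through the Structured Streaming writer API rather than constructed directly. A minimal sketch, assuming a SparkSession named spark and a socket source on localhost:9999 (both illustrative):

import org.apache.spark.sql.SparkSession

object ConsoleSinkUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("console-sink-sketch").getOrCreate()

    // Any streaming source will do; a socket source keeps the sketch small.
    val lines = spark.readStream
      .format("socket")
      .option("host", "localhost")
      .option("port", 9999)
      .load()

    // "numRows" and "truncate" are exactly the two options ConsoleSink reads above.
    val query = lines.writeStream
      .format("console")
      .option("numRows", 5)
      .option("truncate", false)
      .start()

    query.awaitTermination()
  }
}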
Example 38
Source File: StateStoreCoordinator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.util.RpcUtils private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { private val instances = new mutable.HashMap[StateStoreId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => logDebug(s"Reported state store $id is active at $executorId") instances.put(id, ExecutorCacheTaskLocation(host, executorId)) } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case VerifyIfInstanceActive(id, execId) => val response = instances.get(id) match { case Some(location) => location.executorId == execId case None => false } logDebug(s"Verified that state store $id is active: $response") context.reply(response) case GetLocation(id) => val executorId = instances.get(id).map(_.toString) logDebug(s"Got location of the state store $id: $executorId") context.reply(executorId) case DeactivateInstances(checkpointLocation) => val storeIdsToRemove = instances.keys.filter(_.checkpointLocation == checkpointLocation).toSeq instances --= storeIdsToRemove logDebug(s"Deactivating instances related to checkpoint location $checkpointLocation: " + storeIdsToRemove.mkString(", ")) context.reply(true) case StopCoordinator => stop() // Stop before replying to ensure that endpoint name has been deregistered logInfo("StateStoreCoordinator stopped") context.reply(true) } }
Example 39
Source File: HBaseCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[security] class HBaseCredentialProvider extends ServiceCredentialProvider with Logging { override def serviceName: String = "hbase" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def credentialsRequired(hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(getClass.getClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
Example 40
Source File: YarnRMClient.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.util.{List => JList} import scala.collection.JavaConverters._ import scala.util.Try import org.apache.hadoop.conf.Configuration import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.AMRMClient import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.webapp.util.WebAppUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = { val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt) val yarnMaxAttempts = yarnConf.getInt( YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS) val retval: Int = sparkMaxAttempts match { case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts case None => yarnMaxAttempts } retval } }
Example 41
Source File: YarnClientSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.yarn.api.records.YarnApplicationState import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil} import org.apache.spark.internal.Logging import org.apache.spark.launcher.SparkAppHandle import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class YarnClientSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext) extends YarnSchedulerBackend(scheduler, sc) with Logging { private var client: Client = null private var monitorThread: MonitorThread = null override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") if (monitorThread != null) { monitorThread.stopMonitor() } // Report a final state to the launcher if one is connected. This is needed since in client // mode this backend doesn't let the app monitor loop run to completion, so it does not report // the final state itself. // // Note: there's not enough information at this point to provide a better final state, // so assume the application was successful. client.reportLauncherState(SparkAppHandle.State.FINISHED) super.stop() YarnSparkHadoopUtil.get.stopCredentialUpdater() client.stop() logInfo("Stopped") } }
Example 42
Source File: SchedulerExtensionService.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import java.util.concurrent.atomic.AtomicBoolean import org.apache.hadoop.yarn.api.records.{ApplicationAttemptId, ApplicationId} import org.apache.spark.SparkContext import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils override def stop(): Unit = { if (started.getAndSet(false)) { logInfo(s"Stopping $this") services.foreach { s => Utils.tryLogNonFatalError(s.stop()) } } } override def toString(): String = s"""SchedulerExtensionServices |(serviceOption=$serviceOption, | services=$services, | started=$started)""".stripMargin }
Example 43
Source File: YarnShuffleIntegrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.commons.io.FileUtils import org.apache.hadoop.yarn.conf.YarnConfiguration import org.scalatest.Matchers import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.network.shuffle.ShuffleTestAccessor import org.apache.spark.network.yarn.{YarnShuffleService, YarnTestAccessor} import org.apache.spark.tags.ExtendedYarnTest @ExtendedYarnTest class YarnShuffleIntegrationSuite extends BaseYarnClusterSuite { override def newYarnConfig(): YarnConfiguration = { val yarnConfig = new YarnConfiguration() yarnConfig.set(YarnConfiguration.NM_AUX_SERVICES, "spark_shuffle") yarnConfig.set(YarnConfiguration.NM_AUX_SERVICE_FMT.format("spark_shuffle"), classOf[YarnShuffleService].getCanonicalName) yarnConfig.set("spark.shuffle.service.port", "0") yarnConfig } test("external shuffle service") { val shuffleServicePort = YarnTestAccessor.getShuffleServicePort val shuffleService = YarnTestAccessor.getShuffleServiceInstance val registeredExecFile = YarnTestAccessor.getRegisteredExecutorFile(shuffleService) logInfo("Shuffle service port = " + shuffleServicePort) val result = File.createTempFile("result", null, tempDir) val finalState = runSpark( false, mainClassName(YarnExternalShuffleDriver.getClass), appArgs = Seq(result.getAbsolutePath(), registeredExecFile.getAbsolutePath), extraConf = Map( "spark.shuffle.service.enabled" -> "true", "spark.shuffle.service.port" -> shuffleServicePort.toString ) ) checkResult(finalState, result) assert(YarnTestAccessor.getRegisteredExecutorFile(shuffleService).exists()) } } private object YarnExternalShuffleDriver extends Logging with Matchers { val WAIT_TIMEOUT_MILLIS = 10000 def main(args: Array[String]): Unit = { if (args.length != 2) { // scalastyle:off println System.err.println( s""" |Invalid command line: ${args.mkString(" ")} | |Usage: ExternalShuffleDriver [result file] [registered exec file] """.stripMargin) // scalastyle:on println System.exit(1) } val sc = new SparkContext(new SparkConf() .setAppName("External Shuffle Test")) val conf = sc.getConf val status = new File(args(0)) val registeredExecFile = new File(args(1)) logInfo("shuffle service executor file = " + registeredExecFile) var result = "failure" val execStateCopy = new File(registeredExecFile.getAbsolutePath + "_dup") try { val data = sc.parallelize(0 until 100, 10).map { x => (x % 10) -> x }.reduceByKey{ _ + _ }. collect().toSet sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS) data should be ((0 until 10).map{x => x -> (x * 10 + 450)}.toSet) result = "success" // only one process can open a leveldb file at a time, so we copy the files FileUtils.copyDirectory(registeredExecFile, execStateCopy) assert(!ShuffleTestAccessor.reloadRegisteredExecutors(execStateCopy).isEmpty) } finally { sc.stop() FileUtils.deleteDirectory(execStateCopy) Files.write(result, status, StandardCharsets.UTF_8) } } }
Example 44
Source File: ExtensionServiceIntegrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.scalatest.BeforeAndAfter import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging before { val sparkConf = new SparkConf() sparkConf.set(SCHEDULER_SERVICES, Seq(classOf[SimpleExtensionService].getName())) sparkConf.setMaster("local").setAppName("ExtensionServiceIntegrationSuite") sc = new SparkContext(sparkConf) } test("Instantiate") { val services = new SchedulerExtensionServices() assertResult(Nil, "non-nil service list") { services.getServices } services.start(SchedulerExtensionServiceBinding(sc, applicationId)) services.stop() } test("Contains SimpleExtensionService Service") { val services = new SchedulerExtensionServices() try { services.start(SchedulerExtensionServiceBinding(sc, applicationId)) val serviceList = services.getServices assert(serviceList.nonEmpty, "empty service list") val (service :: Nil) = serviceList val simpleService = service.asInstanceOf[SimpleExtensionService] assert(simpleService.started.get, "service not started") services.stop() assert(!simpleService.started.get, "service not stopped") } finally { services.stop() } } }
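The suite wires SimpleExtensionService into SCHEDULER_SERVICES but does not show its definition. Based on the SchedulerExtensionService trait from Example 42, it plausibly looks something like the sketch below (inferred from the surrounding examples, not the project's actual file):

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.scheduler.cluster.{SchedulerExtensionService, SchedulerExtensionServiceBinding}

private[spark] class SimpleExtensionService extends SchedulerExtensionService {

  // Flipped in start()/stop() so the test above can assert on lifecycle transitions.
  val started = new AtomicBoolean(false)

  override def start(binding: SchedulerExtensionServiceBinding): Unit = {
    started.set(true)
  }

  override def stop(): Unit = {
    started.set(false)
  }
}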
Example 45
Source File: SocketInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io._ import java.net.{ConnectException, Socket} import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.NextIterator private[streaming] class SocketInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](_ssc) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { private var socket: Socket = _ def onStart() { logInfo(s"Connecting to $host:$port") try { socket = new Socket(host, port) } catch { case e: ConnectException => restart(s"Error connecting to $host:$port", e) return } logInfo(s"Connected to $host:$port") // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // in case restart thread close it twice synchronized { if (socket != null) { socket.close() socket = null logInfo(s"Closed socket to $host:$port") } } } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8)) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
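SocketInputDStream is what StreamingContext.socketTextStream (and socketStream) returns under the hood, typically with SocketReceiver.bytesToLines as the bytesToObjects function. A minimal sketch of the user-facing side, assuming a line-oriented text server on localhost:9999:

import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SocketStreamUsageSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("socket-stream-sketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Builds a SocketInputDStream; the storage level is passed straight through to SocketReceiver.
    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).print()

    ssc.start()
    ssc.awaitTermination()
  }
}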
Example 46
Source File: DStreamCheckpointData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) extends Serializable with Logging { protected val data = new HashMap[Time, AnyRef]() // Mapping of the batch time to the checkpointed RDD file of that time @transient private var timeToCheckpointFile = new HashMap[Time, String] // Mapping of the batch time to the time of the oldest checkpointed RDD // in that batch's checkpoint data @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time] @transient private var fileSystem: FileSystem = null protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]] def restore() { // Create RDDs from the checkpoint data currentCheckpointFiles.foreach { case(time, file) => logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'") dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file))) } } override def toString: String = { "[\n" + currentCheckpointFiles.size + " checkpoint files \n" + currentCheckpointFiles.mkString("\n") + "\n]" } @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { logDebug(this.getClass().getSimpleName + ".writeObject used") if (dstream.context.graph != null) { dstream.context.graph.synchronized { if (dstream.context.graph.checkpointInProgress) { oos.defaultWriteObject() } else { val msg = "Object of " + this.getClass.getName + " is being serialized " + " possibly as a part of closure of an RDD operation. This is because " + " the DStream object is being referred to from within the closure. " + " Please rewrite the RDD operation inside this DStream to avoid this. " + " This has been enforced to avoid bloating of Spark tasks " + " with unnecessary objects." throw new java.io.NotSerializableException(msg) } } } else { throw new java.io.NotSerializableException( "Graph is unexpectedly null when DStream is being serialized.") } } @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { logDebug(this.getClass().getSimpleName + ".readObject used") ois.defaultReadObject() timeToOldestCheckpointFileTime = new HashMap[Time, Time] timeToCheckpointFile = new HashMap[Time, String] } }
Example 47
Source File: StreamingTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 48
Source File: RecurringTimer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import org.apache.spark.internal.Logging import org.apache.spark.util.{Clock, SystemClock} private[streaming] class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String) extends Logging { private val thread = new Thread("RecurringTimer - " + name) { setDaemon(true) override def run() { loop } } @volatile private var prevTime = -1L @volatile private var nextTime = -1L @volatile private var stopped = false private def loop() { try { while (!stopped) { triggerActionForNextInterval() } triggerActionForNextInterval() } catch { case e: InterruptedException => } } } private[streaming] object RecurringTimer extends Logging { def main(args: Array[String]) { var lastRecurTime = 0L val period = 1000 def onRecur(time: Long) { val currentTime = System.currentTimeMillis() logInfo("" + currentTime + ": " + (currentTime - lastRecurTime)) lastRecurTime = currentTime } val timer = new RecurringTimer(new SystemClock(), period, onRecur, "Test") timer.start() Thread.sleep(30 * 1000) timer.stop(true) } }
Example 49
Source File: RawTextSender.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 50
Source File: FileBasedWriteAheadLogReader.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
Example 51
Source File: RateLimitedOutputStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.OutputStream import java.util.concurrent.TimeUnit._ import scala.annotation.tailrec import org.apache.spark.internal.Logging private[streaming] class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) extends OutputStream with Logging { require(desiredBytesPerSec > 0) private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS) private val CHUNK_SIZE = 8192 private var lastSyncTime = System.nanoTime private var bytesWrittenSinceSync = 0L override def write(b: Int) { waitToWrite(1) out.write(b) } override def write(bytes: Array[Byte]) { write(bytes, 0, bytes.length) } @tailrec override final def write(bytes: Array[Byte], offset: Int, length: Int) { val writeSize = math.min(length - offset, CHUNK_SIZE) if (writeSize > 0) { waitToWrite(writeSize) out.write(bytes, offset, writeSize) write(bytes, offset + writeSize, length) } } override def flush() { out.flush() } override def close() { out.close() } @tailrec private def waitToWrite(numBytes: Int) { val now = System.nanoTime val elapsedNanosecs = math.max(now - lastSyncTime, 1) val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs if (rate < desiredBytesPerSec) { // It's okay to write; just update some variables and return bytesWrittenSinceSync += numBytes if (now > lastSyncTime + SYNC_INTERVAL) { // Sync interval has passed; let's resync lastSyncTime = now bytesWrittenSinceSync = numBytes } } else { // Calculate how much time we should sleep to bring ourselves to the desired rate. val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec val elapsedTimeInMillis = elapsedNanosecs / 1000000 val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis if (sleepTimeInMillis > 0) { logTrace("Natural rate is " + rate + " per second but desired rate is " + desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.") Thread.sleep(sleepTimeInMillis) } waitToWrite(numBytes) } } }
Example 52
Source File: FailureSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.File import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { private val batchDuration: Duration = Milliseconds(1000) private val numBatches = 30 private var directory: File = null before { directory = Utils.createTempDir() } after { if (directory != null) { Utils.deleteRecursively(directory) } StreamingContext.getActive().foreach { _.stop() } // Stop SparkContext if active SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("bla")).stop() } test("multiple failures with map") { MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration) } test("multiple failures with updateStateByKey") { MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration) } }
Example 53
Source File: BroadcastManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.broadcast import java.util.concurrent.atomic.AtomicLong import scala.reflect.ClassTag import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging private[spark] class BroadcastManager( val isDriver: Boolean, conf: SparkConf, securityManager: SecurityManager) extends Logging { private var initialized = false private var broadcastFactory: BroadcastFactory = null initialize() // Called by SparkContext or Executor before using Broadcast private def initialize() { synchronized { if (!initialized) { broadcastFactory = new TorrentBroadcastFactory broadcastFactory.initialize(isDriver, conf, securityManager) initialized = true } } } def stop() { broadcastFactory.stop() } private val nextBroadcastId = new AtomicLong(0) def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = { broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) } def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) { broadcastFactory.unbroadcast(id, removeFromDriver, blocking) } }
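BroadcastManager is created inside SparkEnv and is only reached indirectly: SparkContext.broadcast calls newBroadcast, and cleanup eventually goes through unbroadcast. A small usage sketch against the public API (assuming a local SparkContext; the lookup table is illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object BroadcastUsageSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("broadcast-sketch"))

    // sc.broadcast goes through SparkEnv's BroadcastManager.newBroadcast.
    val lookup = sc.broadcast(Map("a" -> 1, "b" -> 2))

    val total = sc.parallelize(Seq("a", "b", "a", "c"))
      .map(k => lookup.value.getOrElse(k, 0))
      .sum()
    println(s"sum of looked-up values: $total")   // 1 + 2 + 1 + 0 = 4.0

    // destroy() removes the broadcast's blocks from the executors and the driver.
    lookup.destroy()
    sc.stop()
  }
}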
Example 54
Source File: ShellBasedGroupsMappingProvider.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.security import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider with Logging { override def getGroups(username: String): Set[String] = { val userGroups = getUnixGroups(username) logDebug("User: " + username + " Groups: " + userGroups.mkString(",")) userGroups } // shells out a "bash -c id -Gn username" to get user groups private def getUnixGroups(username: String): Set[String] = { val cmdSeq = Seq("bash", "-c", "id -Gn " + username) // we need to get rid of the trailing "\n" from the result of command execution Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet } }
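Which provider Spark uses is controlled by spark.user.groups.mapping; this shell-based class is the stock implementation. As a hedged sketch of swapping in a custom provider (the trait is assumed to require only getGroups, as used above, and the class name and group table below are purely illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.security.GroupMappingServiceProvider

// Illustrative provider that resolves groups from a hard-coded table
// instead of shelling out to `id -Gn <user>`.
class StaticGroupsMappingProvider extends GroupMappingServiceProvider {
  private val table = Map(
    "alice" -> Set("admins", "analytics"),
    "bob" -> Set("analytics"))

  override def getGroups(username: String): Set[String] =
    table.getOrElse(username, Set.empty)
}

object GroupsMappingConfigSketch {
  def main(args: Array[String]): Unit = {
    // The configured class is instantiated reflectively when group-based ACLs are checked.
    val conf = new SparkConf()
      .set("spark.user.groups.mapping", classOf[StaticGroupsMappingProvider].getName)
      .set("spark.acls.enable", "true")
    println(conf.get("spark.user.groups.mapping"))
  }
}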
Example 55
Source File: CryptoStreamUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.security import java.io.{InputStream, OutputStream} import java.util.Properties import javax.crypto.spec.{IvParameterSpec, SecretKeySpec} import org.apache.commons.crypto.random._ import org.apache.commons.crypto.stream._ import org.apache.hadoop.io.Text import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[this] def createInitializationVector(properties: Properties): Array[Byte] = { val iv = new Array[Byte](IV_LENGTH_IN_BYTES) val initialIVStart = System.currentTimeMillis() CryptoRandomFactory.getCryptoRandom(properties).nextBytes(iv) val initialIVFinish = System.currentTimeMillis() val initialIVTime = initialIVFinish - initialIVStart if (initialIVTime > 2000) { logWarning(s"It costs ${initialIVTime} milliseconds to create the Initialization Vector " + s"used by CryptoStream") } iv } }
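CryptoStreamUtils sits behind Spark's I/O encryption settings, which wrap shuffle and spill streams in commons-crypto streams keyed off the configuration below. A configuration sketch (key names as used in Spark 2.x; treat the exact set of required settings as an assumption of this sketch):

import org.apache.spark.SparkConf

object IoEncryptionConfigSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .set("spark.authenticate", "true")              // I/O encryption builds on the auth secret
      .set("spark.io.encryption.enabled", "true")     // route local disk I/O through crypto streams
      .set("spark.io.encryption.keySizeBits", "256")  // 128 by default

    println(conf.getBoolean("spark.io.encryption.enabled", defaultValue = false))
  }
}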
Example 56
Source File: EventLogDownloadResource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 57
Source File: StorageMemoryPool.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import javax.annotation.concurrent.GuardedBy import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockId import org.apache.spark.storage.memory.MemoryStore def freeSpaceToShrinkPool(spaceToFree: Long): Long = lock.synchronized { val spaceFreedByReleasingUnusedMemory = math.min(spaceToFree, memoryFree) val remainingSpaceToFree = spaceToFree - spaceFreedByReleasingUnusedMemory if (remainingSpaceToFree > 0) { // If reclaiming free memory did not adequately shrink the pool, begin evicting blocks: val spaceFreedByEviction = memoryStore.evictBlocksToFreeSpace(None, remainingSpaceToFree, memoryMode) // When a block is released, BlockManager.dropFromMemory() calls releaseMemory(), so we do // not need to decrement _memoryUsed here. However, we do need to decrement the pool size. spaceFreedByReleasingUnusedMemory + spaceFreedByEviction } else { spaceFreedByReleasingUnusedMemory } } }
Example 58
Source File: NettyRpcCallContext.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import scala.concurrent.Promise import org.apache.spark.internal.Logging import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc.{RpcAddress, RpcCallContext} private[netty] abstract class NettyRpcCallContext(override val senderAddress: RpcAddress) extends RpcCallContext with Logging { protected def send(message: Any): Unit override def reply(response: Any): Unit = { send(response) } override def sendFailure(e: Throwable): Unit = { send(RpcFailure(e)) } } private[netty] class RemoteNettyRpcCallContext( nettyEnv: NettyRpcEnv, callback: RpcResponseCallback, senderAddress: RpcAddress) extends NettyRpcCallContext(senderAddress) { override protected def send(message: Any): Unit = { val reply = nettyEnv.serialize(message) callback.onSuccess(reply) } }
Example 59
Source File: RpcEndpointRef.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import scala.concurrent.Future import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.util.RpcUtils def askWithRetry[T: ClassTag](message: Any, timeout: RpcTimeout): T = { // TODO: Consider removing multiple attempts var attempts = 0 var lastException: Exception = null while (attempts < maxRetries) { attempts += 1 try { val future = ask[T](message, timeout) val result = timeout.awaitResult(future) if (result == null) { throw new SparkException("RpcEndpoint returned null") } return result } catch { case ie: InterruptedException => throw ie case e: Exception => lastException = e logWarning(s"Error sending message [message = $message] in $attempts attempts", e) } if (attempts < maxRetries) { Thread.sleep(retryWaitMs) } } throw new SparkException( s"Error sending message [message = $message]", lastException) } }
Example 60
Source File: BlockTransferService.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} import org.apache.spark.scheduler.MapStatus import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 61
Source File: NettyBlockRpcServer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, MapOutputReady, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.scheduler.MapStatus import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) case mapOutputReady: MapOutputReady => val mapStatus: MapStatus = serializer.newInstance().deserialize(ByteBuffer.wrap(mapOutputReady.serializedMapStatus)) blockManager.mapOutputReady( mapOutputReady.shuffleId, mapOutputReady.mapId, mapOutputReady.numReduces, mapStatus) } } override def getStreamManager(): StreamManager = streamManager }
Example 62
Source File: SortShuffleWriter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter} import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.Utils import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockResolver: IndexShuffleBlockResolver, handle: BaseShuffleHandle[K, V, C], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val blockManager = SparkEnv.get.blockManager private var sorter: ExternalSorter[K, V, _] = null // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private var mapStatus: MapStatus = null private val writeMetrics = context.taskMetrics().shuffleWriteMetrics override def stop(success: Boolean): Option[MapStatus] = { try { if (stopping) { return None } stopping = true if (success) { return Option(mapStatus) } else { return None } } finally { // Clean up our sorter, which may have its own intermediate files if (sorter != null) { val startTime = System.nanoTime() sorter.stop() writeMetrics.incWriteTime(System.nanoTime - startTime) sorter = null } } } } private[spark] object SortShuffleWriter { def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = { // We cannot bypass sorting if we need to do map-side aggregation. if (dep.mapSideCombine) { require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!") false } else { val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) dep.partitioner.numPartitions <= bypassMergeThreshold } } }
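shouldBypassMergeSort is the gate between the bypass-merge-sort writer and the regular sort-based path: it requires no map-side combine and a partition count at or below spark.shuffle.sort.bypassMergeThreshold. A small sketch that exercises both sides of that decision (the threshold value and RDDs are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object BypassMergeSortSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("bypass-threshold-sketch")
      .set("spark.shuffle.sort.bypassMergeThreshold", "8")   // default is 200

    val sc = new SparkContext(conf)
    val pairs = sc.parallelize(0 until 100, 4).map(x => (x % 4, x))

    // groupByKey does no map-side combine and uses 4 <= 8 reduce partitions,
    // so shouldBypassMergeSort returns true for this shuffle.
    println(pairs.groupByKey(4).count())

    // reduceByKey does map-side combine, so the threshold is irrelevant and the
    // regular sort-based path (with an Aggregator) is used.
    println(pairs.reduceByKey(_ + _, 4).count())

    sc.stop()
  }
}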
Example 63
Source File: MetricsConfig.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.metrics import java.io.{FileInputStream, InputStream} import java.util.Properties import scala.collection.JavaConverters._ import scala.collection.mutable import scala.util.matching.Regex import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] class MetricsConfig(conf: SparkConf) extends Logging { private val DEFAULT_PREFIX = "*" private val INSTANCE_REGEX = "^(\\*|[a-zA-Z]+)\\.(.+)".r private val DEFAULT_METRICS_CONF_FILENAME = "metrics.properties" private[metrics] val properties = new Properties() private[metrics] var perInstanceSubProperties: mutable.HashMap[String, Properties] = null private def setDefaultProperties(prop: Properties) { prop.setProperty("*.sink.servlet.class", "org.apache.spark.metrics.sink.MetricsServlet") prop.setProperty("*.sink.servlet.path", "/metrics/json") prop.setProperty("master.sink.servlet.path", "/metrics/master/json") prop.setProperty("applications.sink.servlet.path", "/metrics/applications/json") } private[this] def loadPropertiesFromFile(path: Option[String]): Unit = { var is: InputStream = null try { is = path match { case Some(f) => new FileInputStream(f) case None => Utils.getSparkClassLoader.getResourceAsStream(DEFAULT_METRICS_CONF_FILENAME) } if (is != null) { properties.load(is) } } catch { case e: Exception => val file = path.getOrElse(DEFAULT_METRICS_CONF_FILENAME) logError(s"Error loading configuration file $file", e) } finally { if (is != null) { is.close() } } } }
Example 64
Source File: PythonGatewayServer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import java.io.DataOutputStream import java.net.Socket import py4j.GatewayServer import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] object PythonGatewayServer extends Logging { initializeLogIfNecessary(true) def main(args: Array[String]): Unit = Utils.tryOrExit { // Start a GatewayServer on an ephemeral port val gatewayServer: GatewayServer = new GatewayServer(null, 0) gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { logError("GatewayServer failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") } // Communicate the bound port back to the caller via the caller-specified callback port val callbackHost = sys.env("_PYSPARK_DRIVER_CALLBACK_HOST") val callbackPort = sys.env("_PYSPARK_DRIVER_CALLBACK_PORT").toInt logDebug(s"Communicating GatewayServer port to Python driver at $callbackHost:$callbackPort") val callbackSocket = new Socket(callbackHost, callbackPort) val dos = new DataOutputStream(callbackSocket.getOutputStream) dos.writeInt(boundPort) dos.close() callbackSocket.close() // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { // Do nothing } logDebug("Exiting due to broken pipe from Python driver") System.exit(0) } }
Example 65
Source File: RRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.util.{Map => JMap} import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.api.java.{JavaPairRDD, JavaRDD, JavaSparkContext} import org.apache.spark.api.python.PythonRDD import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD private abstract class BaseRRDD[T: ClassTag, U: ClassTag]( parent: RDD[T], numPartitions: Int, func: Array[Byte], deserializer: String, serializer: String, packageNames: Array[Byte], broadcastVars: Array[Broadcast[Object]]) extends RDD[U](parent) with Logging { override def getPartitions: Array[Partition] = parent.partitions override def compute(partition: Partition, context: TaskContext): Iterator[U] = { val runner = new RRunner[U]( func, deserializer, serializer, packageNames, broadcastVars, numPartitions) // The parent may be also an RRDD, so we should launch it first. val parentIterator = firstParent[T].iterator(partition, context) runner.compute(parentIterator, partition.index) } } def createRDDFromFile(jsc: JavaSparkContext, fileName: String, parallelism: Int): JavaRDD[Array[Byte]] = { PythonRDD.readRDDFromFile(jsc, fileName, parallelism) } }
Example 66
Source File: SparkCuratorUtil.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.JavaConverters._ import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory} import org.apache.curator.retry.ExponentialBackoffRetry import org.apache.zookeeper.KeeperException import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[spark] object SparkCuratorUtil extends Logging { private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000 private val ZK_SESSION_TIMEOUT_MILLIS = 60000 private val RETRY_WAIT_MILLIS = 5000 private val MAX_RECONNECT_ATTEMPTS = 3 def newClient( conf: SparkConf, zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = { val ZK_URL = conf.get(zkUrlConf) val zk = CuratorFrameworkFactory.newClient(ZK_URL, ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS, new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS)) zk.start() zk } def mkdir(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) == null) { try { zk.create().creatingParentsIfNeeded().forPath(path) } catch { case nodeExist: KeeperException.NodeExistsException => // do nothing, ignore node existing exception. case e: Exception => throw e } } } def deleteRecursive(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) != null) { for (child <- zk.getChildren.forPath(path).asScala) { zk.delete().forPath(path + "/" + child) } zk.delete().forPath(path) } } }
Example 67
Source File: ExternalShuffleService.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import java.util.concurrent.CountDownLatch import scala.collection.JavaConverters._ import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging import org.apache.spark.metrics.MetricsSystem import org.apache.spark.network.TransportContext import org.apache.spark.network.netty.SparkTransportConf import org.apache.spark.network.sasl.SaslServerBootstrap import org.apache.spark.network.server.{TransportServer, TransportServerBootstrap} import org.apache.spark.network.shuffle.ExternalShuffleBlockHandler import org.apache.spark.network.util.TransportConf import org.apache.spark.util.{ShutdownHookManager, Utils} private[spark] def main( args: Array[String], newShuffleService: (SparkConf, SecurityManager) => ExternalShuffleService): Unit = { Utils.initDaemon(log) val sparkConf = new SparkConf Utils.loadDefaultSparkProperties(sparkConf) val securityManager = new SecurityManager(sparkConf) // we override this value since this service is started from the command line // and we assume the user really wants it to be running sparkConf.set("spark.shuffle.service.enabled", "true") server = newShuffleService(sparkConf, securityManager) server.start() logDebug("Adding shutdown hook") // force eager creation of logger ShutdownHookManager.addShutdownHook { () => logInfo("Shutting down shuffle service.") server.stop() barrier.countDown() } // keep running until the process is terminated barrier.await() } }
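This main() is what sbin/start-shuffle-service.sh runs on each worker host; applications then opt in purely through configuration. A hedged sketch of the application side (intended to be submitted with spark-submit against a cluster whose workers run the service; the port shown is the conventional default):

import org.apache.spark.{SparkConf, SparkContext}

object ExternalShuffleClientSketch {
  def main(args: Array[String]): Unit = {
    // With the service enabled, executors register with it and shuffle blocks are
    // served by the external process, so shuffle files remain reachable after
    // executor loss (useful with dynamic allocation).
    val conf = new SparkConf()
      .setAppName("external-shuffle-sketch")                // master comes from spark-submit
      .set("spark.shuffle.service.enabled", "true")
      .set("spark.shuffle.service.port", "7337")            // default port, shown for clarity

    val sc = new SparkContext(conf)
    println(sc.parallelize(1 to 1000, 10).map(x => (x % 10, x)).reduceByKey(_ + _).count())
    sc.stop()
  }
}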
Example 68
Source File: FileSystemPersistenceEngine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 69
Source File: RecoveryModeFactory.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
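Which factory the standalone Master uses is selected by spark.deploy.recoveryMode; the filesystem flavour above also needs spark.deploy.recoveryDirectory, while the ZooKeeper flavour relies on the spark.deploy.zookeeper.* keys read in Examples 66, 72 and 73. These settings are normally passed to the Master process (for example via SPARK_MASTER_OPTS or a properties file); the snippet below only illustrates the relevant keys on a SparkConf, with placeholder paths and hosts:

import org.apache.spark.SparkConf

object RecoveryModeConfigSketch {
  def main(args: Array[String]): Unit = {
    // FILESYSTEM recovery: Master state is serialized into the recovery directory
    // by FileSystemPersistenceEngine and reloaded when the Master restarts.
    val fileSystemRecovery = new SparkConf()
      .set("spark.deploy.recoveryMode", "FILESYSTEM")
      .set("spark.deploy.recoveryDirectory", "/var/spark/recovery")   // illustrative path

    // ZOOKEEPER recovery: state goes to ZooKeeper and leader election is handled
    // by ZooKeeperLeaderElectionAgent.
    val zooKeeperRecovery = new SparkConf()
      .set("spark.deploy.recoveryMode", "ZOOKEEPER")
      .set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181")          // illustrative quorum
      .set("spark.deploy.zookeeper.dir", "/spark")

    Seq(fileSystemRecovery, zooKeeperRecovery).foreach(c => println(c.toDebugString))
  }
}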
Example 70
Source File: MasterArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import scala.annotation.tailrec import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.{IntParam, Utils} private def printUsageAndExit(exitCode: Int) { // scalastyle:off println System.err.println( "Usage: Master [options]\n" + "\n" + "Options:\n" + " -i HOST, --ip HOST Hostname to listen on (deprecated, please use --host or -h) \n" + " -h HOST, --host HOST Hostname to listen on\n" + " -p PORT, --port PORT Port to listen on (default: 7077)\n" + " --webui-port PORT Port for web UI (default: 8080)\n" + " --properties-file FILE Path to a custom Spark properties file.\n" + " Default is conf/spark-defaults.conf.") // scalastyle:on println System.exit(exitCode) } }
Example 71
Source File: MasterWebUI.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import scala.collection.mutable.HashMap import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.deploy.master.Master import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def initialize() { val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler( "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST"))) attachHandler(createRedirectHandler( "/driver/kill", "/", masterPage.handleDriverKillRequest, httpMethods = Set("POST"))) } def addProxyTargets(id: String, target: String): Unit = { var endTarget = target.stripSuffix("/") val handler = createProxyHandler("/proxy/" + id, endTarget) attachHandler(handler) proxyHandlers(id) = handler } def removeProxyTargets(id: String): Unit = { proxyHandlers.remove(id).foreach(detachHandler) } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 72
Source File: ZooKeeperLeaderElectionAgent.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.curator.framework.CuratorFramework import org.apache.curator.framework.recipes.leader.{LeaderLatch, LeaderLatchListener} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable, conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election" private var zk: CuratorFramework = _ private var leaderLatch: LeaderLatch = _ private var status = LeadershipStatus.NOT_LEADER start() private def start() { logInfo("Starting ZooKeeper LeaderElection agent") zk = SparkCuratorUtil.newClient(conf) leaderLatch = new LeaderLatch(zk, WORKING_DIR) leaderLatch.addListener(this) leaderLatch.start() } override def stop() { leaderLatch.close() zk.close() } override def isLeader() { synchronized { // could have lost leadership by now. if (!leaderLatch.hasLeadership) { return } logInfo("We have gained leadership") updateLeadershipStatus(true) } } override def notLeader() { synchronized { // could have gained leadership by now. if (leaderLatch.hasLeadership) { return } logInfo("We have lost leadership") updateLeadershipStatus(false) } } private def updateLeadershipStatus(isLeader: Boolean) { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER masterInstance.electedLeader() } else if (!isLeader && status == LeadershipStatus.LEADER) { status = LeadershipStatus.NOT_LEADER masterInstance.revokedLeadership() } } private object LeadershipStatus extends Enumeration { type LeadershipStatus = Value val LEADER, NOT_LEADER = Value } }
Example 73
Source File: ZooKeeperPersistenceEngine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T]) } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } }
Example 74
Source File: WorkerWebUI.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils def initialize() { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr, worker.conf)) } } private[worker] object WorkerWebUI { val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR val DEFAULT_RETAINED_DRIVERS = 1000 val DEFAULT_RETAINED_EXECUTORS = 1000 }
Example 75
Source File: WorkerWatcher.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import org.apache.spark.internal.Logging import org.apache.spark.rpc._ private[spark] class WorkerWatcher( override val rpcEnv: RpcEnv, workerUrl: String, isTesting: Boolean = false) extends RpcEndpoint with Logging { logInfo(s"Connecting to worker $workerUrl") if (!isTesting) { rpcEnv.asyncSetupEndpointRefByURI(workerUrl) } // Used to avoid shutting down JVM during tests // In the normal case, exitNonZero will call `System.exit(-1)` to shutdown the JVM. In the unit // test, the user should call `setTesting(true)` so that `exitNonZero` will set `isShutDown` to // true rather than calling `System.exit`. The user can check `isShutDown` to know if // `exitNonZero` is called. private[deploy] var isShutDown = false // Lets filter events only from the worker's rpc system private val expectedAddress = RpcAddress.fromURIString(workerUrl) private def isWorker(address: RpcAddress) = expectedAddress == address private def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1) override def receive: PartialFunction[Any, Unit] = { case e => logWarning(s"Received unexpected message: $e") } override def onConnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { logInfo(s"Successfully connected to $workerUrl") } } override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // This log message will never be seen logError(s"Lost connection to worker rpc endpoint $workerUrl. Exiting.") exitNonZero() } } override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // These logs may not be seen if the worker (and associated pipe) has died logError(s"Could not initialize connection to worker $workerUrl. Exiting.") logError(s"Error was: $cause") exitNonZero() } } }
Example 76
Source File: HistoryServerArguments.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import scala.annotation.tailrec import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging { private var propertiesFile: String = null parse(args.toList) @tailrec private def parse(args: List[String]): Unit = { if (args.length == 1) { setLogDirectory(args.head) } else { args match { case ("--dir" | "-d") :: value :: tail => setLogDirectory(value) parse(tail) case ("--help" | "-h") :: tail => printUsageAndExit(0) case ("--properties-file") :: value :: tail => propertiesFile = value parse(tail) case Nil => case _ => printUsageAndExit(1) } } } private def setLogDirectory(value: String): Unit = { logWarning("Setting log directory through the command line is deprecated as of " + "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") conf.set("spark.history.fs.logDirectory", value) } // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) private def printUsageAndExit(exitCode: Int) { // scalastyle:off println System.err.println( """ |Usage: HistoryServer [options] | |Options: | DIR Deprecated; set spark.history.fs.logDirectory directly | --dir DIR (-d DIR) Deprecated; set spark.history.fs.logDirectory directly | --properties-file FILE Path to a custom Spark properties file. | Default is conf/spark-defaults.conf. | |Configuration options can be set by setting the corresponding JVM system property. |History Server options are always available; additional options depend on the provider. | |History Server options: | | spark.history.ui.port Port where server will listen for connections | (default 18080) | spark.history.acls.enable Whether to enable view acls for all applications | (default false) | spark.history.provider Name of history provider class (defaults to | file system-based provider) | spark.history.retainedApplications Max number of application UIs to keep loaded in memory | (default 50) |FsHistoryProvider options: | | spark.history.fs.logDirectory Directory where app logs are stored | (default: file:/tmp/spark-events) | spark.history.fs.updateInterval How often to reload log data from storage | (in seconds, default: 10) |""".stripMargin) // scalastyle:on println System.exit(exitCode) } }
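HistoryServerArguments walks the remaining argument list with a tail-recursive pattern match rather than a mutable index. A small sketch of the same style for a hypothetical tool is below; the flags and the Config case class are made up for illustration.

// Minimal sketch of the @tailrec list-based argument parsing style above.
import scala.annotation.tailrec

object ArgParser {
  case class Config(dir: Option[String] = None, propertiesFile: Option[String] = None)

  @tailrec
  def parse(args: List[String], acc: Config = Config()): Config = args match {
    case ("--dir" | "-d") :: value :: tail => parse(tail, acc.copy(dir = Some(value)))
    case "--properties-file" :: value :: tail => parse(tail, acc.copy(propertiesFile = Some(value)))
    case Nil => acc
    case other :: _ => throw new IllegalArgumentException(s"Unrecognized option: $other")
  }
}

// e.g. ArgParser.parse(List("--dir", "/tmp/logs")) == ArgParser.Config(Some("/tmp/logs"), None)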
Example 77
Source File: LocalSparkCluster.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 78
Source File: SparkHadoopMapRedUtil.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val canCommit = outputCommitCoordinator.canCommit(jobId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, jobId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
Example 79
Source File: LocalSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.local import java.io.File import java.net.URL import java.nio.ByteBuffer import org.apache.spark.{SparkConf, SparkContext, SparkEnv, TaskState} import org.apache.spark.TaskState.TaskState import org.apache.spark.executor.{Executor, ExecutorBackend} import org.apache.spark.internal.Logging import org.apache.spark.launcher.{LauncherBackend, SparkAppHandle} import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler._ import org.apache.spark.scheduler.cluster.ExecutorInfo private case class ReviveOffers() private case class StatusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer) private case class KillTask(taskId: Long, interruptThread: Boolean) private case class StopExecutor() def getUserClasspath(conf: SparkConf): Seq[URL] = { val userClassPathStr = conf.getOption("spark.executor.extraClassPath") userClassPathStr.map(_.split(File.pathSeparator)).toSeq.flatten.map(new File(_).toURI.toURL) } launcherBackend.connect() override def start() { val rpcEnv = SparkEnv.get.rpcEnv val executorEndpoint = new LocalEndpoint(rpcEnv, userClassPath, scheduler, this, totalCores) localEndpoint = rpcEnv.setupEndpoint("LocalSchedulerBackendEndpoint", executorEndpoint) listenerBus.post(SparkListenerExecutorAdded( System.currentTimeMillis, executorEndpoint.localExecutorId, new ExecutorInfo(executorEndpoint.localExecutorHostname, totalCores, Map.empty))) launcherBackend.setAppId(appId) launcherBackend.setState(SparkAppHandle.State.RUNNING) } override def stop() { stop(SparkAppHandle.State.FINISHED) } override def reviveOffers() { localEndpoint.send(ReviveOffers) } override def defaultParallelism(): Int = scheduler.conf.getInt("spark.default.parallelism", totalCores) override def killTask(taskId: Long, executorId: String, interruptThread: Boolean) { localEndpoint.send(KillTask(taskId, interruptThread)) } override def statusUpdate(taskId: Long, state: TaskState, serializedData: ByteBuffer) { localEndpoint.send(StatusUpdate(taskId, state, serializedData)) } override def applicationId(): String = appId private def stop(finalState: SparkAppHandle.State): Unit = { localEndpoint.ask(StopExecutor) try { launcherBackend.setState(finalState) } finally { launcherBackend.close() } } }
Example 80
Source File: ShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.lang.management.ManagementFactory import java.nio.ByteBuffer import java.util.Properties import scala.language.existentials import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.shuffle.ShuffleWriter import org.apache.spark.storage.BlockManagerId def this(partitionId: Int) { this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null) } @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdd: RDD[_] = null var dep: ShuffleDependency[_, _, _] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val threadMXBean = ManagementFactory.getThreadMXBean val deserializeStartTime = System.currentTimeMillis() val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime } else 0L val ser = SparkEnv.get.closureSerializer.newInstance() val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])]( ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader) rdd = _rdd dep = _dep _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) { threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime } else 0L } override def runTask(context: TaskContext): MapStatus = { if (dep == null || rdd == null) { prepTask() } var writer: ShuffleWriter[Any, Any] = null try { val manager = SparkEnv.get.shuffleManager writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context) writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]]) val status = writer.stop(success = true).get FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId, dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false) status } catch { case e: Exception => try { if (writer != null) { writer.stop(success = false) } } catch { case e: Exception => log.debug("Could not stop writer", e) } throw e } } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId) } object ShuffleMapTask { def apply( stageId: Int, stageAttemptId: Int, partition: Partition, properties: Properties, internalAccumulatorsSer: Array[Byte], isFutureTask: Boolean, rdd: RDD[_], dep: ShuffleDependency[_, _, _], nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = { val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null, properties, internalAccumulatorsSer, isFutureTask, nextStageLocs) smt.rdd = rdd smt.dep = dep smt } }
Example 81
Source File: BlacklistTracker.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.internal.config import org.apache.spark.util.Utils private[scheduler] object BlacklistTracker extends Logging { private val DEFAULT_TIMEOUT = "1h" def validateBlacklistConfs(conf: SparkConf): Unit = { def mustBePos(k: String, v: String): Unit = { throw new IllegalArgumentException(s"$k was $v, but must be > 0.") } Seq( config.MAX_TASK_ATTEMPTS_PER_EXECUTOR, config.MAX_TASK_ATTEMPTS_PER_NODE, config.MAX_FAILURES_PER_EXEC_STAGE, config.MAX_FAILED_EXEC_PER_NODE_STAGE ).foreach { config => val v = conf.get(config) if (v <= 0) { mustBePos(config.key, v.toString) } } val timeout = getBlacklistTimeout(conf) if (timeout <= 0) { // first, figure out where the timeout came from, to include the right conf in the message. conf.get(config.BLACKLIST_TIMEOUT_CONF) match { case Some(t) => mustBePos(config.BLACKLIST_TIMEOUT_CONF.key, timeout.toString) case None => mustBePos(config.BLACKLIST_LEGACY_TIMEOUT_CONF.key, timeout.toString) } } val maxTaskFailures = conf.get(config.MAX_TASK_FAILURES) val maxNodeAttempts = conf.get(config.MAX_TASK_ATTEMPTS_PER_NODE) if (maxNodeAttempts >= maxTaskFailures) { throw new IllegalArgumentException(s"${config.MAX_TASK_ATTEMPTS_PER_NODE.key} " + s"( = ${maxNodeAttempts}) was >= ${config.MAX_TASK_FAILURES.key} " + s"( = ${maxTaskFailures} ). Though blacklisting is enabled, with this configuration, " + s"Spark will not be robust to one bad node. Decrease " + s"${config.MAX_TASK_ATTEMPTS_PER_NODE.key}, increase ${config.MAX_TASK_FAILURES.key}, " + s"or disable blacklisting with ${config.BLACKLIST_ENABLED.key}") } } }
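validateBlacklistConfs fails fast on non-positive or mutually inconsistent limits instead of letting a bad configuration surface later at runtime. A stripped-down sketch of that fail-fast check follows; the Map-backed "conf" and the key name are placeholders, not Spark's config entries.

// Fail-fast validation in the style of validateBlacklistConfs above.
def validatePositive(conf: Map[String, Long], keys: Seq[String]): Unit = {
  keys.foreach { k =>
    val v = conf.getOrElse(k, 0L)
    if (v <= 0) {
      throw new IllegalArgumentException(s"$k was $v, but must be > 0.")
    }
  }
}

// e.g. validatePositive(Map("max.task.attempts" -> 2L), Seq("max.task.attempts")) passes,
// while a zero, negative, or missing key throws immediately.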
Example 82
Source File: TaskDescription.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.collection.mutable import scala.collection.mutable.HashSet import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.serializer.SerializerInstance import org.apache.spark.util.SerializableBuffer private[spark] class TaskDescription( val taskId: Long, val attemptNumber: Int, val executorId: String, val name: String, val index: Int, // Index within this task's TaskSet val isFutureTask: Boolean, @transient private val _task: Task[_], @transient private val _addedFiles: mutable.Map[String, Long], @transient private val _addedJars: mutable.Map[String, Long], @transient private val _ser: SerializerInstance) extends Serializable with Logging { // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer private var buffer: SerializableBuffer = _ def prepareSerializedTask(): Unit = { if (_task != null) { val serializedTask: ByteBuffer = try { Task.serializeWithDependencies(_task, _addedFiles, _addedJars, _ser) } catch { // If the task cannot be serialized, then there is not point in re-attempting // the task as it will always fail. So just abort the task set. case NonFatal(e) => val msg = s"Failed to serialize the task $taskId, not attempting to retry it." logError(msg, e) // FIXME(shivaram): We dont have a handle to the taskSet here to abort it. throw new TaskNotSerializableException(e) } if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024) { logWarning(s"Stage ${_task.stageId} contains a task of very large size " + s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.") } buffer = new SerializableBuffer(serializedTask) } else { buffer = new SerializableBuffer(ByteBuffer.allocate(0)) } } def serializedTask: ByteBuffer = buffer.value override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index) }
Example 83
Source File: JobWaiter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.concurrent.atomic.AtomicInteger import scala.concurrent.{Future, Promise} import org.apache.spark.internal.Logging def cancel() { dagScheduler.cancelJob(jobId) } override def taskSucceeded(index: Int, result: Any): Unit = { // resultHandler call must be synchronized in case resultHandler itself is not thread safe. synchronized { resultHandler(index, result.asInstanceOf[T]) } if (finishedTasks.incrementAndGet() == totalTasks) { jobPromise.success(()) } } override def jobFailed(exception: Exception): Unit = { if (!jobPromise.tryFailure(exception)) { logWarning("Ignore failure", exception) } } }
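JobWaiter counts finished tasks with an AtomicInteger and completes a Promise once the count reaches the total; jobFailed uses tryFailure so only the first reported failure wins. A self-contained sketch of that pattern, with illustrative names rather than the Spark class, is below.

// Sketch of the count-down-to-a-Promise pattern used by JobWaiter above.
import java.util.concurrent.atomic.AtomicInteger
import scala.concurrent.{Future, Promise}

class SimpleJobWaiter[T](totalTasks: Int, resultHandler: (Int, T) => Unit) {
  private val finishedTasks = new AtomicInteger(0)
  private val jobPromise = Promise[Unit]()

  def completionFuture: Future[Unit] = jobPromise.future

  def taskSucceeded(index: Int, result: T): Unit = {
    synchronized { resultHandler(index, result) } // the handler may not be thread safe
    if (finishedTasks.incrementAndGet() == totalTasks) {
      jobPromise.success(())
    }
  }

  def jobFailed(exception: Exception): Unit = {
    // tryFailure: only the first reported failure completes the promise
    jobPromise.tryFailure(exception)
  }
}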
Example 84
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark._ import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockManagerId import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.storage.StorageLevel private[spark] object FutureTaskNotifier extends Logging { def taskCompleted( status: MapStatus, mapId: Int, shuffleId: Int, numReduces: Int, nextStageLocs: Option[Seq[BlockManagerId]], shuffleWriteMetrics: ShuffleWriteMetrics, skipZeroByteNotifications: Boolean): Unit = { if (!nextStageLocs.isEmpty && numReduces == nextStageLocs.get.length) { val drizzleRpcsStart = System.nanoTime sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs, skipZeroByteNotifications) shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart) } else { logInfo( s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces") } } // Push metadata saying that this map task finished, so that the tasks in the next stage // know they can begin pulling the data. private def sendMapStatusToNextTaskLocations( status: MapStatus, mapId: Int, shuffleId: Int, numReduces: Int, nextStageLocs: Option[Seq[BlockManagerId]], skipZeroByteNotifications: Boolean) { val numReduces = nextStageLocs.get.length val uniqueLocations = if (skipZeroByteNotifications) { nextStageLocs.get.zipWithIndex.filter { x => status.getSizeForBlock(x._2) != 0L }.map(_._1).toSet } else { nextStageLocs.get.toSet } uniqueLocations.foreach { blockManagerId => try { SparkEnv.get.blockManager.blockTransferService.mapOutputReady( blockManagerId.host, blockManagerId.port, shuffleId, mapId, numReduces, status) } catch { case e: Exception => logWarning(s"Failed to send map outputs to $blockManagerId", e) } } } }
Example 85
Source File: BatchShuffleMapTask.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import java.util.Properties import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.broadcast.Broadcast import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.BlockManagerId private[spark] class BatchShuffleMapTask( stageId: Int, stageAttemptId: Int, taskBinaries: Broadcast[Array[Byte]], partitions: Array[Partition], partitionId: Int, @transient private var locs: Seq[TaskLocation], internalAccumulatorsSer: Array[Byte], localProperties: Properties, isFutureTask: Boolean, nextStageLocs: Option[Seq[BlockManagerId]] = None, depShuffleIds: Option[Seq[Seq[Int]]] = None, depShuffleNumMaps: Option[Seq[Int]] = None, jobId: Option[Int] = None, appId: Option[String] = None, appAttemptId: Option[String] = None) extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer, localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId) with BatchTask with Logging { @transient private val preferredLocs: Seq[TaskLocation] = { if (locs == null) Nil else locs.toSet.toSeq } var rdds: Array[RDD[_]] = null var deps: Array[ShuffleDependency[_, _, _]] = null override def prepTask(): Unit = { // Deserialize the RDD using the broadcast variable. val ser = SparkEnv.get.closureSerializer.newInstance() val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])]( ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader) rdds = rddI deps = depI } def getTasks(): Seq[Task[Any]] = { if (deps == null || rdds == null) { prepTask() } (0 until partitions.length).map { i => val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties, internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs) s.epoch = epoch s }.map(_.asInstanceOf[Task[Any]]) } override def runTask(context: TaskContext): Array[MapStatus] = { throw new RuntimeException("BatchShuffleMapTasks should not be run!") } override def preferredLocations: Seq[TaskLocation] = preferredLocs override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId) }
Example 86
Source File: FutureTaskWaiter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashSet import org.apache.spark.internal.Logging import org.apache.spark.MapOutputTracker import org.apache.spark.SparkConf import org.apache.spark.storage.BlockManager import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.TimeStampedHashMap private[spark] case class FutureTaskInfo(shuffleId: Int, numMaps: Int, reduceId: Int, taskId: Long, nonZeroPartitions: Option[Array[Int]], taskCb: () => Unit) private[spark] class FutureTaskWaiter( conf: SparkConf, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends Logging { // Key is (shuffleId, reduceId) private val futureTaskInfo = new TimeStampedHashMap[(Int, Int), FutureTaskInfo] // Key is (shuffleId, reduceId), value is the set of blockIds we are waiting for private val futureTasksBlockWait = new TimeStampedHashMap[(Int, Int), HashSet[Int]] def submitFutureTask(info: FutureTaskInfo) { futureTasksBlockWait.synchronized { val blocksToWaitFor = if (info.nonZeroPartitions.isDefined) { info.nonZeroPartitions.get.toSet } else { (0 until info.numMaps).toArray.toSet } // Check if all the blocks already exist. If so just trigger taskCb // Count how many outputs have been registered with the MapOutputTracker for this shuffle // and intersect with blocksToWaitFor to only get how many for this reduce are available val availableBlocks = mapOutputTracker.getAvailableMapOutputs(info.shuffleId).intersect(blocksToWaitFor) val mapsToWait = blocksToWaitFor.size val numMapsPending = blocksToWaitFor.size - availableBlocks.size if (availableBlocks.size >= mapsToWait) { info.taskCb() } else { futureTaskInfo.put((info.shuffleId, info.reduceId), info) // NOTE: Its fine not to synchronize here as two future tasks shouldn't be submitted at the // same time Calculate the number of blocks to wait for before starting future task val waitForBlocks = blocksToWaitFor.diff(availableBlocks) futureTasksBlockWait.put( (info.shuffleId, info.reduceId), new HashSet[Int]() ++ waitForBlocks) } } } def shuffleBlockReady(shuffleBlockId: ShuffleBlockId): Unit = { val key = (shuffleBlockId.shuffleId, shuffleBlockId.reduceId) futureTasksBlockWait.synchronized { if (futureTaskInfo.contains(key)) { if (futureTasksBlockWait.contains(key)) { futureTasksBlockWait(key) -= shuffleBlockId.mapId // If we have all the blocks, run the CB if (futureTasksBlockWait(key).size <= 0) { val cb = futureTaskInfo(key).taskCb futureTasksBlockWait.remove(key) futureTaskInfo.remove(key) cb() } } } } } def addMapStatusAvailable(shuffleId: Int, mapId: Int, numReduces: Int, mapStatus: MapStatus) { // NOTE: This should be done before we trigger future tasks. mapOutputTracker.addStatus(shuffleId, mapId, mapStatus) futureTasksBlockWait.synchronized { // Register the output for each reduce task. (0 until numReduces).foreach { reduceId => shuffleBlockReady(new ShuffleBlockId(shuffleId, mapId, reduceId)) } } } }
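FutureTaskWaiter keys pending work by (shuffleId, reduceId) and fires the stored callback once the set of awaited map outputs drains to empty. A simplified, Spark-free sketch of that bookkeeping is below; the keys and types are illustrative.

// Simplified sketch of the "fire a callback once the awaited set drains" bookkeeping above.
import scala.collection.mutable

class PendingWaiter {
  // key -> (blocks still missing, callback to run when none are left)
  private val pending = mutable.Map.empty[(Int, Int), (mutable.Set[Int], () => Unit)]

  def submit(key: (Int, Int), await: Set[Int], callback: () => Unit): Unit = synchronized {
    if (await.isEmpty) callback()
    else pending(key) = (mutable.Set(await.toSeq: _*), callback)
  }

  def blockReady(key: (Int, Int), blockId: Int): Unit = synchronized {
    pending.get(key).foreach { case (missing, callback) =>
      missing -= blockId
      if (missing.isEmpty) {
        pending.remove(key)
        callback()
      }
    }
  }
}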
Example 87
Source File: ReplayListenerBus.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io.{InputStream, IOException} import scala.io.Source import com.fasterxml.jackson.core.JsonParseException import org.json4s.jackson.JsonMethods._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.ReplayListenerBus._ import org.apache.spark.util.JsonProtocol def replay( logData: InputStream, sourceName: String, maybeTruncated: Boolean = false, eventsFilter: ReplayEventsFilter = SELECT_ALL_FILTER): Unit = { var currentLine: String = null var lineNumber: Int = 0 try { val lineEntries = Source.fromInputStream(logData) .getLines() .zipWithIndex .filter { case (line, _) => eventsFilter(line) } while (lineEntries.hasNext) { try { val entry = lineEntries.next() currentLine = entry._1 lineNumber = entry._2 + 1 postToAll(JsonProtocol.sparkEventFromJson(parse(currentLine))) } catch { case jpe: JsonParseException => // We can only ignore exception from last line of the file that might be truncated // the last entry may not be the very last line in the event log, but we treat it // as such in a best effort to replay the given input if (!maybeTruncated || lineEntries.hasNext) { throw jpe } else { logWarning(s"Got JsonParseException from log file $sourceName" + s" at line $lineNumber, the file might not have finished writing cleanly.") } } } } catch { case ioe: IOException => throw ioe case e: Exception => logError(s"Exception parsing Spark event log: $sourceName", e) logError(s"Malformed line #$lineNumber: $currentLine\n") } } } private[spark] object ReplayListenerBus { type ReplayEventsFilter = (String) => Boolean // utility filter that selects all event logs during replay val SELECT_ALL_FILTER: ReplayEventsFilter = { (eventString: String) => true } }
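replay() tracks the current line number so that a parse failure on the very last line of a possibly truncated event log is tolerated, while a failure in the middle still aborts. A sketch of that tolerant loop follows; parseEvent is a stand-in for JsonProtocol.sparkEventFromJson, and println stands in for logWarning.

// Sketch of the "tolerate a bad last line only if the log may be truncated" loop above.
def parseEvent(line: String): String =
  if (line.trim.endsWith("}")) line else throw new IllegalArgumentException(s"Bad event: $line")

def replayLines(lines: Iterator[String], maybeTruncated: Boolean)(handle: String => Unit): Unit = {
  val entries = lines.zipWithIndex
  while (entries.hasNext) {
    val (line, idx) = entries.next()
    try {
      handle(parseEvent(line))
    } catch {
      case e: IllegalArgumentException =>
        // Only the final line of a truncated log is allowed to be malformed.
        if (!maybeTruncated || entries.hasNext) throw e
        else println(s"Ignoring malformed final line ${idx + 1}")
    }
  }
}

// e.g. replayLines(Iterator("{\"a\":1}", "{\"b\":"), maybeTruncated = true)(println)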
Example 88
Source File: SparkUncaughtExceptionHandler.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.internal.Logging private[spark] object SparkUncaughtExceptionHandler extends Thread.UncaughtExceptionHandler with Logging { override def uncaughtException(thread: Thread, exception: Throwable) { try { // Make it explicit that uncaught exceptions are thrown when container is shutting down. // It will help users when they analyze the executor logs val inShutdownMsg = if (ShutdownHookManager.inShutdown()) "[Container in shutdown] " else "" val errMsg = "Uncaught exception in thread " logError(inShutdownMsg + errMsg + thread, exception) // We may have been called from a shutdown hook. If so, we must not call System.exit(). // (If we do, we will deadlock.) if (!ShutdownHookManager.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(SparkExitCode.OOM) } else { System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) } } } catch { case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) } } def uncaughtException(exception: Throwable) { uncaughtException(Thread.currentThread(), exception) } }
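The handler above treats OutOfMemoryError differently from other throwables and avoids System.exit while a shutdown hook is running. The sketch below shows how such a handler might be installed as the JVM-wide default; the exit codes are placeholders rather than Spark's SparkExitCode values.

// Sketch of installing a default uncaught-exception handler like the one above.
object ExampleUncaughtHandler extends Thread.UncaughtExceptionHandler {
  private val OomExit = 52       // placeholder values, not SparkExitCode
  private val UncaughtExit = 50

  override def uncaughtException(thread: Thread, exception: Throwable): Unit = {
    System.err.println(s"Uncaught exception in thread ${thread.getName}: $exception")
    exception match {
      case _: OutOfMemoryError => Runtime.getRuntime.halt(OomExit) // halt: skip shutdown hooks
      case _ => System.exit(UncaughtExit)
    }
  }

  def install(): Unit = Thread.setDefaultUncaughtExceptionHandler(this)
}

// Call ExampleUncaughtHandler.install() early in main() so every thread inherits it.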
Example 89
Source File: TopologyMapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.util.Utils @DeveloperApi class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging { val topologyFile = conf.getOption("spark.storage.replication.topologyFile") require(topologyFile.isDefined, "Please specify topology file via " + "spark.storage.replication.topologyFile for FileBasedTopologyMapper.") val topologyMap = Utils.getPropertiesFromFile(topologyFile.get) override def getTopologyForHost(hostname: String): Option[String] = { val topology = topologyMap.get(hostname) if (topology.isDefined) { logDebug(s"$hostname -> ${topology.get}") } else { logWarning(s"$hostname does not have any topology information") } topology } }
Example 90
Source File: BlockManagerSlaveEndpoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.onSuccess { case response => logDebug("Done " + actionMessage + ", response is " + response) context.reply(response) logDebug("Sent response: " + response + " to " + context.senderAddress) } future.onFailure { case t: Throwable => logError("Error in " + actionMessage, t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 91
Source File: DiskStore.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import java.io.{FileOutputStream, IOException, RandomAccessFile} import java.nio.ByteBuffer import java.nio.channels.FileChannel.MapMode import com.google.common.io.Closeables import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils import org.apache.spark.util.io.ChunkedByteBuffer def put(blockId: BlockId)(writeFunc: FileOutputStream => Unit): Unit = { if (contains(blockId)) { throw new IllegalStateException(s"Block $blockId is already present in the disk store") } logDebug(s"Attempting to put block $blockId") val startTime = System.currentTimeMillis val file = diskManager.getFile(blockId) val fileOutputStream = new FileOutputStream(file) var threwException: Boolean = true try { writeFunc(fileOutputStream) threwException = false } finally { try { Closeables.close(fileOutputStream, threwException) } finally { if (threwException) { remove(blockId) } } } val finishTime = System.currentTimeMillis logDebug("Block %s stored as %s file on disk in %d ms".format( file.getName, Utils.bytesToString(file.length()), finishTime - startTime)) } def putBytes(blockId: BlockId, bytes: ChunkedByteBuffer): Unit = { put(blockId) { fileOutputStream => val channel = fileOutputStream.getChannel Utils.tryWithSafeFinally { bytes.writeFully(channel) } { channel.close() } } } def getBytes(blockId: BlockId): ChunkedByteBuffer = { val file = diskManager.getFile(blockId.name) val channel = new RandomAccessFile(file, "r").getChannel Utils.tryWithSafeFinally { // For small files, directly read rather than memory map if (file.length < minMemoryMapBytes) { val buf = ByteBuffer.allocate(file.length.toInt) channel.position(0) while (buf.remaining() != 0) { if (channel.read(buf) == -1) { throw new IOException("Reached EOF before filling buffer\n" + s"offset=0\nfile=${file.getAbsolutePath}\nbuf.remaining=${buf.remaining}") } } buf.flip() new ChunkedByteBuffer(buf) } else { new ChunkedByteBuffer(channel.map(MapMode.READ_ONLY, 0, file.length)) } } { channel.close() } } def remove(blockId: BlockId): Boolean = { val file = diskManager.getFile(blockId.name) if (file.exists()) { val ret = file.delete() if (!ret) { logWarning(s"Error deleting ${file.getPath()}") } ret } else { false } } def contains(blockId: BlockId): Boolean = { val file = diskManager.getFile(blockId.name) file.exists() } }
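getBytes above reads small files directly into a heap buffer but memory-maps larger ones to avoid copying. A standalone sketch of that size-based switch using NIO is below; the 2 MB threshold is arbitrary, whereas DiskStore reads its cutoff from configuration.

// Sketch of the "read small files, memory-map large ones" switch in getBytes above.
import java.io.{File, IOException, RandomAccessFile}
import java.nio.ByteBuffer
import java.nio.channels.FileChannel.MapMode

def readFileBytes(file: File, mapThreshold: Long = 2L * 1024 * 1024): ByteBuffer = {
  val channel = new RandomAccessFile(file, "r").getChannel
  try {
    if (file.length < mapThreshold) {
      val buf = ByteBuffer.allocate(file.length.toInt)
      while (buf.remaining() != 0) {
        if (channel.read(buf) == -1) {
          throw new IOException(s"Reached EOF before filling buffer for ${file.getAbsolutePath}")
        }
      }
      buf.flip()
      buf
    } else {
      channel.map(MapMode.READ_ONLY, 0, file.length)
    }
  } finally {
    channel.close()
  }
}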
Example 92
Source File: BlockReplicationPolicy.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.collection.mutable import scala.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = { val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) {case (set, i) => val t = r.nextInt(i) + 1 if (set.contains(t)) set + i else set + t } // we shuffle the result to ensure a random arrangement within the sample // to avoid any bias from set implementations r.shuffle(indices.map(_ - 1).toList) } }
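getSampleIds above appears to be Floyd's algorithm for drawing m distinct values without materializing the whole range; the +1/-1 shifts convert between 1-based sampling and 0-based indices, and the final shuffle removes any ordering bias from the Set implementation. The same logic is repeated below as a standalone helper so it can be tried outside the private object.

// Standalone sketch of the sampling helper above: pick m distinct indices in [0, n).
import scala.util.Random

def sampleIndices(n: Int, m: Int, r: Random): List[Int] = {
  val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) { case (set, i) =>
    val t = r.nextInt(i) + 1
    if (set.contains(t)) set + i else set + t
  }
  // shuffle so the order carries no bias from the set implementation
  r.shuffle(indices.map(_ - 1).toList)
}

// e.g. sampleIndices(10, 3, new Random(42)) returns 3 distinct values in 0..9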
Example 93
Source File: OrderedRDDFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 94
Source File: SequenceFileRDDFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.{classTag, ClassTag} import org.apache.hadoop.io.Writable import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.internal.Logging def saveAsSequenceFile( path: String, codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope { def anyToWritable[U <% Writable](u: U): Writable = u // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and // valueWritableClass at the compile time. To implement that, we need to add type parameters to // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a // breaking change. val convertKey = self.keyClass != keyWritableClass val convertValue = self.valueClass != valueWritableClass logInfo("Saving as sequence file of type (" + keyWritableClass.getSimpleName + "," + valueWritableClass.getSimpleName + ")" ) val format = classOf[SequenceFileOutputFormat[Writable, Writable]] val jobConf = new JobConf(self.context.hadoopConfiguration) if (!convertKey && !convertValue) { self.saveAsHadoopFile(path, keyWritableClass, valueWritableClass, format, jobConf, codec) } else if (!convertKey && convertValue) { self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile( path, keyWritableClass, valueWritableClass, format, jobConf, codec) } else if (convertKey && !convertValue) { self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile( path, keyWritableClass, valueWritableClass, format, jobConf, codec) } else if (convertKey && convertValue) { self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile( path, keyWritableClass, valueWritableClass, format, jobConf, codec) } } }
Example 95
Source File: TaskContextImpl.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.executor.TaskMetrics import org.apache.spark.internal.Logging import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.util._ private[spark] class TaskContextImpl( val stageId: Int, val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, var _taskMemoryManager: TaskMemoryManager, localProperties: Properties, @transient private val metricsSystem: MetricsSystem, // The default value is only used in tests. override val taskMetrics: TaskMetrics = TaskMetrics.empty, var batchId: Int = 0) extends TaskContext with Logging { private[spark] def markInterrupted(): Unit = { interrupted = true } override def isCompleted(): Boolean = completed override def isRunningLocally(): Boolean = false override def isInterrupted(): Boolean = interrupted override def getLocalProperty(key: String): String = localProperties.getProperty(key) override def getMetricsSources(sourceName: String): Seq[Source] = metricsSystem.getSourcesByName(sourceName) private[spark] override def registerAccumulator(a: AccumulatorV2[_, _]): Unit = { taskMetrics.registerAccumulator(a) } }
Example 96
Source File: SparkFunSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark // scalastyle:off import java.io.File import org.scalatest.{BeforeAndAfterAll, FunSuite, Outcome} import org.apache.spark.internal.Logging import org.apache.spark.util.AccumulatorContext final protected override def withFixture(test: NoArgTest): Outcome = { val testName = test.text val suiteName = this.getClass.getName val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") try { logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") test() } finally { logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") } } }
Example 97
Source File: SparkFunSuite.scala From spark-alchemy with Apache License 2.0 | 5 votes |
package org.apache.spark // scalastyle:off import java.io.File import scala.annotation.tailrec import org.apache.log4j.{Appender, Level, Logger} import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, Outcome, Suite} import org.apache.spark.internal.Logging import org.apache.spark.internal.config.Tests.IS_TESTING import org.apache.spark.util.{AccumulatorContext, Utils} protected def withLogAppender( appender: Appender, loggerName: Option[String] = None, level: Option[Level] = None)( f: => Unit): Unit = { val logger = loggerName.map(Logger.getLogger).getOrElse(Logger.getRootLogger) val restoreLevel = logger.getLevel logger.addAppender(appender) if (level.isDefined) { logger.setLevel(level.get) } try f finally { logger.removeAppender(appender) if (level.isDefined) { logger.setLevel(restoreLevel) } } } }
Example 98
Source File: TestBroadCast.scala From asyspark with MIT License | 5 votes |
package org.apache.spark.examples import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession import scala.collection.mutable object TestBroadCast extends Logging{ val sparkSession = SparkSession.builder().appName("test BoradCast").getOrCreate() val sc = sparkSession.sparkContext def main(args: Array[String]): Unit = { // val data = sc.parallelize(Seq(1 until 10000000)) val num = args(args.length - 2).toInt val times = args(args.length -1).toInt println(num) val start = System.nanoTime() val seq =Seq(1 until num) for(i <- 0 until times) { val start2 = System.nanoTime() val bc = sc.broadcast(seq) val rdd = sc.parallelize(1 until 10, 5) rdd.map(_ => bc.value.take(1)).collect() println((System.nanoTime() - start2)/ 1e6 + "ms") } logInfo((System.nanoTime() - start) / 1e6 + "ms") } def testMap(): Unit ={ val smallRDD = sc.parallelize(Seq(1,2,3)) val bigRDD = sc.parallelize(Seq(1 until 20)) bigRDD.mapPartitions { partition => val hashMap = new mutable.HashMap[Int,Int]() for(ele <- smallRDD) { hashMap(ele) = ele } // some operation here partition } } }
Example 99
Source File: DeltaPushFilter.scala From connectors with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta import scala.collection.immutable.HashSet import scala.collection.JavaConverters._ import org.apache.hadoop.hive.ql.exec.{FunctionRegistry, SerializationUtilities} import org.apache.hadoop.hive.ql.lib._ import org.apache.hadoop.hive.ql.parse.SemanticException import org.apache.hadoop.hive.ql.plan.{ExprNodeColumnDesc, ExprNodeConstantDesc, ExprNodeGenericFuncDesc} import org.apache.hadoop.hive.ql.udf.generic._ import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{And, EqualNullSafe, EqualTo, Expression, GreaterThan, GreaterThanOrEqual, InSet, LessThan, LessThanOrEqual, Like, Literal, Not} object DeltaPushFilter extends Logging { lazy val supportedPushDownUDFs = Array( "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPLessThan", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPGreaterThan", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPNotEqual", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualNS", "org.apache.hadoop.hive.ql.udf.UDFLike", "org.apache.hadoop.hive.ql.udf.generic.GenericUDFIn" ) def partitionFilterConverter(hiveFilterExprSeriablized: String): Seq[Expression] = { if (hiveFilterExprSeriablized != null) { val filterExpr = SerializationUtilities.deserializeExpression(hiveFilterExprSeriablized) val opRules = new java.util.LinkedHashMap[Rule, NodeProcessor]() val nodeProcessor = new NodeProcessor() { @throws[SemanticException] def process(nd: Node, stack: java.util.Stack[Node], procCtx: NodeProcessorCtx, nodeOutputs: Object*): Object = { nd match { case e: ExprNodeGenericFuncDesc if FunctionRegistry.isOpAnd(e) => nodeOutputs.map(_.asInstanceOf[Expression]).reduce(And) case e: ExprNodeGenericFuncDesc => val (columnDesc, constantDesc) = if (nd.getChildren.get(0).isInstanceOf[ExprNodeColumnDesc]) { (nd.getChildren.get(0), nd.getChildren.get(1)) } else { (nd.getChildren.get(1), nd.getChildren.get(0)) } val columnAttr = UnresolvedAttribute( columnDesc.asInstanceOf[ExprNodeColumnDesc].getColumn) val constantVal = Literal(constantDesc.asInstanceOf[ExprNodeConstantDesc].getValue) nd.asInstanceOf[ExprNodeGenericFuncDesc].getGenericUDF match { case f: GenericUDFOPNotEqualNS => Not(EqualNullSafe(columnAttr, constantVal)) case f: GenericUDFOPNotEqual => Not(EqualTo(columnAttr, constantVal)) case f: GenericUDFOPEqualNS => EqualNullSafe(columnAttr, constantVal) case f: GenericUDFOPEqual => EqualTo(columnAttr, constantVal) case f: GenericUDFOPGreaterThan => GreaterThan(columnAttr, constantVal) case f: GenericUDFOPEqualOrGreaterThan => GreaterThanOrEqual(columnAttr, constantVal) case f: GenericUDFOPLessThan => LessThan(columnAttr, constantVal) case f: GenericUDFOPEqualOrLessThan => LessThanOrEqual(columnAttr, constantVal) case f: GenericUDFBridge if f.getUdfName.equals("like") => Like(columnAttr, constantVal) case f: GenericUDFIn => val inConstantVals = nd.getChildren.asScala .filter(_.isInstanceOf[ExprNodeConstantDesc]) .map(_.asInstanceOf[ExprNodeConstantDesc].getValue) .map(Literal(_)).toSet InSet(columnAttr, HashSet() ++ inConstantVals) case _ => throw new RuntimeException(s"Unsupported func(${nd.getName}) " + s"which can not be pushed down to delta") } case _ => null } } } val disp = new DefaultRuleDispatcher(nodeProcessor, opRules, null) val ogw = new DefaultGraphWalker(disp) val topNodes = new java.util.ArrayList[Node]() topNodes.add(filterExpr) val nodeOutput = new java.util.HashMap[Node, Object]() try { ogw.startWalking(topNodes, nodeOutput) } catch { case ex: Exception => throw new RuntimeException(ex) } logInfo(s"converted partition filter expr:" + s"${nodeOutput.get(filterExpr).asInstanceOf[Expression].toJSON}") Seq(nodeOutput.get(filterExpr).asInstanceOf[Expression]) } else Seq.empty[org.apache.spark.sql.catalyst.expressions.Expression] } }
Example 100
Source File: SparkFunSuite.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io.File import org.apache.spark.internal.Logging import org.scalatest._ import org.slf4j.Logger abstract class SparkFunSuite extends FunSuite with Logging { protected val logger: Logger = log final protected override def withFixture(test: NoArgTest): Outcome = { val testName = test.text val suiteName = this.getClass.getName val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") try { logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") test() } finally { logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") } } protected final def getTestResourcePath(file: String): String = getTestResourceFile(file).getCanonicalPath // helper function protected final def getTestResourceFile(file: String): File = new File(getClass.getClassLoader.getResource(file).getFile) }
Example 101
Source File: CustomReceiver.scala From Learning-Spark-SQL with MIT License | 5 votes |
import java.io.{BufferedReader, InputStreamReader} import java.net.Socket import java.nio.charset.StandardCharsets import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.receiver.Receiver private def receive() { var socket: Socket = null var userInput: String = null try { println("Connecting to " + host + ":" + port) socket = new Socket(host, port) println("Connected to " + host + ":" + port) val reader = new BufferedReader( new InputStreamReader(socket.getInputStream(), StandardCharsets.UTF_8)) userInput = reader.readLine() while(!isStopped && userInput != null) { store(userInput) userInput = reader.readLine() } reader.close() socket.close() println("Stopped receiving") restart("Trying to connect again") } catch { case e: java.net.ConnectException => restart("Error connecting to " + host + ":" + port, e) case t: Throwable => restart("Error receiving data", t) } } }
Example 102
Source File: VOrderedRDDFunctions.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark.Partitioner import org.apache.spark.internal.Logging import org.apache.spark.util.collection.CompactBuffer import scala.reflect.ClassTag class VOrderedRDDFunctions[K, V](self: RDD[(K, V)]) (implicit kt: ClassTag[K], vt: ClassTag[V], ord: Ordering[K]) extends Logging with Serializable { def groupByKeyUsingSort(partitioner: Partitioner): RDD[(K, Iterable[V])] = { self.repartitionAndSortWithinPartitions(partitioner) .mapPartitions { (iter: Iterator[(K, V)]) => new Iterator[(K, CompactBuffer[V])] { private var firstElemInNextGroup: (K, V) = null override def hasNext: Boolean = firstElemInNextGroup != null || iter.hasNext override def next(): (K, CompactBuffer[V]) = { if (firstElemInNextGroup == null) { firstElemInNextGroup = iter.next() } val key = firstElemInNextGroup._1 val group = CompactBuffer[V](firstElemInNextGroup._2) firstElemInNextGroup = null var reachNewGroup = false while (iter.hasNext && !reachNewGroup) { val currElem = iter.next() if (currElem._1 == key) { group += currElem._2 } else { firstElemInNextGroup = currElem reachNewGroup = true } } (key, group) } } } } } private[spark] object VOrderedRDDFunctions { implicit def fromRDD[K: ClassTag, V: ClassTag](rdd: RDD[(K, V)])(implicit ord: Ordering[K]): VOrderedRDDFunctions[K, V] = { new VOrderedRDDFunctions(rdd) } }
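groupByKeyUsingSort relies on each partition already being sorted by key, so a group can be emitted as soon as the next key appears instead of buffering the whole partition. The sketch below isolates that grouping step as a plain iterator transformation, outside Spark and with Vector in place of CompactBuffer.

// Plain-iterator sketch of the sort-based grouping used by groupByKeyUsingSort above.
// The input must already be sorted by key.
def groupSorted[K, V](iter: Iterator[(K, V)]): Iterator[(K, Vector[V])] =
  new Iterator[(K, Vector[V])] {
    private var lookahead: Option[(K, V)] = None

    override def hasNext: Boolean = lookahead.isDefined || iter.hasNext

    override def next(): (K, Vector[V]) = {
      val (key, first) = lookahead.getOrElse(iter.next())
      lookahead = None
      var group = Vector(first)
      while (lookahead.isEmpty && iter.hasNext) {
        val elem = iter.next()
        if (elem._1 == key) group :+= elem._2
        else lookahead = Some(elem) // start of the next group
      }
      (key, group)
    }
  }

// e.g. groupSorted(Iterator("a" -> 1, "a" -> 2, "b" -> 3)).toList
//      == List(("a", Vector(1, 2)), ("b", Vector(3)))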
Example 103
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
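Compared with the earlier drizzle-spark version, this OrcFileOperator adds an ignoreCorruptFiles switch: a reader that fails to open is either skipped with a warning or rethrown wrapped. A small sketch of that decision, with println standing in for logWarning and RuntimeException for SparkException, is below.

// Sketch of the ignoreCorruptFiles switch in getFileReader above. Names are illustrative.
import java.io.IOException

def openOrSkip[T](path: String, ignoreCorrupt: Boolean)(open: String => T): Option[T] = {
  try Some(open(path))
  catch {
    case e: IOException if ignoreCorrupt =>
      println(s"Skipping corrupted file: $path ($e)") // Spark logs a warning here
      None
    case e: IOException =>
      throw new RuntimeException(s"Could not read footer for file: $path", e)
  }
}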
Example 104
Source File: FiltersSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { private val shim = new Shim_v0_13 private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test") private val varCharCol = new FieldSchema() varCharCol.setName("varchar") varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME) testTable.setPartCols(Collections.singletonList(varCharCol)) filterTest("string filter", (a("stringcol", StringType) > Literal("test")) :: Nil, "stringcol > \"test\"") filterTest("string filter backwards", (Literal("test") > a("stringcol", StringType)) :: Nil, "\"test\" > stringcol") filterTest("int filter", (a("intcol", IntegerType) === Literal(1)) :: Nil, "intcol = 1") filterTest("int filter backwards", (Literal(1) === a("intcol", IntegerType)) :: Nil, "1 = intcol") filterTest("int and string filter", (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil, "1 = intcol and \"a\" = strcol") filterTest("skip varchar", (Literal("") === a("varchar", StringType)) :: Nil, "") filterTest("SPARK-19912 String literals should be escaped for Hive metastore partition pruning", (a("stringcol", StringType) === Literal("p1\" and q=\"q1")) :: (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") filterTest("SPARK-24879 null literals should be ignored for IN constructs", (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, "(intcol = 1)") // Applying the predicate `x IN (NULL)` should return an empty set, but since this optimization // will be applied by Catalyst, this filter converter does not need to account for this. filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE", (a("intcol", IntegerType) in Literal(null)) :: Nil, "") filterTest("typecast null literals should not be pushed down in simple predicates", (a("intcol", IntegerType) === Literal(null, IntegerType)) :: Nil, "") private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") { val converted = shim.convertFilters(testTable, filters) if (converted != result) { fail(s"Expected ${filters.mkString(",")} to convert to '$result' but got '$converted'") } } } } test("turn on/off ADVANCED_PARTITION_PREDICATE_PUSHDOWN") { import org.apache.spark.sql.catalyst.dsl.expressions._ Seq(true, false).foreach { enabled => withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> enabled.toString) { val filters = (Literal(1) === a("intcol", IntegerType) || Literal(2) === a("intcol", IntegerType)) :: Nil val converted = shim.convertFilters(testTable, filters) if (enabled) { assert(converted == "(1 = intcol or 2 = intcol)") } else { assert(converted.isEmpty) } } } } private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() }
Example 105
Source File: SparkSQLDriver.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 106
Source File: SparkSQLOperationManager.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server import java.util.{Map => JMap} import java.util.concurrent.ConcurrentHashMap import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation} import org.apache.spark.sql.internal.SQLConf private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = new ConcurrentHashMap[SessionHandle, String]() val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { val sqlContext = sessionToContexts.get(parentSession.getSessionHandle) require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val conf = sqlContext.sessionState.conf val hiveSessionState = parentSession.getSessionState setConfMap(conf, hiveSessionState.getOverriddenConfigurations) setConfMap(conf, hiveSessionState.getHiveVariables) val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") operation } def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = { val iterator = confMap.entrySet().iterator() while (iterator.hasNext) { val kv = iterator.next() conf.setConfString(kv.getKey, kv.getValue) } } }
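setConfMap above walks a java.util.Map of Hive overrides with an explicit iterator and pushes each entry into SQLConf. When the target is just a (key, value) setter, scala.collection.JavaConverters expresses the same copy more compactly; the sketch below uses a mutable map in place of SQLConf.

import scala.collection.JavaConverters._

object ConfCopy {
  // Copy every entry of a Java map into an arbitrary (key, value) setter.
  def setConfMap(setter: (String, String) => Unit, confMap: java.util.Map[String, String]): Unit =
    confMap.asScala.foreach { case (k, v) => setter(k, v) }

  def main(args: Array[String]): Unit = {
    val overrides = new java.util.HashMap[String, String]()
    overrides.put("spark.sql.shuffle.partitions", "8")
    val target = scala.collection.mutable.Map.empty[String, String]
    setConfMap((k, v) => target(k) = v, overrides)
    println(target) // Map(spark.sql.shuffle.partitions -> 8)
  }
}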
Example 107
Source File: ThriftServerTab.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 108
Source File: SparkSQLEnv.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils}
import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
import org.apache.spark.util.Utils

private[hive] object SparkSQLEnv extends Logging {
  logDebug("Initializing SparkSQLEnv")

  var sqlContext: SQLContext = _
  var sparkContext: SparkContext = _

  // init() is omitted in this excerpt.

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      sqlContext = null
    }
  }
}
Example 109
Source File: UDTRegistration.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
Example 110
Source File: BoundAttribute.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral, JavaCode} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression { override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" private val accessor: (InternalRow, Int) => Any = InternalRow.getAccessor(dataType) // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (nullable && input.isNullAt(ordinal)) { null } else { accessor(input, ordinal) } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) { val oev = ctx.currentVars(ordinal) ev.isNull = oev.isNull ev.value = oev.value ev.copy(code = oev.code) } else { assert(ctx.INPUT_ROW != null, "INPUT_ROW and currentVars cannot both be null.") val javaType = JavaCode.javaType(dataType) val value = CodeGenerator.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) if (nullable) { ev.copy(code = code""" |boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); |$javaType ${ev.value} = ${ev.isNull} ? | ${CodeGenerator.defaultValue(dataType)} : ($value); """.stripMargin) } else { ev.copy(code = code"$javaType ${ev.value} = $value;", isNull = FalseLiteral) } } } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: AttributeSeq, allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexOf(a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.attrs.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, input(ordinal).nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
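BindReferences above rewrites named attribute references into positional BoundReference nodes so that evaluation can index straight into the input row. The binding step can be sketched over a toy expression type; NamedRef, Bound, and Add below are illustrative and not Catalyst classes.

object BindSketch {
  sealed trait Expr
  final case class NamedRef(name: String) extends Expr
  final case class Bound(ordinal: Int) extends Expr
  final case class Add(left: Expr, right: Expr) extends Expr

  // Rewrite every NamedRef into a positional Bound using the input schema.
  def bind(expr: Expr, schema: Seq[String]): Expr = expr match {
    case NamedRef(name) =>
      val ordinal = schema.indexOf(name)
      if (ordinal == -1) sys.error(s"Couldn't find $name in ${schema.mkString("[", ",", "]")}")
      Bound(ordinal)
    case Add(l, r) => Add(bind(l, schema), bind(r, schema))
    case other => other
  }

  def main(args: Array[String]): Unit = {
    val bound = bind(Add(NamedRef("a"), NamedRef("c")), Seq("a", "b", "c"))
    println(bound) // Add(Bound(0),Bound(2))
  }
}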
Example 111
Source File: CodeGeneratorWithInterpretedFallback.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.sql.internal.SQLConf import org.apache.spark.util.Utils abstract class CodeGeneratorWithInterpretedFallback[IN, OUT] extends Logging { def createObject(in: IN): OUT = { // We are allowed to choose codegen-only or no-codegen modes if under tests. val config = SQLConf.get.getConf(SQLConf.CODEGEN_FACTORY_MODE) val fallbackMode = CodegenObjectFactoryMode.withName(config) fallbackMode match { case CodegenObjectFactoryMode.CODEGEN_ONLY if Utils.isTesting => createCodeGeneratedObject(in) case CodegenObjectFactoryMode.NO_CODEGEN if Utils.isTesting => createInterpretedObject(in) case _ => try { createCodeGeneratedObject(in) } catch { case NonFatal(_) => // We should have already seen the error message in `CodeGenerator` logWarning("Expr codegen error and falling back to interpreter mode") createInterpretedObject(in) } } } protected def createCodeGeneratedObject(in: IN): OUT protected def createInterpretedObject(in: IN): OUT }
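createObject tries the code-generated path first and, on any non-fatal failure, logs a warning and falls back to the interpreted implementation. That try-the-fast-path-then-fall-back shape is reusable on its own; a minimal sketch with the two paths passed in as plain functions.

import scala.util.control.NonFatal

object WithFallback {
  // Prefer `fast`, but fall back to `slow` on any non-fatal failure.
  def apply[IN, OUT](in: IN)(fast: IN => OUT)(slow: IN => OUT): OUT =
    try {
      fast(in)
    } catch {
      case NonFatal(e) =>
        // A real implementation would log the failure before falling back.
        Console.err.println(s"Fast path failed (${e.getMessage}), using fallback")
        slow(in)
    }

  def main(args: Array[String]): Unit = {
    val parsed = WithFallback("42x")(_.toInt)(s => s.takeWhile(_.isDigit).toInt)
    println(parsed) // 42
  }
}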
Example 112
Source File: RuleExecutor.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.rules import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.trees.TreeNode import org.apache.spark.sql.catalyst.util.sideBySide import org.apache.spark.util.Utils object RuleExecutor { protected val queryExecutionMeter = QueryExecutionMetering() def execute(plan: TreeType): TreeType = { var curPlan = plan val queryExecutionMetrics = RuleExecutor.queryExecutionMeter batches.foreach { batch => val batchStartPlan = curPlan var iteration = 1 var lastPlan = curPlan var continue = true // Run until fix point (or the max number of iterations as specified in the strategy. while (continue) { curPlan = batch.rules.foldLeft(curPlan) { case (plan, rule) => val startTime = System.nanoTime() val result = rule(plan) val runTime = System.nanoTime() - startTime if (!result.fastEquals(plan)) { queryExecutionMetrics.incNumEffectiveExecution(rule.ruleName) queryExecutionMetrics.incTimeEffectiveExecutionBy(rule.ruleName, runTime) logTrace( s""" |=== Applying Rule ${rule.ruleName} === |${sideBySide(plan.treeString, result.treeString).mkString("\n")} """.stripMargin) } queryExecutionMetrics.incExecutionTimeBy(rule.ruleName, runTime) queryExecutionMetrics.incNumExecution(rule.ruleName) // Run the structural integrity checker against the plan after each rule. if (!isPlanIntegral(result)) { val message = s"After applying rule ${rule.ruleName} in batch ${batch.name}, " + "the structural integrity of the plan is broken." throw new TreeNodeException(result, message, null) } result } iteration += 1 if (iteration > batch.strategy.maxIterations) { // Only log if this is a rule that is supposed to run more than once. if (iteration != 2) { val message = s"Max iterations (${iteration - 1}) reached for batch ${batch.name}" if (Utils.isTesting) { throw new TreeNodeException(curPlan, message, null) } else { logWarning(message) } } continue = false } if (curPlan.fastEquals(lastPlan)) { logTrace( s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.") continue = false } lastPlan = curPlan } if (!batchStartPlan.fastEquals(curPlan)) { logDebug( s""" |=== Result of Batch ${batch.name} === |${sideBySide(batchStartPlan.treeString, curPlan.treeString).mkString("\n")} """.stripMargin) } else { logTrace(s"Batch ${batch.name} has no effect.") } } curPlan } }
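The executor above applies each batch of rules repeatedly until the plan stops changing or the batch's maximum iteration count is reached. A self-contained sketch of that fixed-point loop, using plain string-rewriting functions in place of tree rules.

object FixedPoint {
  // Apply `rules` repeatedly until the value stops changing or maxIterations is reached.
  def execute[T](start: T, rules: Seq[T => T], maxIterations: Int): T = {
    var current = start
    var iteration = 1
    var continue = true
    while (continue) {
      val next = rules.foldLeft(current)((value, rule) => rule(value))
      if (next == current) {
        continue = false // fixed point reached
      } else if (iteration >= maxIterations) {
        Console.err.println(s"Max iterations ($maxIterations) reached without a fixed point")
        current = next
        continue = false
      } else {
        current = next
        iteration += 1
      }
    }
    current
  }

  def main(args: Array[String]): Unit = {
    // Collapse one adjacent duplicate pair per pass; repeated passes reach "abc".
    val dedup: String => String = s => s.replaceAll("(.)\\1", "$1")
    println(execute("aaabbbccc", Seq(dedup), maxIterations = 10)) // abc
  }
}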
Example 113
Source File: ParseMode.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import java.util.Locale

import org.apache.spark.internal.Logging

sealed trait ParseMode {
  /**
   * String name of the parse mode.
   */
  def name: String
}

case object PermissiveMode extends ParseMode { val name: String = "PERMISSIVE" }

case object DropMalformedMode extends ParseMode { val name: String = "DROPMALFORMED" }

case object FailFastMode extends ParseMode { val name: String = "FAILFAST" }

object ParseMode extends Logging {
  /**
   * Returns the parse mode from the given string.
   */
  def fromString(mode: String): ParseMode = mode.toUpperCase(Locale.ROOT) match {
    case PermissiveMode.name => PermissiveMode
    case DropMalformedMode.name => DropMalformedMode
    case FailFastMode.name => FailFastMode
    case _ =>
      logWarning(s"$mode is not a valid parse mode. Using ${PermissiveMode.name}.")
      PermissiveMode
  }
}
Example 114
Source File: DataSourceV2Utils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import java.util.regex.Pattern import org.apache.spark.internal.Logging import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.v2.{DataSourceV2, SessionConfigSupport} private[sql] object DataSourceV2Utils extends Logging { def extractSessionConfigs(ds: DataSourceV2, conf: SQLConf): Map[String, String] = ds match { case cs: SessionConfigSupport => val keyPrefix = cs.keyPrefix() require(keyPrefix != null, "The data source config key prefix can't be null.") val pattern = Pattern.compile(s"^spark\\.datasource\\.$keyPrefix\\.(.+)") conf.getAllConfs.flatMap { case (key, value) => val m = pattern.matcher(key) if (m.matches() && m.groupCount() > 0) { Seq((m.group(1), value)) } else { Seq.empty } } case _ => Map.empty } }
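extractSessionConfigs selects every session config of the form spark.datasource.&lt;prefix&gt;.&lt;key&gt; with a compiled regex and strips the prefix. The prefix-stripping step in isolation looks like the sketch below; the config names used in the demo are made up.

import java.util.regex.Pattern

object PrefixedConfigs {
  // Keep only keys matching "<prefix>.<rest>" and strip the prefix.
  def extract(prefix: String, allConfs: Map[String, String]): Map[String, String] = {
    val pattern = Pattern.compile(s"^${Pattern.quote(prefix)}\\.(.+)")
    allConfs.flatMap { case (key, value) =>
      val m = pattern.matcher(key)
      if (m.matches() && m.groupCount() > 0) Some(m.group(1) -> value) else None
    }
  }

  def main(args: Array[String]): Unit = {
    val confs = Map(
      "spark.datasource.mysource.url" -> "http://example.invalid",
      "spark.sql.shuffle.partitions" -> "8")
    println(extract("spark.datasource.mysource", confs))
    // Map(url -> http://example.invalid)
  }
}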
Example 115
Source File: DriverRegistry.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import java.sql.{Driver, DriverManager}

import scala.collection.mutable

import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

object DriverRegistry extends Logging {

  // Load DriverManager first to avoid any race condition between the
  // DriverManager static initialization block and a specific driver class's
  // static initialization block, e.g. PhoenixDriver.
  DriverManager.getDrivers

  private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty

  def register(className: String): Unit = {
    val cls = Utils.getContextOrSparkClassLoader.loadClass(className)
    if (cls.getClassLoader == null) {
      logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required")
    } else if (wrapperMap.get(className).isDefined) {
      logTrace(s"Wrapper for $className already exists")
    } else {
      synchronized {
        if (wrapperMap.get(className).isEmpty) {
          val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver])
          DriverManager.registerDriver(wrapper)
          wrapperMap(className) = wrapper
          logTrace(s"Wrapper for $className registered")
        }
      }
    }
  }
}
Example 116
Source File: SQLHadoopMapReduceCommitProtocol.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) if (clazz != null) { logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat // has an associated output committer. To override this output committer, // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. // If a data source needs to override the output committer, it needs to set the // output committer in prepareForWrite method. if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) committer = ctor.newInstance(new Path(path), context) } else { // The specified output committer is just an OutputCommitter. // So, we will use the no-argument constructor. val ctor = clazz.getDeclaredConstructor() committer = ctor.newInstance() } } logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } }
Example 117
Source File: BasicWriteStatsTracker.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.io.FileNotFoundException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.{SparkContext, TaskContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.SQLExecution import org.apache.spark.sql.execution.metric.{SQLMetric, SQLMetrics} import org.apache.spark.util.SerializableConfiguration class BasicWriteJobStatsTracker( serializableHadoopConf: SerializableConfiguration, @transient val metrics: Map[String, SQLMetric]) extends WriteJobStatsTracker { override def newTaskInstance(): WriteTaskStatsTracker = { new BasicWriteTaskStatsTracker(serializableHadoopConf.value) } override def processStats(stats: Seq[WriteTaskStats]): Unit = { val sparkContext = SparkContext.getActive.get var numPartitions: Long = 0L var numFiles: Long = 0L var totalNumBytes: Long = 0L var totalNumOutput: Long = 0L val basicStats = stats.map(_.asInstanceOf[BasicWriteTaskStats]) basicStats.foreach { summary => numPartitions += summary.numPartitions numFiles += summary.numFiles totalNumBytes += summary.numBytes totalNumOutput += summary.numRows } metrics(BasicWriteJobStatsTracker.NUM_FILES_KEY).add(numFiles) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_BYTES_KEY).add(totalNumBytes) metrics(BasicWriteJobStatsTracker.NUM_OUTPUT_ROWS_KEY).add(totalNumOutput) metrics(BasicWriteJobStatsTracker.NUM_PARTS_KEY).add(numPartitions) val executionId = sparkContext.getLocalProperty(SQLExecution.EXECUTION_ID_KEY) SQLMetrics.postDriverMetricUpdates(sparkContext, executionId, metrics.values.toList) } } object BasicWriteJobStatsTracker { private val NUM_FILES_KEY = "numFiles" private val NUM_OUTPUT_BYTES_KEY = "numOutputBytes" private val NUM_OUTPUT_ROWS_KEY = "numOutputRows" private val NUM_PARTS_KEY = "numParts" def metrics: Map[String, SQLMetric] = { val sparkContext = SparkContext.getActive.get Map( NUM_FILES_KEY -> SQLMetrics.createMetric(sparkContext, "number of written files"), NUM_OUTPUT_BYTES_KEY -> SQLMetrics.createMetric(sparkContext, "bytes of written output"), NUM_OUTPUT_ROWS_KEY -> SQLMetrics.createMetric(sparkContext, "number of output rows"), NUM_PARTS_KEY -> SQLMetrics.createMetric(sparkContext, "number of dynamic part") ) } }
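processStats folds the per-task write statistics into job-level totals before publishing them as SQL metrics. The accumulation itself is an ordinary fold over a stats record; TaskStats below is a hypothetical simplification of BasicWriteTaskStats.

object WriteStats {
  final case class TaskStats(numPartitions: Long, numFiles: Long, numBytes: Long, numRows: Long)

  // Sum the per-task stats into one job-level summary.
  def aggregate(stats: Seq[TaskStats]): TaskStats =
    stats.foldLeft(TaskStats(0L, 0L, 0L, 0L)) { (total, s) =>
      TaskStats(
        total.numPartitions + s.numPartitions,
        total.numFiles + s.numFiles,
        total.numBytes + s.numBytes,
        total.numRows + s.numRows)
    }

  def main(args: Array[String]): Unit = {
    val perTask = Seq(TaskStats(1, 2, 1024, 100), TaskStats(1, 3, 2048, 250))
    println(aggregate(perTask)) // TaskStats(2,5,3072,350)
  }
}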
Example 118
Source File: FrequentItems.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
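singlePassFreqItems keeps one bounded counter map per column (FreqItemCounter) and merges those maps across partitions with treeAggregate. The counter follows the usual "at most k counters, decrement when full" heuristic; FreqCounter below is an illustrative Misra-Gries-style sketch of that add/merge behaviour, not Spark's implementation.

import scala.collection.mutable

// Approximate heavy hitters: keep at most `capacity` counters.
final class FreqCounter[T](capacity: Int) {
  val counts: mutable.Map[T, Long] = mutable.Map.empty

  def add(key: T, inc: Long = 1L): this.type = {
    if (counts.contains(key)) {
      counts(key) += inc
    } else if (counts.size < capacity) {
      counts(key) = inc
    } else {
      // No free slot: decrement everybody and drop counters that reach zero.
      counts.keys.toSeq.foreach { k =>
        val v = counts(k) - inc
        if (v <= 0) counts.remove(k) else counts(k) = v
      }
    }
    this
  }

  def merge(other: FreqCounter[T]): this.type = {
    other.counts.foreach { case (k, v) => add(k, v) }
    this
  }
}

object FreqCounterDemo {
  def main(args: Array[String]): Unit = {
    val c = new FreqCounter[String](capacity = 2)
    Seq("a", "a", "b", "a", "c", "a", "b").foreach(c.add(_))
    println(c.counts) // "a" survives as the dominant item
  }
}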
Example 119
Source File: CompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 120
Source File: MetricsReporter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.text.SimpleDateFormat import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.{Source => CodahaleSource} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.streaming.StreamingQueryProgress class MetricsReporter( stream: StreamExecution, override val sourceName: String) extends CodahaleSource with Logging { override val metricRegistry: MetricRegistry = new MetricRegistry // Metric names should not have . in them, so that all the metrics of a query are identified // together in Ganglia as a single metric group registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0) registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0) registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L) private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601 timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC")) registerGauge("eventTime-watermark", progress => convertStringDateToMillis(progress.eventTime.get("watermark")), 0L) registerGauge("states-rowsTotal", _.stateOperators.map(_.numRowsTotal).sum, 0L) registerGauge("states-usedBytes", _.stateOperators.map(_.memoryUsedBytes).sum, 0L) private def convertStringDateToMillis(isoUtcDateStr: String) = { if (isoUtcDateStr != null) { timestampFormat.parse(isoUtcDateStr).getTime } else { 0L } } private def registerGauge[T]( name: String, f: StreamingQueryProgress => T, default: T): Unit = { synchronized { metricRegistry.register(name, new Gauge[T] { override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default) }) } } }
Example 121
Source File: FileStreamSink.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormat, FileFormatWriter} import org.apache.spark.util.SerializableConfiguration object FileStreamSink extends Logging { // The name of the subdirectory that is used to store metadata about which files are valid. val metadataDir = "_spark_metadata" class FileStreamSink( sparkSession: SparkSession, path: String, fileFormat: FileFormat, partitionColumnNames: Seq[String], options: Map[String, String]) extends Sink with Logging { private val basePath = new Path(path) private val logPath = new Path(basePath, FileStreamSink.metadataDir) private val fileLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, sparkSession, logPath.toUri.toString) private val hadoopConf = sparkSession.sessionState.newHadoopConf() private def basicWriteJobStatsTracker: BasicWriteJobStatsTracker = { val serializableHadoopConf = new SerializableConfiguration(hadoopConf) new BasicWriteJobStatsTracker(serializableHadoopConf, BasicWriteJobStatsTracker.metrics) } override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= fileLog.getLatest().map(_._1).getOrElse(-1L)) { logInfo(s"Skipping already committed batch $batchId") } else { val committer = FileCommitProtocol.instantiate( className = sparkSession.sessionState.conf.streamingFileCommitProtocolClass, jobId = batchId.toString, outputPath = path) committer match { case manifestCommitter: ManifestFileCommitProtocol => manifestCommitter.setupManifestOptions(fileLog, batchId) case _ => // Do nothing } // Get the actual partition columns as attributes after matching them by name with // the given columns names. val partitionColumns: Seq[Attribute] = partitionColumnNames.map { col => val nameEquality = data.sparkSession.sessionState.conf.resolver data.logicalPlan.output.find(f => nameEquality(f.name, col)).getOrElse { throw new RuntimeException(s"Partition column $col not found in schema ${data.schema}") } } val qe = data.queryExecution FileFormatWriter.write( sparkSession = sparkSession, plan = qe.executedPlan, fileFormat = fileFormat, committer = committer, outputSpec = FileFormatWriter.OutputSpec(path, Map.empty, qe.analyzed.output), hadoopConf = hadoopConf, partitionColumns = partitionColumns, bucketSpec = None, statsTrackers = Seq(basicWriteJobStatsTracker), options = options) } } override def toString: String = s"FileSink[$path]" }
Example 122
Source File: StateStoreCoordinator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.state import java.util.UUID import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.util.RpcUtils private class StateStoreCoordinator(override val rpcEnv: RpcEnv) extends ThreadSafeRpcEndpoint with Logging { private val instances = new mutable.HashMap[StateStoreProviderId, ExecutorCacheTaskLocation] override def receive: PartialFunction[Any, Unit] = { case ReportActiveInstance(id, host, executorId) => logDebug(s"Reported state store $id is active at $executorId") instances.put(id, ExecutorCacheTaskLocation(host, executorId)) } override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case VerifyIfInstanceActive(id, execId) => val response = instances.get(id) match { case Some(location) => location.executorId == execId case None => false } logDebug(s"Verified that state store $id is active: $response") context.reply(response) case GetLocation(id) => val executorId = instances.get(id).map(_.toString) logDebug(s"Got location of the state store $id: $executorId") context.reply(executorId) case DeactivateInstances(runId) => val storeIdsToRemove = instances.keys.filter(_.queryRunId == runId).toSeq instances --= storeIdsToRemove logDebug(s"Deactivating instances related to checkpoint location $runId: " + storeIdsToRemove.mkString(", ")) context.reply(true) case StopCoordinator => stop() // Stop before replying to ensure that endpoint name has been deregistered logInfo("StateStoreCoordinator stopped") context.reply(true) } }
Example 123
Source File: FileStreamOptions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import scala.util.Try

import org.apache.spark.internal.Logging
import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.util.Utils

class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging {

  // Other options defined by the class are omitted in this excerpt.

  val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false)

  private def withBooleanParameter(name: String, default: Boolean) = {
    parameters.get(name).map { str =>
      try {
        str.toBoolean
      } catch {
        case _: IllegalArgumentException =>
          throw new IllegalArgumentException(
            s"Invalid value '$str' for option '$name', must be 'true' or 'false'")
      }
    }.getOrElse(default)
  }
}
Example 124
Source File: RPCContinuousShuffleReader.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous.shuffle import java.util.concurrent._ import java.util.concurrent.atomic.AtomicBoolean import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.util.NextIterator override def getNext(): UnsafeRow = { var nextRow: UnsafeRow = null while (!finished && nextRow == null) { completion.poll(epochIntervalMs, TimeUnit.MILLISECONDS) match { case null => // Try again if the poll didn't wait long enough to get a real result. // But we should be getting at least an epoch marker every checkpoint interval. val writerIdsUncommitted = writerEpochMarkersReceived.zipWithIndex.collect { case (flag, idx) if !flag => idx } logWarning( s"Completion service failed to make progress after $epochIntervalMs ms. Waiting " + s"for writers ${writerIdsUncommitted.mkString(",")} to send epoch markers.") // The completion service guarantees this future will be available immediately. case future => future.get() match { case ReceiverRow(writerId, r) => // Start reading the next element in the queue we just took from. completion.submit(completionTask(writerId)) nextRow = r case ReceiverEpochMarker(writerId) => // Don't read any more from this queue. If all the writers have sent epoch markers, // the epoch is over; otherwise we need to loop again to poll from the remaining // writers. writerEpochMarkersReceived(writerId) = true if (writerEpochMarkersReceived.forall(_ == true)) { finished = true } } } } nextRow } override def close(): Unit = { executor.shutdownNow() } } } }
Example 125
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 126
Source File: StreamMetadata.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import java.util.ConcurrentModificationException import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileAlreadyExistsException, FSDataInputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.execution.streaming.CheckpointFileManager.CancellableFSDataOutputStream import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: CancellableFSDataOutputStream = null try { val fileManager = CheckpointFileManager.create(metadataFile.getParent, hadoopConf) output = fileManager.createAtomic(metadataFile, overwriteIfPossible = false) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case e: FileAlreadyExistsException => if (output != null) { output.cancel() } throw new ConcurrentModificationException( s"Multiple streaming queries are concurrently using $metadataFile", e) case e: Throwable => if (output != null) { output.cancel() } logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } } }
Example 127
Source File: ConsoleWriter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.internal.Logging import org.apache.spark.sql.{Dataset, SparkSession} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get def createWriterFactory(): DataWriterFactory[InternalRow] = PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 // behavior. printRows(messages, schema, s"Batch: $epochId") } def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println println("-------------------------------------------") println(printMessage) println("-------------------------------------------") // scalastyle:off println Dataset.ofRows(spark, LocalRelation(schema.toAttributes, rows)) .show(numRowsToShow, isTruncated) } override def toString(): String = { s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } }
Example 128
Source File: ManifestFileCommitProtocol.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Example 129
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers} import org.scalatest.concurrent.TimeLimits import org.apache.spark.{SparkFunSuite, TestUtils} import org.apache.spark.deploy.SparkSubmitSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession} import org.apache.spark.sql.functions.{array, col, count, lit} import org.apache.spark.sql.types.IntegerType import org.apache.spark.unsafe.Platform import org.apache.spark.util.ResetSystemProperties // Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit. class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite with Matchers with BeforeAndAfterEach with ResetSystemProperties { test("Generated code on driver should not embed platform-specific constant") { val unusedJar = TestUtils.createJarWithClasses(Seq.empty) // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched // settings of UseCompressedOops JVM option. val argsForSparkSubmit = Seq( "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"), "--master", "local-cluster[1,1,1024]", "--driver-memory", "1g", "--conf", "spark.ui.enabled=false", "--conf", "spark.master.rest.enabled=false", "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops", "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops", unusedJar.toString) SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..") } } object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging { var spark: SparkSession = _ def main(args: Array[String]): Unit = { TestUtils.configTestLog4j("INFO") spark = SparkSession.builder().getOrCreate() // Make sure the test is run where the driver and the executors uses different object layouts val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET val executorArrayHeaderSize = spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt assert(driverArrayHeaderSize > executorArrayHeaderSize) val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v") .groupBy(array(col("v"))).agg(count(col("*"))) val plan = df.queryExecution.executedPlan assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined) val expectedAnswer = Row(Array(0), 7178) :: Row(Array(1), 7178) :: Row(Array(2), 7178) :: Row(Array(3), 7177) :: Row(Array(4), 7177) :: Row(Array(5), 7177) :: Row(Array(6), 7177) :: Row(Array(7), 7177) :: Row(Array(8), 7177) :: Row(Array(9), 7177) :: Nil val result = df.collect QueryTest.sameRows(result.toSeq, expectedAnswer) match { case Some(errMsg) => fail(errMsg) case _ => } } }
Example 130
Source File: DruidClient.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.druid import com.ning.http.client.{ AsyncCompletionHandler, AsyncHttpClient, AsyncHttpClientConfig, Response } import org.json4s._ import org.json4s.jackson._ import org.json4s.jackson.JsonMethods._ import scala.concurrent.{ExecutionContext, Future, Promise} import scala.util.{Failure, Success} import org.apache.spark.internal.Logging def descTable(datasouceName: String): Seq[(String, Any)] = { val future = execute(DescTableRequest(datasouceName).toJson, DescTableResponse.parse) var data: Seq[(String, Any)] = null future.onComplete { case Success(resp) => data = resp.data case Failure(ex) => ex.printStackTrace() } while (!future.isCompleted) { Thread.sleep(500) } data } def close(): Unit = { client.close() } }
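descTable above polls future.isCompleted in a sleep loop and reads a var set from onComplete, which can race because the callback may not have run yet when the loop exits. A common alternative is to block with Await.result and an explicit timeout; the sketch below shows that shape, with a fake describe call standing in for the project's HTTP request.

import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

object BlockingFutureSketch {
  // Hypothetical stand-in for the asynchronous HTTP call made by DruidClient.execute.
  def describe(datasource: String): Future[Seq[(String, Any)]] =
    Future(Seq("column" -> "value"))

  def descTable(datasource: String): Seq[(String, Any)] =
    // Block the calling thread, but bound the wait and fail loudly on timeout.
    Await.result(describe(datasource), 30.seconds)

  def main(args: Array[String]): Unit =
    println(descTable("wikipedia"))
}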
Example 131
Source File: MesosClusterPersistenceEngine.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import scala.collection.JavaConverters._ import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.zookeeper.KeeperException.NoNodeException import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] class ZookeeperMesosClusterPersistenceEngine( baseDir: String, zk: CuratorFramework, conf: SparkConf) extends MesosClusterPersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark_mesos_dispatcher") + "/" + baseDir SparkCuratorUtil.mkdir(zk, WORKING_DIR) def path(name: String): String = { WORKING_DIR + "/" + name } override def expunge(name: String): Unit = { zk.delete().forPath(path(name)) } override def persist(name: String, obj: Object): Unit = { val serialized = Utils.serialize(obj) val zkPath = path(name) zk.create().withMode(CreateMode.PERSISTENT).forPath(zkPath, serialized) } override def fetch[T](name: String): Option[T] = { val zkPath = path(name) try { val fileData = zk.getData().forPath(zkPath) Some(Utils.deserialize[T](fileData)) } catch { case e: NoNodeException => None case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(zkPath) None } } override def fetchAll[T](): Iterable[T] = { zk.getChildren.forPath(WORKING_DIR).asScala.flatMap(fetch[T]) } }
Example 132
Source File: YARNHadoopDelegationTokenManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.security.Credentials import org.apache.spark.SparkConf import org.apache.spark.deploy.security.HadoopDelegationTokenManager import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def obtainDelegationTokens(hadoopConf: Configuration, creds: Credentials): Long = { val superInterval = delegationTokenManager.obtainDelegationTokens(hadoopConf, creds) credentialProviders.values.flatMap { provider => if (provider.credentialsRequired(hadoopConf)) { provider.obtainCredentials(hadoopConf, sparkConf, creds) } else { logDebug(s"Service ${provider.serviceName} does not require a token." + s" Check your configuration to see if security is disabled or not.") None } }.foldLeft(superInterval)(math.min) } private def getCredentialProviders: Map[String, ServiceCredentialProvider] = { val providers = loadCredentialProviders providers. filter { p => delegationTokenManager.isServiceEnabled(p.serviceName) } .map { p => (p.serviceName, p) } .toMap } private def loadCredentialProviders: List[ServiceCredentialProvider] = { ServiceLoader.load(classOf[ServiceCredentialProvider], Utils.getContextOrSparkClassLoader) .asScala .toList } }
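loadCredentialProviders discovers ServiceCredentialProvider implementations through java.util.ServiceLoader, so new providers only need a classpath entry. A generic sketch of that discovery with a hypothetical Plugin trait; a real setup also needs a META-INF/services file named after the trait, listing each implementation class.

import java.util.ServiceLoader

import scala.collection.JavaConverters._

// Hypothetical plugin interface; implementations are listed in
// META-INF/services/<fully.qualified.Plugin> on the classpath.
trait Plugin {
  def name: String
}

object PluginLoader {
  def loadPlugins(
      loader: ClassLoader = Thread.currentThread().getContextClassLoader): List[Plugin] =
    ServiceLoader.load(classOf[Plugin], loader).asScala.toList

  def main(args: Array[String]): Unit =
    // Prints the plugins found on the classpath (empty if none are registered).
    loadPlugins().foreach(p => println(p.name))
}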
Example 133
Source File: YarnProxyRedirectFilter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import javax.servlet._ import javax.servlet.http.{HttpServletRequest, HttpServletResponse} import org.apache.spark.internal.Logging class YarnProxyRedirectFilter extends Filter with Logging { import YarnProxyRedirectFilter._ override def destroy(): Unit = { } override def init(config: FilterConfig): Unit = { } override def doFilter(req: ServletRequest, res: ServletResponse, chain: FilterChain): Unit = { val hreq = req.asInstanceOf[HttpServletRequest] // The YARN proxy will send a request with the "proxy-user" cookie set to the YARN's client // user name. We don't expect any other clients to set this cookie, since the SHS does not // use cookies for anything. Option(hreq.getCookies()).flatMap(_.find(_.getName() == COOKIE_NAME)) match { case Some(_) => doRedirect(hreq, res.asInstanceOf[HttpServletResponse]) case _ => chain.doFilter(req, res) } } private def doRedirect(req: HttpServletRequest, res: HttpServletResponse): Unit = { val redirect = req.getRequestURL().toString() // Need a client-side redirect instead of an HTTP one, otherwise the YARN proxy itself // will handle the redirect and get into an infinite loop. val content = s""" |<html xmlns="http://www.w3.org/1999/xhtml"> |<head> | <title>Spark History Server Redirect</title> | <meta http-equiv="refresh" content="0;URL='$redirect'" /> |</head> |<body> | <p>The requested page can be found at: <a href="$redirect">$redirect</a>.</p> |</body> |</html> """.stripMargin logDebug(s"Redirecting YARN proxy request to $redirect.") res.setStatus(HttpServletResponse.SC_OK) res.setContentType("text/html") res.getWriter().write(content) } } private[spark] object YarnProxyRedirectFilter { val COOKIE_NAME = "proxy-user" }
Example 134
Source File: YarnRMClient.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.JavaConverters._ import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.client.api.AMRMClient import org.apache.hadoop.yarn.client.api.AMRMClient.ContainerRequest import org.apache.hadoop.yarn.conf.YarnConfiguration import org.apache.hadoop.yarn.webapp.util.WebAppUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEndpointRef import org.apache.spark.util.Utils def getMaxRegAttempts(sparkConf: SparkConf, yarnConf: YarnConfiguration): Int = { val sparkMaxAttempts = sparkConf.get(MAX_APP_ATTEMPTS).map(_.toInt) val yarnMaxAttempts = yarnConf.getInt( YarnConfiguration.RM_AM_MAX_ATTEMPTS, YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS) sparkMaxAttempts match { case Some(x) => if (x <= yarnMaxAttempts) x else yarnMaxAttempts case None => yarnMaxAttempts } } }
Example 135
Source File: ExtensionServiceIntegrationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.scalatest.BeforeAndAfter import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging before { val sparkConf = new SparkConf() sparkConf.set(SCHEDULER_SERVICES, Seq(classOf[SimpleExtensionService].getName())) sparkConf.setMaster("local").setAppName("ExtensionServiceIntegrationSuite") sc = new SparkContext(sparkConf) } test("Instantiate") { val services = new SchedulerExtensionServices() assertResult(Nil, "non-nil service list") { services.getServices } services.start(SchedulerExtensionServiceBinding(sc, applicationId)) services.stop() } test("Contains SimpleExtensionService Service") { val services = new SchedulerExtensionServices() try { services.start(SchedulerExtensionServiceBinding(sc, applicationId)) val serviceList = services.getServices assert(serviceList.nonEmpty, "empty service list") val (service :: Nil) = serviceList val simpleService = service.asInstanceOf[SimpleExtensionService] assert(simpleService.started.get, "service not started") services.stop() assert(!simpleService.started.get, "service not stopped") } finally { services.stop() } } }
Example 136
Source File: EventTransformer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.io.{ObjectInput, ObjectOutput} import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[streaming] object EventTransformer extends Logging { def readExternal(in: ObjectInput): (java.util.HashMap[CharSequence, CharSequence], Array[Byte]) = { val bodyLength = in.readInt() val bodyBuff = new Array[Byte](bodyLength) in.readFully(bodyBuff) val numHeaders = in.readInt() val headers = new java.util.HashMap[CharSequence, CharSequence] for (i <- 0 until numHeaders) { val keyLength = in.readInt() val keyBuff = new Array[Byte](keyLength) in.readFully(keyBuff) val key: String = Utils.deserialize(keyBuff) val valLength = in.readInt() val valBuff = new Array[Byte](valLength) in.readFully(valBuff) val value: String = Utils.deserialize(valBuff) headers.put(key, value) } (headers, bodyBuff) } def writeExternal(out: ObjectOutput, headers: java.util.Map[CharSequence, CharSequence], body: Array[Byte]) { out.writeInt(body.length) out.write(body) val numHeaders = headers.size() out.writeInt(numHeaders) for ((k, v) <- headers.asScala) { val keyBuff = Utils.serialize(k.toString) out.writeInt(keyBuff.length) out.write(keyBuff) val valBuff = Utils.serialize(v.toString) out.writeInt(valBuff.length) out.write(valBuff) } } }
Example 137
Source File: FlumeStreamSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.flume import java.util.concurrent.ConcurrentLinkedQueue import scala.collection.JavaConverters._ import scala.concurrent.duration._ import scala.language.postfixOps import org.jboss.netty.channel.ChannelPipeline import org.jboss.netty.channel.socket.SocketChannel import org.jboss.netty.channel.socket.nio.NioClientSocketChannelFactory import org.jboss.netty.handler.codec.compression._ import org.scalatest.{BeforeAndAfter, Matchers} import org.scalatest.concurrent.Eventually._ import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.internal.Logging import org.apache.spark.network.util.JavaUtils import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Milliseconds, StreamingContext, TestOutputStream} class FlumeStreamSuite extends SparkFunSuite with BeforeAndAfter with Matchers with Logging { val conf = new SparkConf().setMaster("local[4]").setAppName("FlumeStreamSuite") var ssc: StreamingContext = null test("flume input stream") { testFlumeStream(testCompression = false) } test("flume input compressed stream") { testFlumeStream(testCompression = true) } private class CompressionChannelFactory(compressionLevel: Int) extends NioClientSocketChannelFactory { override def newChannel(pipeline: ChannelPipeline): SocketChannel = { val encoder = new ZlibEncoder(compressionLevel) pipeline.addFirst("deflater", encoder) pipeline.addFirst("inflater", new ZlibDecoder()) super.newChannel(pipeline) } } }
Example 138
Source File: CachedKafkaProducer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import java.util.concurrent.{ConcurrentMap, ExecutionException, TimeUnit} import com.google.common.cache._ import com.google.common.util.concurrent.{ExecutionError, UncheckedExecutionException} import org.apache.kafka.clients.producer.KafkaProducer import scala.collection.JavaConverters._ import scala.util.control.NonFatal import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging private[kafka010] object CachedKafkaProducer extends Logging { private type Producer = KafkaProducer[Array[Byte], Array[Byte]] private lazy val cacheExpireTimeout: Long = SparkEnv.get.conf.getTimeAsMs("spark.kafka.producer.cache.timeout", "10m") private val cacheLoader = new CacheLoader[Seq[(String, Object)], Producer] { override def load(config: Seq[(String, Object)]): Producer = { val configMap = config.map(x => x._1 -> x._2).toMap.asJava createKafkaProducer(configMap) } } private val removalListener = new RemovalListener[Seq[(String, Object)], Producer]() { override def onRemoval( notification: RemovalNotification[Seq[(String, Object)], Producer]): Unit = { val paramsSeq: Seq[(String, Object)] = notification.getKey val producer: Producer = notification.getValue logDebug( s"Evicting kafka producer $producer params: $paramsSeq, due to ${notification.getCause}") close(paramsSeq, producer) } } private lazy val guavaCache: LoadingCache[Seq[(String, Object)], Producer] = CacheBuilder.newBuilder().expireAfterAccess(cacheExpireTimeout, TimeUnit.MILLISECONDS) .removalListener(removalListener) .build[Seq[(String, Object)], Producer](cacheLoader) private def createKafkaProducer(producerConfiguration: ju.Map[String, Object]): Producer = { val kafkaProducer: Producer = new Producer(producerConfiguration) logDebug(s"Created a new instance of KafkaProducer for $producerConfiguration.") kafkaProducer } private def close(paramsSeq: Seq[(String, Object)], producer: Producer): Unit = { try { logInfo(s"Closing the KafkaProducer with params: ${paramsSeq.mkString("\n")}.") producer.close() } catch { case NonFatal(e) => logWarning("Error while closing kafka producer.", e) } } private def clear(): Unit = { logInfo("Cleaning up guava cache.") guavaCache.invalidateAll() } // Intended for testing purpose only. private def getAsMap: ConcurrentMap[Seq[(String, Object)], Producer] = guavaCache.asMap() }
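The producer cache above is a Guava LoadingCache keyed by the sorted Kafka parameters, with expire-after-access eviction and a removal listener that closes evicted producers. The caching shell, separated from Kafka, can be sketched as below (assumes Guava on the classpath; Resource is a stand-in for KafkaProducer).

import java.util.concurrent.TimeUnit

import com.google.common.cache.{CacheBuilder, CacheLoader, LoadingCache, RemovalListener, RemovalNotification}

object ExpiringResourceCache {
  // Stand-in for an expensive, closeable resource such as a KafkaProducer.
  final class Resource(val key: String) {
    def close(): Unit = println(s"closing resource for $key")
  }

  private val loader = new CacheLoader[String, Resource] {
    override def load(key: String): Resource = new Resource(key)
  }

  private val removalListener = new RemovalListener[String, Resource] {
    override def onRemoval(n: RemovalNotification[String, Resource]): Unit =
      n.getValue.close() // release the resource when its entry is evicted
  }

  val cache: LoadingCache[String, Resource] =
    CacheBuilder.newBuilder()
      .expireAfterAccess(10, TimeUnit.MINUTES)
      .removalListener(removalListener)
      .build[String, Resource](loader)

  def main(args: Array[String]): Unit = {
    val r = cache.get("broker-a")      // created on first access
    assert(r eq cache.get("broker-a")) // served from the cache afterwards
    cache.invalidateAll()              // evicted entries are passed to the removal listener
  }
}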
Example 139
Source File: KafkaWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SparkSession} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} import org.apache.spark.sql.types.{BinaryType, StringType} import org.apache.spark.util.Utils private[kafka010] object KafkaWriter extends Logging { val TOPIC_ATTRIBUTE_NAME: String = "topic" val KEY_ATTRIBUTE_NAME: String = "key" val VALUE_ATTRIBUTE_NAME: String = "value" override def toString: String = "KafkaWriter" def validateQuery( schema: Seq[Attribute], kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { schema.find(_.name == TOPIC_ATTRIBUTE_NAME).getOrElse( if (topic.isEmpty) { throw new AnalysisException(s"topic option required when no " + s"'$TOPIC_ATTRIBUTE_NAME' attribute is present. Use the " + s"${KafkaSourceProvider.TOPIC_OPTION_KEY} option for setting a topic.") } else { Literal(topic.get, StringType) } ).dataType match { case StringType => // good case _ => throw new AnalysisException(s"Topic type must be a String") } schema.find(_.name == KEY_ATTRIBUTE_NAME).getOrElse( Literal(null, StringType) ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$KEY_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } schema.find(_.name == VALUE_ATTRIBUTE_NAME).getOrElse( throw new AnalysisException(s"Required attribute '$VALUE_ATTRIBUTE_NAME' not found") ).dataType match { case StringType | BinaryType => // good case _ => throw new AnalysisException(s"$VALUE_ATTRIBUTE_NAME attribute type " + s"must be a String or BinaryType") } } def write( sparkSession: SparkSession, queryExecution: QueryExecution, kafkaParameters: ju.Map[String, Object], topic: Option[String] = None): Unit = { val schema = queryExecution.analyzed.output validateQuery(schema, kafkaParameters, topic) queryExecution.toRdd.foreachPartition { iter => val writeTask = new KafkaWriteTask(kafkaParameters, schema, topic) Utils.tryWithSafeFinally(block = writeTask.execute(iter))( finallyBlock = writeTask.close()) } } }
Example 140
Source File: KafkaSink.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.execution.streaming.Sink private[kafka010] class KafkaSink( sqlContext: SQLContext, executorKafkaParams: ju.Map[String, Object], topic: Option[String]) extends Sink with Logging { @volatile private var latestBatchId = -1L override def toString(): String = "KafkaSink" override def addBatch(batchId: Long, data: DataFrame): Unit = { if (batchId <= latestBatchId) { logInfo(s"Skipping already committed batch $batchId") } else { KafkaWriter.write(sqlContext.sparkSession, data.queryExecution, executorKafkaParams, topic) latestBatchId = batchId } } }
Example 141
Source File: Signaling.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.repl import org.apache.spark.SparkContext import org.apache.spark.internal.Logging import org.apache.spark.util.SignalUtils private[repl] object Signaling extends Logging { def cancelOnInterrupt(): Unit = SignalUtils.register("INT") { SparkContext.getActive.map { ctx => if (!ctx.statusTracker.getActiveJobIds().isEmpty) { logWarning("Cancelling all active jobs, this can take a while. " + "Press Ctrl+C again to exit now.") ctx.cancelAllJobs() true } else { false } }.getOrElse(false) } }
Example 142
Source File: FiltersSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.client import java.util.Collections import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.hadoop.hive.serde.serdeConstants import org.apache.spark.SparkFunSuite import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { private val shim = new Shim_v0_13 private val testTable = new org.apache.hadoop.hive.ql.metadata.Table("default", "test") private val varCharCol = new FieldSchema() varCharCol.setName("varchar") varCharCol.setType(serdeConstants.VARCHAR_TYPE_NAME) testTable.setPartCols(Collections.singletonList(varCharCol)) filterTest("string filter", (a("stringcol", StringType) > Literal("test")) :: Nil, "stringcol > \"test\"") filterTest("string filter backwards", (Literal("test") > a("stringcol", StringType)) :: Nil, "\"test\" > stringcol") filterTest("int filter", (a("intcol", IntegerType) === Literal(1)) :: Nil, "intcol = 1") filterTest("int filter backwards", (Literal(1) === a("intcol", IntegerType)) :: Nil, "1 = intcol") filterTest("int and string filter", (Literal(1) === a("intcol", IntegerType)) :: (Literal("a") === a("strcol", IntegerType)) :: Nil, "1 = intcol and \"a\" = strcol") filterTest("skip varchar", (Literal("") === a("varchar", StringType)) :: Nil, "") filterTest("SPARK-19912 String literals should be escaped for Hive metastore partition pruning", (a("stringcol", StringType) === Literal("p1\" and q=\"q1")) :: (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") private def filterTest(name: String, filters: Seq[Expression], result: String) = { test(name) { withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") { val converted = shim.convertFilters(testTable, filters) if (converted != result) { fail(s"Expected ${filters.mkString(",")} to convert to '$result' but got '$converted'") } } } } test("turn on/off ADVANCED_PARTITION_PREDICATE_PUSHDOWN") { import org.apache.spark.sql.catalyst.dsl.expressions._ Seq(true, false).foreach { enabled => withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> enabled.toString) { val filters = (Literal(1) === a("intcol", IntegerType) || Literal(2) === a("intcol", IntegerType)) :: Nil val converted = shim.convertFilters(testTable, filters) if (enabled) { assert(converted == "(1 = intcol or 2 = intcol)") } else { assert(converted.isEmpty) } } } } private def a(name: String, dataType: DataType) = AttributeReference(name, dataType)() }
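The suite registers one test per row of data through its filterTest helper instead of writing each test body by hand. A stand-alone sketch of that data-driven ScalaTest pattern; the convert function here is a trivial placeholder, not Hive's filter shim:

import org.scalatest.FunSuite

class ConversionSuite extends FunSuite {
  // Placeholder for something like Shim_v0_13.convertFilters.
  private def convert(input: String): String = input.trim.toLowerCase

  // One registered test per (name, input, expected) row, mirroring filterTest above.
  private def conversionTest(name: String, input: String, expected: String): Unit = {
    test(name) {
      val converted = convert(input)
      if (converted != expected) {
        fail(s"Expected '$input' to convert to '$expected' but got '$converted'")
      }
    }
  }

  conversionTest("lower cases", "ABC", "abc")
  conversionTest("trims whitespace", "  a = 1 ", "a = 1")
}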
Example 143
Source File: SparkSQLDriver.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 144
Source File: SparkSQLOperationManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server import java.util.{Map => JMap} import java.util.concurrent.ConcurrentHashMap import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation} import org.apache.spark.sql.internal.SQLConf private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = new ConcurrentHashMap[SessionHandle, String]() val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { val sqlContext = sessionToContexts.get(parentSession.getSessionHandle) require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val conf = sqlContext.sessionState.conf val hiveSessionState = parentSession.getSessionState setConfMap(conf, hiveSessionState.getOverriddenConfigurations) setConfMap(conf, hiveSessionState.getHiveVariables) val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") operation } def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = { val iterator = confMap.entrySet().iterator() while (iterator.hasNext) { val kv = iterator.next() conf.setConfString(kv.getKey, kv.getValue) } } }
Example 145
Source File: ThriftServerTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 146
Source File: SparkSQLEnv.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.util.Utils private[hive] object SparkSQLEnv extends Logging { var sqlContext: SQLContext = _ var sparkContext: SparkContext = _ def stop() { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() sparkContext = null sqlContext = null } } }
Example 147
Source File: UDTRegistration.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
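The lookup above reduces to loading a class by name and verifying it is assignable to an expected supertype before handing it back. A self-contained sketch of that check, using only JDK classes as examples:

object ClassCheckSketch {
  // Load `className` and accept it only if it is assignable to `expected`,
  // mirroring the UDT check above.
  def loadAs(className: String, expected: Class[_]): Either[String, Class[_]] = {
    try {
      val clazz = Class.forName(className, true, Thread.currentThread().getContextClassLoader)
      if (expected.isAssignableFrom(clazz)) Right(clazz)
      else Left(s"${clazz.getName} is not a ${expected.getName}")
    } catch {
      case _: ClassNotFoundException => Left(s"Cannot load class $className")
    }
  }

  def main(args: Array[String]): Unit = {
    println(loadAs("java.util.ArrayList", classOf[java.util.List[_]]))  // Right(...)
    println(loadAs("java.util.HashMap", classOf[java.util.List[_]]))    // Left(... is not a ...)
    println(loadAs("does.not.Exist", classOf[java.util.List[_]]))       // Left(Cannot load ...)
  }
}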
Example 148
Source File: BoundAttribute.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.errors.attachTree import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types._ case class BoundReference(ordinal: Int, dataType: DataType, nullable: Boolean) extends LeafExpression { override def toString: String = s"input[$ordinal, ${dataType.simpleString}, $nullable]" // Use special getter for primitive types (for UnsafeRow) override def eval(input: InternalRow): Any = { if (input.isNullAt(ordinal)) { null } else { dataType match { case BooleanType => input.getBoolean(ordinal) case ByteType => input.getByte(ordinal) case ShortType => input.getShort(ordinal) case IntegerType | DateType => input.getInt(ordinal) case LongType | TimestampType => input.getLong(ordinal) case FloatType => input.getFloat(ordinal) case DoubleType => input.getDouble(ordinal) case StringType => input.getUTF8String(ordinal) case BinaryType => input.getBinary(ordinal) case CalendarIntervalType => input.getInterval(ordinal) case t: DecimalType => input.getDecimal(ordinal, t.precision, t.scale) case t: StructType => input.getStruct(ordinal, t.size) case _: ArrayType => input.getArray(ordinal) case _: MapType => input.getMap(ordinal) case _ => input.get(ordinal, dataType) } } } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { if (ctx.currentVars != null && ctx.currentVars(ordinal) != null) { val oev = ctx.currentVars(ordinal) ev.isNull = oev.isNull ev.value = oev.value ev.copy(code = oev.code) } else { assert(ctx.INPUT_ROW != null, "INPUT_ROW and currentVars cannot both be null.") val javaType = ctx.javaType(dataType) val value = ctx.getValue(ctx.INPUT_ROW, dataType, ordinal.toString) if (nullable) { ev.copy(code = s""" |boolean ${ev.isNull} = ${ctx.INPUT_ROW}.isNullAt($ordinal); |$javaType ${ev.value} = ${ev.isNull} ? ${ctx.defaultValue(dataType)} : ($value); """.stripMargin) } else { ev.copy(code = s"$javaType ${ev.value} = $value;", isNull = "false") } } } } object BindReferences extends Logging { def bindReference[A <: Expression]( expression: A, input: AttributeSeq, allowFailures: Boolean = false): A = { expression.transform { case a: AttributeReference => attachTree(a, "Binding attribute") { val ordinal = input.indexOf(a.exprId) if (ordinal == -1) { if (allowFailures) { a } else { sys.error(s"Couldn't find $a in ${input.attrs.mkString("[", ",", "]")}") } } else { BoundReference(ordinal, a.dataType, input(ordinal).nullable) } } }.asInstanceOf[A] // Kind of a hack, but safe. TODO: Tighten return type when possible. } }
Example 149
Source File: JSONOptions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.json import java.util.{Locale, TimeZone} import com.fasterxml.jackson.core.{JsonFactory, JsonParser} import org.apache.commons.lang3.time.FastDateFormat import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util._ def setJacksonOptions(factory: JsonFactory): Unit = { factory.configure(JsonParser.Feature.ALLOW_COMMENTS, allowComments) factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, allowUnquotedFieldNames) factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, allowSingleQuotes) factory.configure(JsonParser.Feature.ALLOW_NUMERIC_LEADING_ZEROS, allowNumericLeadingZeros) factory.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, allowNonNumericNumbers) factory.configure(JsonParser.Feature.ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER, allowBackslashEscapingAnyCharacter) factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_CONTROL_CHARS, allowUnquotedControlChars) } }
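A short stand-alone illustration of what those Jackson toggles buy: with ALLOW_COMMENTS, ALLOW_UNQUOTED_FIELD_NAMES and ALLOW_SINGLE_QUOTES enabled, the factory below parses input the default parser would reject (the sample document is made up):

import com.fasterxml.jackson.core.{JsonFactory, JsonParser, JsonToken}

object LenientJsonSketch {
  def main(args: Array[String]): Unit = {
    val factory = new JsonFactory()
    factory.configure(JsonParser.Feature.ALLOW_COMMENTS, true)
    factory.configure(JsonParser.Feature.ALLOW_UNQUOTED_FIELD_NAMES, true)
    factory.configure(JsonParser.Feature.ALLOW_SINGLE_QUOTES, true)

    // Comments, unquoted field names and single quotes: all tolerated with the flags above.
    val json = """{ /* comment */ name: 'spark', version: 2 }"""

    val parser = factory.createParser(json)
    try {
      var token: JsonToken = parser.nextToken()
      while (token != null) {
        if (token == JsonToken.FIELD_NAME) {
          println(s"field: ${parser.getCurrentName}")
        }
        token = parser.nextToken()
      }
    } finally {
      parser.close()
    }
  }
}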
Example 150
Source File: ParseMode.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import java.util.Locale import org.apache.spark.internal.Logging sealed trait ParseMode { def name: String } case object PermissiveMode extends ParseMode { val name: String = "PERMISSIVE" } case object DropMalformedMode extends ParseMode { val name: String = "DROPMALFORMED" } case object FailFastMode extends ParseMode { val name: String = "FAILFAST" } object ParseMode extends Logging { def fromString(mode: String): ParseMode = mode.toUpperCase(Locale.ROOT) match { case PermissiveMode.name => PermissiveMode case DropMalformedMode.name => DropMalformedMode case FailFastMode.name => FailFastMode case _ => logWarning(s"$mode is not a valid parse mode. Using ${PermissiveMode.name}.") PermissiveMode } }
Example 151
Source File: DataSourceV2Utils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.v2 import java.util.regex.Pattern import org.apache.spark.internal.Logging import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.v2.{DataSourceV2, SessionConfigSupport} private[sql] object DataSourceV2Utils extends Logging { def extractSessionConfigs(ds: DataSourceV2, conf: SQLConf): Map[String, String] = ds match { case cs: SessionConfigSupport => val keyPrefix = cs.keyPrefix() require(keyPrefix != null, "The data source config key prefix can't be null.") val pattern = Pattern.compile(s"^spark\\.datasource\\.$keyPrefix\\.(.+)") conf.getAllConfs.flatMap { case (key, value) => val m = pattern.matcher(key) if (m.matches() && m.groupCount() > 0) { Seq((m.group(1), value)) } else { Seq.empty } } case _ => Map.empty } }
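The extraction above is plain regex work over a config map: keep the keys under spark.datasource.<prefix>. and strip the prefix. A stand-alone sketch with an invented prefix and settings:

import java.util.regex.Pattern

object SessionConfigSketch {
  // Returns the entries under "spark.datasource.<keyPrefix>." with the prefix removed.
  def extract(allConfs: Map[String, String], keyPrefix: String): Map[String, String] = {
    val pattern = Pattern.compile(s"^spark\\.datasource\\.$keyPrefix\\.(.+)")
    allConfs.flatMap { case (key, value) =>
      val m = pattern.matcher(key)
      if (m.matches() && m.groupCount() > 0) Some(m.group(1) -> value) else None
    }
  }

  def main(args: Array[String]): Unit = {
    val confs = Map(
      "spark.datasource.mysource.url"   -> "http://example.invalid",  // hypothetical keys
      "spark.datasource.mysource.token" -> "abc",
      "spark.sql.shuffle.partitions"    -> "200")
    println(extract(confs, "mysource"))   // Map(url -> ..., token -> abc)
  }
}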
Example 152
Source File: DriverRegistry.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.sql.{Driver, DriverManager} import scala.collection.mutable import org.apache.spark.internal.Logging import org.apache.spark.util.Utils object DriverRegistry extends Logging { DriverManager.getDrivers private val wrapperMap: mutable.Map[String, DriverWrapper] = mutable.Map.empty def register(className: String): Unit = { val cls = Utils.getContextOrSparkClassLoader.loadClass(className) if (cls.getClassLoader == null) { logTrace(s"$className has been loaded with bootstrap ClassLoader, wrapper is not required") } else if (wrapperMap.get(className).isDefined) { logTrace(s"Wrapper for $className already exists") } else { synchronized { if (wrapperMap.get(className).isEmpty) { val wrapper = new DriverWrapper(cls.newInstance().asInstanceOf[Driver]) DriverManager.registerDriver(wrapper) wrapperMap(className) = wrapper logTrace(s"Wrapper for $className registered") } } } } }
Example 153
Source File: SQLHadoopMapReduceCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext} import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import org.apache.spark.internal.Logging import org.apache.spark.internal.io.HadoopMapReduceCommitProtocol import org.apache.spark.sql.internal.SQLConf class SQLHadoopMapReduceCommitProtocol( jobId: String, path: String, dynamicPartitionOverwrite: Boolean = false) extends HadoopMapReduceCommitProtocol(jobId, path, dynamicPartitionOverwrite) with Serializable with Logging { override protected def setupCommitter(context: TaskAttemptContext): OutputCommitter = { var committer = super.setupCommitter(context) val configuration = context.getConfiguration val clazz = configuration.getClass(SQLConf.OUTPUT_COMMITTER_CLASS.key, null, classOf[OutputCommitter]) if (clazz != null) { logInfo(s"Using user defined output committer class ${clazz.getCanonicalName}") // Every output format based on org.apache.hadoop.mapreduce.lib.output.OutputFormat // has an associated output committer. To override this output committer, // we will first try to use the output committer set in SQLConf.OUTPUT_COMMITTER_CLASS. // If a data source needs to override the output committer, it needs to set the // output committer in prepareForWrite method. if (classOf[FileOutputCommitter].isAssignableFrom(clazz)) { // The specified output committer is a FileOutputCommitter. // So, we will use the FileOutputCommitter-specified constructor. val ctor = clazz.getDeclaredConstructor(classOf[Path], classOf[TaskAttemptContext]) committer = ctor.newInstance(new Path(path), context) } else { // The specified output committer is just an OutputCommitter. // So, we will use the no-argument constructor. val ctor = clazz.getDeclaredConstructor() committer = ctor.newInstance() } } logInfo(s"Using output committer class ${committer.getClass.getCanonicalName}") committer } }
Example 154
Source File: FrequentItems.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.stat import scala.collection.mutable.{Map => MutableMap} import org.apache.spark.internal.Logging import org.apache.spark.sql.{Column, DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types._ object FrequentItems extends Logging { def singlePassFreqItems( df: DataFrame, cols: Seq[String], support: Double): DataFrame = { require(support >= 1e-4 && support <= 1.0, s"Support must be in [1e-4, 1], but got $support.") val numCols = cols.length // number of max items to keep counts for val sizeOfMap = (1 / support).toInt val countMaps = Seq.tabulate(numCols)(i => new FreqItemCounter(sizeOfMap)) val originalSchema = df.schema val colInfo: Array[(String, DataType)] = cols.map { name => val index = originalSchema.fieldIndex(name) (name, originalSchema.fields(index).dataType) }.toArray val freqItems = df.select(cols.map(Column(_)) : _*).rdd.treeAggregate(countMaps)( seqOp = (counts, row) => { var i = 0 while (i < numCols) { val thisMap = counts(i) val key = row.get(i) thisMap.add(key, 1L) i += 1 } counts }, combOp = (baseCounts, counts) => { var i = 0 while (i < numCols) { baseCounts(i).merge(counts(i)) i += 1 } baseCounts } ) val justItems = freqItems.map(m => m.baseMap.keys.toArray) val resultRow = Row(justItems : _*) // append frequent Items to the column name for easy debugging val outputCols = colInfo.map { v => StructField(v._1 + "_freqItems", ArrayType(v._2, false)) } val schema = StructType(outputCols).toAttributes Dataset.ofRows(df.sparkSession, LocalRelation.fromExternalRows(schema, Seq(resultRow))) } }
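treeAggregate is the only RDD primitive the computation relies on: seqOp folds each partition into a local map and combOp merges the partial maps tree-wise. A small sketch counting item frequencies the same way, with invented data and a local master:

import scala.collection.mutable

import org.apache.spark.sql.SparkSession

object TreeAggregateSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("tree-aggregate-sketch").getOrCreate()
    val items = spark.sparkContext.parallelize(Seq("a", "b", "a", "c", "a", "b"), numSlices = 3)

    val counts = items.treeAggregate(mutable.Map.empty[String, Long])(
      // Fold items of one partition into a local count map.
      seqOp = (acc, item) => { acc(item) = acc.getOrElse(item, 0L) + 1L; acc },
      // Merge two partial count maps.
      combOp = (left, right) => { right.foreach { case (k, v) => left(k) = left.getOrElse(k, 0L) + v }; left }
    )

    println(counts)   // e.g. Map(a -> 3, b -> 2, c -> 1)
    spark.stop()
  }
}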
Example 155
Source File: CompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import java.nio.{ByteBuffer, ByteOrder} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.execution.columnar.{ColumnBuilder, NativeColumnBuilder} import org.apache.spark.sql.types.AtomicType import org.apache.spark.unsafe.Platform private[columnar] trait CompressibleColumnBuilder[T <: AtomicType] extends ColumnBuilder with Logging { this: NativeColumnBuilder[T] with WithCompressionSchemes => var compressionEncoders: Seq[Encoder[T]] = _ abstract override def initialize( initialSize: Int, columnName: String, useCompression: Boolean): Unit = { compressionEncoders = if (useCompression) { schemes.filter(_.supports(columnType)).map(_.encoder[T](columnType)) } else { Seq(PassThrough.encoder(columnType)) } super.initialize(initialSize, columnName, useCompression) } // The various compression schemes, while saving memory use, cause all of the data within // the row to become unaligned, thus causing crashes. Until a way of fixing the compression // is found to also allow aligned accesses this must be disabled for SPARC. protected def isWorthCompressing(encoder: Encoder[T]) = { CompressibleColumnBuilder.unaligned && encoder.compressionRatio < 0.8 } private def gatherCompressibilityStats(row: InternalRow, ordinal: Int): Unit = { compressionEncoders.foreach(_.gatherCompressibilityStats(row, ordinal)) } abstract override def appendFrom(row: InternalRow, ordinal: Int): Unit = { super.appendFrom(row, ordinal) if (!row.isNullAt(ordinal)) { gatherCompressibilityStats(row, ordinal) } } override def build(): ByteBuffer = { val nonNullBuffer = buildNonNulls() val encoder: Encoder[T] = { val candidate = compressionEncoders.minBy(_.compressionRatio) if (isWorthCompressing(candidate)) candidate else PassThrough.encoder(columnType) } // Header = null count + null positions val headerSize = 4 + nulls.limit() val compressedSize = if (encoder.compressedSize == 0) { nonNullBuffer.remaining() } else { encoder.compressedSize } val compressedBuffer = ByteBuffer // Reserves 4 bytes for compression scheme ID .allocate(headerSize + 4 + compressedSize) .order(ByteOrder.nativeOrder) // Write the header .putInt(nullCount) .put(nulls) logDebug(s"Compressor for [$columnName]: $encoder, ratio: ${encoder.compressionRatio}") encoder.compress(nonNullBuffer, compressedBuffer) } } private[columnar] object CompressibleColumnBuilder { val unaligned = Platform.unaligned() }
Example 156
Source File: MetricsReporter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import com.codahale.metrics.{Gauge, MetricRegistry} import org.apache.spark.internal.Logging import org.apache.spark.metrics.source.{Source => CodahaleSource} import org.apache.spark.sql.streaming.StreamingQueryProgress class MetricsReporter( stream: StreamExecution, override val sourceName: String) extends CodahaleSource with Logging { override val metricRegistry: MetricRegistry = new MetricRegistry // Metric names should not have . in them, so that all the metrics of a query are identified // together in Ganglia as a single metric group registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0) registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0) registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L) private def registerGauge[T]( name: String, f: StreamingQueryProgress => T, default: T): Unit = { synchronized { metricRegistry.register(name, new Gauge[T] { override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default) }) } } }
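The reporter is ultimately a Dropwizard MetricRegistry holding Gauges that re-read the latest progress on every report. A self-contained sketch registering a gauge over an arbitrary counter and printing it through a ConsoleReporter (metric name and value are invented):

import java.util.concurrent.TimeUnit
import java.util.concurrent.atomic.AtomicLong

import com.codahale.metrics.{ConsoleReporter, Gauge, MetricRegistry}

object GaugeSketch {
  def main(args: Array[String]): Unit = {
    val registry = new MetricRegistry
    val processed = new AtomicLong(0)

    // The gauge re-reads `processed` every time it is reported, like the
    // lastProgress-backed gauges above.
    registry.register("records.processed", new Gauge[Long] {
      override def getValue: Long = processed.get()
    })

    val reporter = ConsoleReporter.forRegistry(registry)
      .convertRatesTo(TimeUnit.SECONDS)
      .convertDurationsTo(TimeUnit.MILLISECONDS)
      .build()

    processed.addAndGet(42)
    reporter.report()   // prints the current gauge value once
    reporter.stop()
  }
}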
Example 157
Source File: FileStreamOptions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.util.Try import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.util.Utils class FileStreamOptions(parameters: CaseInsensitiveMap[String]) extends Logging { val fileNameOnly: Boolean = withBooleanParameter("fileNameOnly", false) private def withBooleanParameter(name: String, default: Boolean) = { parameters.get(name).map { str => try { str.toBoolean } catch { case _: IllegalArgumentException => throw new IllegalArgumentException( s"Invalid value '$str' for option '$name', must be 'true' or 'false'") } }.getOrElse(default) } }
Example 158
Source File: StreamMetadata.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.io.{InputStreamReader, OutputStreamWriter} import java.nio.charset.StandardCharsets import scala.util.control.NonFatal import org.apache.commons.io.IOUtils import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, FSDataInputStream, FSDataOutputStream, Path} import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import org.apache.spark.internal.Logging import org.apache.spark.sql.streaming.StreamingQuery def write( metadata: StreamMetadata, metadataFile: Path, hadoopConf: Configuration): Unit = { var output: FSDataOutputStream = null try { val fs = metadataFile.getFileSystem(hadoopConf) output = fs.create(metadataFile) val writer = new OutputStreamWriter(output) Serialization.write(metadata, writer) writer.close() } catch { case NonFatal(e) => logError(s"Error writing stream metadata $metadata to $metadataFile", e) throw e } finally { IOUtils.closeQuietly(output) } } }
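The metadata file is plain JSON written with json4s' Serialization, which needs an implicit Formats in scope. A minimal round-trip sketch for an invented case class standing in for StreamMetadata:

import org.json4s.{Formats, NoTypeHints}
import org.json4s.jackson.Serialization

// Invented stand-in for StreamMetadata.
case class Metadata(id: String, version: Int)

object Json4sSketch {
  def main(args: Array[String]): Unit = {
    implicit val formats: Formats = Serialization.formats(NoTypeHints)

    val json = Serialization.write(Metadata("query-1", 2))
    println(json)                                    // {"id":"query-1","version":2}

    val parsed = Serialization.read[Metadata](json)  // back to the case class
    println(parsed)
  }
}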
Example 159
Source File: ConsoleWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import scala.collection.JavaConverters._ import org.apache.spark.internal.Logging import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.sources.v2.DataSourceOptions import org.apache.spark.sql.sources.v2.writer.{DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.types.StructType class ConsoleWriter(schema: StructType, options: DataSourceOptions) extends StreamWriter with Logging { // Number of rows to display, by default 20 rows protected val numRowsToShow = options.getInt("numRows", 20) // Truncate the displayed data if it is too long, by default it is true protected val isTruncated = options.getBoolean("truncate", true) assert(SparkSession.getActiveSession.isDefined) protected val spark = SparkSession.getActiveSession.get def createWriterFactory(): DataWriterFactory[Row] = PackedRowWriterFactory override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = { // We have to print a "Batch" label for the epoch for compatibility with the pre-data source V2 // behavior. printRows(messages, schema, s"Batch: $epochId") } def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} protected def printRows( commitMessages: Array[WriterCommitMessage], schema: StructType, printMessage: String): Unit = { val rows = commitMessages.collect { case PackedRowCommitMessage(rs) => rs }.flatten // scalastyle:off println println("-------------------------------------------") println(printMessage) println("-------------------------------------------") // scalastyle:off println spark .createDataFrame(rows.toList.asJava, schema) .show(numRowsToShow, isTruncated) } override def toString(): String = { s"ConsoleWriter[numRows=$numRowsToShow, truncate=$isTruncated]" } }
Example 160
Source File: ManifestFileCommitProtocol.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.UUID import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.fs.Path import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext} import org.apache.spark.internal.Logging import org.apache.spark.internal.io.FileCommitProtocol import org.apache.spark.internal.io.FileCommitProtocol.TaskCommitMessage def setupManifestOptions(fileLog: FileStreamSinkLog, batchId: Long): Unit = { this.fileLog = fileLog this.batchId = batchId } override def setupJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def commitJob(jobContext: JobContext, taskCommits: Seq[TaskCommitMessage]): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") val fileStatuses = taskCommits.flatMap(_.obj.asInstanceOf[Seq[SinkFileStatus]]).toArray if (fileLog.add(batchId, fileStatuses)) { logInfo(s"Committed batch $batchId") } else { throw new IllegalStateException(s"Race while writing batch $batchId") } } override def abortJob(jobContext: JobContext): Unit = { require(fileLog != null, "setupManifestOptions must be called before this function") // Do nothing } override def setupTask(taskContext: TaskAttemptContext): Unit = { addedFiles = new ArrayBuffer[String] } override def newTaskTempFile( taskContext: TaskAttemptContext, dir: Option[String], ext: String): String = { // The file name looks like part-r-00000-2dd664f9-d2c4-4ffe-878f-c6c70c1fb0cb_00003.gz.parquet // Note that %05d does not truncate the split number, so if we have more than 100000 tasks, // the file name is fine and won't overflow. val split = taskContext.getTaskAttemptID.getTaskID.getId val uuid = UUID.randomUUID.toString val filename = f"part-$split%05d-$uuid$ext" val file = dir.map { d => new Path(new Path(path, d), filename).toString }.getOrElse { new Path(path, filename).toString } addedFiles += file file } override def newTaskTempFileAbsPath( taskContext: TaskAttemptContext, absoluteDir: String, ext: String): String = { throw new UnsupportedOperationException( s"$this does not support adding files with an absolute path") } override def commitTask(taskContext: TaskAttemptContext): TaskCommitMessage = { if (addedFiles.nonEmpty) { val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration) val statuses: Seq[SinkFileStatus] = addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f)))) new TaskCommitMessage(statuses) } else { new TaskCommitMessage(Seq.empty[SinkFileStatus]) } } override def abortTask(taskContext: TaskAttemptContext): Unit = { // Do nothing // TODO: we can also try delete the addedFiles as a best-effort cleanup. } }
Example 161
Source File: SocketInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io._ import java.net.{ConnectException, Socket} import java.nio.charset.StandardCharsets import scala.reflect.ClassTag import scala.util.control.NonFatal import org.apache.spark.internal.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.util.NextIterator private[streaming] class SocketInputDStream[T: ClassTag]( _ssc: StreamingContext, host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends ReceiverInputDStream[T](_ssc) { def getReceiver(): Receiver[T] = { new SocketReceiver(host, port, bytesToObjects, storageLevel) } } private[streaming] class SocketReceiver[T: ClassTag]( host: String, port: Int, bytesToObjects: InputStream => Iterator[T], storageLevel: StorageLevel ) extends Receiver[T](storageLevel) with Logging { private var socket: Socket = _ def onStart() { logInfo(s"Connecting to $host:$port") try { socket = new Socket(host, port) } catch { case e: ConnectException => restart(s"Error connecting to $host:$port", e) return } logInfo(s"Connected to $host:$port") // Start the thread that receives data over a connection new Thread("Socket Receiver") { setDaemon(true) override def run() { receive() } }.start() } def onStop() { // in case restart thread close it twice synchronized { if (socket != null) { socket.close() socket = null logInfo(s"Closed socket to $host:$port") } } } def bytesToLines(inputStream: InputStream): Iterator[String] = { val dataInputStream = new BufferedReader( new InputStreamReader(inputStream, StandardCharsets.UTF_8)) new NextIterator[String] { protected override def getNext() = { val nextValue = dataInputStream.readLine() if (nextValue == null) { finished = true } nextValue } protected override def close() { dataInputStream.close() } } } }
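Beneath the receiver, bytesToLines is a BufferedReader over the socket's input stream that yields lines until readLine() returns null. A bare-bones sketch outside of Spark, with a placeholder host and port:

import java.io.{BufferedReader, InputStreamReader}
import java.net.Socket
import java.nio.charset.StandardCharsets

object SocketLinesSketch {
  // Connects, then prints every line until the server closes the connection.
  def main(args: Array[String]): Unit = {
    val socket = new Socket("localhost", 9999)   // placeholder endpoint
    val reader = new BufferedReader(
      new InputStreamReader(socket.getInputStream, StandardCharsets.UTF_8))
    try {
      var line = reader.readLine()
      while (line != null) {          // null signals end of stream
        println(line)
        line = reader.readLine()
      }
    } finally {
      reader.close()
      socket.close()
    }
  }
}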
Example 162
Source File: StreamingTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) parent.setStreamingJobProgressListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 163
Source File: RecurringTimer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import org.apache.spark.internal.Logging import org.apache.spark.util.{Clock, SystemClock} private[streaming] class RecurringTimer(clock: Clock, period: Long, callback: (Long) => Unit, name: String) extends Logging { private val thread = new Thread("RecurringTimer - " + name) { setDaemon(true) override def run() { loop } } @volatile private var prevTime = -1L @volatile private var nextTime = -1L @volatile private var stopped = false private def loop() { try { while (!stopped) { triggerActionForNextInterval() } triggerActionForNextInterval() } catch { case e: InterruptedException => } } } private[streaming] object RecurringTimer extends Logging { def main(args: Array[String]) { var lastRecurTime = 0L val period = 1000 def onRecur(time: Long) { val currentTime = System.currentTimeMillis() logInfo("" + currentTime + ": " + (currentTime - lastRecurTime)) lastRecurTime = currentTime } val timer = new RecurringTimer(new SystemClock(), period, onRecur, "Test") timer.start() Thread.sleep(30 * 1000) timer.stop(true) } }
Example 164
Source File: RawTextSender.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 165
Source File: FileBasedWriteAheadLogReader.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.{Closeable, EOFException, IOException} import java.nio.ByteBuffer import org.apache.hadoop.conf.Configuration import org.apache.spark.internal.Logging private[streaming] class FileBasedWriteAheadLogReader(path: String, conf: Configuration) extends Iterator[ByteBuffer] with Closeable with Logging { private val instream = HdfsUtils.getInputStream(path, conf) private var closed = (instream == null) // the file may be deleted as we're opening the stream private var nextItem: Option[ByteBuffer] = None override def hasNext: Boolean = synchronized { if (closed) { return false } if (nextItem.isDefined) { // handle the case where hasNext is called without calling next true } else { try { val length = instream.readInt() val buffer = new Array[Byte](length) instream.readFully(buffer) nextItem = Some(ByteBuffer.wrap(buffer)) logTrace("Read next item " + nextItem.get) true } catch { case e: EOFException => logDebug("Error reading next item, EOF reached", e) close() false case e: IOException => logWarning("Error while trying to read data. If the file was deleted, " + "this should be okay.", e) close() if (HdfsUtils.checkFileExists(path, conf)) { // If file exists, this could be a legitimate error throw e } else { // File was deleted. This can occur when the daemon cleanup thread takes time to // delete the file during recovery. false } case e: Exception => logWarning("Error while trying to read data from HDFS.", e) close() throw e } } } override def next(): ByteBuffer = synchronized { val data = nextItem.getOrElse { close() throw new IllegalStateException( "next called without calling hasNext or after hasNext returned false") } nextItem = None // Ensure the next hasNext call loads new data. data } override def close(): Unit = synchronized { if (!closed) { instream.close() } closed = true } }
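Each log record is stored as a 4-byte length followed by the payload, which is why the reader pairs readInt() with readFully(). A self-contained sketch that writes and reads that framing against a local file (path and records are invented):

import java.io.{DataInputStream, DataOutputStream, EOFException, FileInputStream, FileOutputStream}
import java.nio.charset.StandardCharsets

object LengthPrefixedSketch {
  def main(args: Array[String]): Unit = {
    val path = "/tmp/records.bin"   // illustrative path

    // Write: 4-byte length, then the payload bytes.
    val out = new DataOutputStream(new FileOutputStream(path))
    try {
      Seq("first", "second", "third").foreach { s =>
        val bytes = s.getBytes(StandardCharsets.UTF_8)
        out.writeInt(bytes.length)
        out.write(bytes)
      }
    } finally out.close()

    // Read until EOF, mirroring hasNext above (readInt throws EOFException at the end).
    val in = new DataInputStream(new FileInputStream(path))
    try {
      while (true) {
        val length = in.readInt()
        val buffer = new Array[Byte](length)
        in.readFully(buffer)
        println(new String(buffer, StandardCharsets.UTF_8))
      }
    } catch {
      case _: EOFException => // end of log reached
    } finally in.close()
  }
}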
Example 166
Source File: RateLimitedOutputStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util import java.io.OutputStream import java.util.concurrent.TimeUnit._ import scala.annotation.tailrec import org.apache.spark.internal.Logging private[streaming] class RateLimitedOutputStream(out: OutputStream, desiredBytesPerSec: Int) extends OutputStream with Logging { require(desiredBytesPerSec > 0) private val SYNC_INTERVAL = NANOSECONDS.convert(10, SECONDS) private val CHUNK_SIZE = 8192 private var lastSyncTime = System.nanoTime private var bytesWrittenSinceSync = 0L override def write(b: Int) { waitToWrite(1) out.write(b) } override def write(bytes: Array[Byte]) { write(bytes, 0, bytes.length) } @tailrec override final def write(bytes: Array[Byte], offset: Int, length: Int) { val writeSize = math.min(length - offset, CHUNK_SIZE) if (writeSize > 0) { waitToWrite(writeSize) out.write(bytes, offset, writeSize) write(bytes, offset + writeSize, length) } } override def flush() { out.flush() } override def close() { out.close() } @tailrec private def waitToWrite(numBytes: Int) { val now = System.nanoTime val elapsedNanosecs = math.max(now - lastSyncTime, 1) val rate = bytesWrittenSinceSync.toDouble * 1000000000 / elapsedNanosecs if (rate < desiredBytesPerSec) { // It's okay to write; just update some variables and return bytesWrittenSinceSync += numBytes if (now > lastSyncTime + SYNC_INTERVAL) { // Sync interval has passed; let's resync lastSyncTime = now bytesWrittenSinceSync = numBytes } } else { // Calculate how much time we should sleep to bring ourselves to the desired rate. val targetTimeInMillis = bytesWrittenSinceSync * 1000 / desiredBytesPerSec val elapsedTimeInMillis = elapsedNanosecs / 1000000 val sleepTimeInMillis = targetTimeInMillis - elapsedTimeInMillis if (sleepTimeInMillis > 0) { logTrace("Natural rate is " + rate + " per second but desired rate is " + desiredBytesPerSec + ", sleeping for " + sleepTimeInMillis + " ms to compensate.") Thread.sleep(sleepTimeInMillis) } waitToWrite(numBytes) } } }
Example 167
Source File: FailureSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.File import org.scalatest.BeforeAndAfter import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.util.Utils class FailureSuite extends SparkFunSuite with BeforeAndAfter with Logging { private val batchDuration: Duration = Milliseconds(1000) private val numBatches = 30 private var directory: File = null before { directory = Utils.createTempDir() } after { if (directory != null) { Utils.deleteRecursively(directory) } StreamingContext.getActive().foreach { _.stop() } // Stop SparkContext if active SparkContext.getOrCreate(new SparkConf().setMaster("local").setAppName("bla")).stop() } test("multiple failures with map") { MasterFailureTest.testMap(directory.getAbsolutePath, numBatches, batchDuration) } test("multiple failures with updateStateByKey") { MasterFailureTest.testUpdateStateByKey(directory.getAbsolutePath, numBatches, batchDuration) } }
Example 168
Source File: BroadcastManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.broadcast import java.util.concurrent.atomic.AtomicLong import scala.reflect.ClassTag import org.apache.commons.collections.map.{AbstractReferenceMap, ReferenceMap} import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.internal.Logging private[spark] class BroadcastManager( val isDriver: Boolean, conf: SparkConf, securityManager: SecurityManager) extends Logging { private var initialized = false private var broadcastFactory: BroadcastFactory = null initialize() // Called by SparkContext or Executor before using Broadcast private def initialize() { synchronized { if (!initialized) { broadcastFactory = new TorrentBroadcastFactory broadcastFactory.initialize(isDriver, conf, securityManager) initialized = true } } } def stop() { broadcastFactory.stop() } private val nextBroadcastId = new AtomicLong(0) private[broadcast] val cachedValues = { new ReferenceMap(AbstractReferenceMap.HARD, AbstractReferenceMap.WEAK) } def newBroadcast[T: ClassTag](value_ : T, isLocal: Boolean): Broadcast[T] = { broadcastFactory.newBroadcast[T](value_, isLocal, nextBroadcastId.getAndIncrement()) } def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean) { broadcastFactory.unbroadcast(id, removeFromDriver, blocking) } }
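From user code this manager sits behind the public SparkContext.broadcast API. A short usage sketch with a local master and an invented lookup table:

import org.apache.spark.sql.SparkSession

object BroadcastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("broadcast-sketch").getOrCreate()
    val sc = spark.sparkContext

    // Ship a small lookup table to every executor once instead of with every task.
    val countryNames = sc.broadcast(Map("DE" -> "Germany", "FR" -> "France"))

    val codes = sc.parallelize(Seq("DE", "FR", "DE"))
    val named = codes.map(code => countryNames.value.getOrElse(code, "unknown")).collect()
    println(named.mkString(", "))   // Germany, France, Germany

    countryNames.destroy()          // release the broadcast when done
    spark.stop()
  }
}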
Example 169
Source File: ShellBasedGroupsMappingProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.security import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] class ShellBasedGroupsMappingProvider extends GroupMappingServiceProvider with Logging { override def getGroups(username: String): Set[String] = { val userGroups = getUnixGroups(username) logDebug("User: " + username + " Groups: " + userGroups.mkString(",")) userGroups } // shells out a "bash -c id -Gn username" to get user groups private def getUnixGroups(username: String): Set[String] = { val cmdSeq = Seq("bash", "-c", "id -Gn " + username) // we need to get rid of the trailing "\n" from the result of command execution Utils.executeAndGetOutput(cmdSeq).stripLineEnd.split(" ").toSet } }
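The provider shells out to id -Gn; the same call can be made directly with scala.sys.process. A stand-alone, Unix-only sketch that assumes bash and id are on the PATH:

import scala.sys.process._

object UnixGroupsSketch {
  // Returns the Unix groups of `username`, or an empty set if the command fails.
  // Do not pass untrusted input here: the username is interpolated into a shell command.
  def getUnixGroups(username: String): Set[String] = {
    try {
      Seq("bash", "-c", s"id -Gn $username").!!.stripLineEnd.split(" ").toSet
    } catch {
      case _: RuntimeException => Set.empty   // id exits non-zero for unknown users
    }
  }

  def main(args: Array[String]): Unit = {
    println(getUnixGroups(System.getProperty("user.name")))
  }
}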
Example 170
Source File: KVUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.status import java.io.File import scala.annotation.meta.getter import scala.collection.JavaConverters._ import scala.language.implicitConversions import scala.reflect.{classTag, ClassTag} import com.fasterxml.jackson.annotation.JsonInclude import com.fasterxml.jackson.module.scala.DefaultScalaModule import org.apache.spark.internal.Logging import org.apache.spark.util.kvstore._ private[spark] object KVUtils extends Logging { def viewToSeq[T]( view: KVStoreView[T], max: Int) (filter: T => Boolean): Seq[T] = { val iter = view.closeableIterator() try { iter.asScala.filter(filter).take(max).toList } finally { iter.close() } } private[spark] class MetadataMismatchException extends Exception }
Example 171
Source File: NettyRpcCallContext.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import scala.concurrent.Promise import org.apache.spark.internal.Logging import org.apache.spark.network.client.RpcResponseCallback import org.apache.spark.rpc.{RpcAddress, RpcCallContext} private[netty] abstract class NettyRpcCallContext(override val senderAddress: RpcAddress) extends RpcCallContext with Logging { protected def send(message: Any): Unit override def reply(response: Any): Unit = { send(response) } override def sendFailure(e: Throwable): Unit = { send(RpcFailure(e)) } } private[netty] class RemoteNettyRpcCallContext( nettyEnv: NettyRpcEnv, callback: RpcResponseCallback, senderAddress: RpcAddress) extends NettyRpcCallContext(senderAddress) { override protected def send(message: Any): Unit = { val reply = nettyEnv.serialize(message) callback.onSuccess(reply) } }
Example 172
Source File: BlockTransferService.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 173
Source File: NettyBlockRpcServer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.NioManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length val blocks = for (i <- (0 until blocksNum).view) yield blockManager.getBlockData(BlockId.apply(openBlocks.blockIds(i))) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with $blocksNum buffers") responseContext.onSuccess(new StreamHandle(streamId, blocksNum).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 174
Source File: SortShuffleWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.scheduler.MapStatus import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter} import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.Utils import org.apache.spark.util.collection.ExternalSorter private[spark] class SortShuffleWriter[K, V, C]( shuffleBlockResolver: IndexShuffleBlockResolver, handle: BaseShuffleHandle[K, V, C], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val blockManager = SparkEnv.get.blockManager private var sorter: ExternalSorter[K, V, _] = null // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private var mapStatus: MapStatus = null private val writeMetrics = context.taskMetrics().shuffleWriteMetrics override def stop(success: Boolean): Option[MapStatus] = { try { if (stopping) { return None } stopping = true if (success) { return Option(mapStatus) } else { return None } } finally { // Clean up our sorter, which may have its own intermediate files if (sorter != null) { val startTime = System.nanoTime() sorter.stop() writeMetrics.incWriteTime(System.nanoTime - startTime) sorter = null } } } } private[spark] object SortShuffleWriter { def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = { // We cannot bypass sorting if we need to do map-side aggregation. if (dep.mapSideCombine) { require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!") false } else { val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200) dep.partitioner.numPartitions <= bypassMergeThreshold } } }
Example 175
Source File: StatsdSink.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.metrics.sink import java.util.Properties import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry import org.apache.spark.SecurityManager import org.apache.spark.internal.Logging import org.apache.spark.metrics.MetricsSystem private[spark] object StatsdSink { val STATSD_KEY_HOST = "host" val STATSD_KEY_PORT = "port" val STATSD_KEY_PERIOD = "period" val STATSD_KEY_UNIT = "unit" val STATSD_KEY_PREFIX = "prefix" val STATSD_DEFAULT_HOST = "127.0.0.1" val STATSD_DEFAULT_PORT = "8125" val STATSD_DEFAULT_PERIOD = "10" val STATSD_DEFAULT_UNIT = "SECONDS" val STATSD_DEFAULT_PREFIX = "" } private[spark] class StatsdSink( val property: Properties, val registry: MetricRegistry, securityMgr: SecurityManager) extends Sink with Logging { import StatsdSink._ val host = property.getProperty(STATSD_KEY_HOST, STATSD_DEFAULT_HOST) val port = property.getProperty(STATSD_KEY_PORT, STATSD_DEFAULT_PORT).toInt val pollPeriod = property.getProperty(STATSD_KEY_PERIOD, STATSD_DEFAULT_PERIOD).toInt val pollUnit = TimeUnit.valueOf(property.getProperty(STATSD_KEY_UNIT, STATSD_DEFAULT_UNIT).toUpperCase) val prefix = property.getProperty(STATSD_KEY_PREFIX, STATSD_DEFAULT_PREFIX) MetricsSystem.checkMinimalPollingPeriod(pollUnit, pollPeriod) val reporter = new StatsdReporter(registry, host, port, prefix) override def start(): Unit = { reporter.start(pollPeriod, pollUnit) logInfo(s"StatsdSink started with prefix: '$prefix'") } override def stop(): Unit = { reporter.stop() logInfo("StatsdSink stopped.") } override def report(): Unit = reporter.report() }
Example 176
Source File: PythonGatewayServer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python import java.io.{DataOutputStream, File, FileOutputStream} import java.net.InetAddress import java.nio.charset.StandardCharsets.UTF_8 import java.nio.file.Files import py4j.GatewayServer import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[spark] object PythonGatewayServer extends Logging { initializeLogIfNecessary(true) def main(args: Array[String]): Unit = { val secret = Utils.createSecret(new SparkConf()) // Start a GatewayServer on an ephemeral port. Make sure the callback client is configured // with the same secret, in case the app needs callbacks from the JVM to the underlying // python processes. val localhost = InetAddress.getLoopbackAddress() val gatewayServer: GatewayServer = new GatewayServer.GatewayServerBuilder() .authToken(secret) .javaPort(0) .javaAddress(localhost) .callbackClient(GatewayServer.DEFAULT_PYTHON_PORT, localhost, secret) .build() gatewayServer.start() val boundPort: Int = gatewayServer.getListeningPort if (boundPort == -1) { logError("GatewayServer failed to bind; exiting") System.exit(1) } else { logDebug(s"Started PythonGatewayServer on port $boundPort") } // Communicate the connection information back to the python process by writing the // information in the requested file. This needs to match the read side in java_gateway.py. val connectionInfoPath = new File(sys.env("_PYSPARK_DRIVER_CONN_INFO_PATH")) val tmpPath = Files.createTempFile(connectionInfoPath.getParentFile().toPath(), "connection", ".info").toFile() val dos = new DataOutputStream(new FileOutputStream(tmpPath)) dos.writeInt(boundPort) val secretBytes = secret.getBytes(UTF_8) dos.writeInt(secretBytes.length) dos.write(secretBytes, 0, secretBytes.length) dos.close() if (!tmpPath.renameTo(connectionInfoPath)) { logError(s"Unable to write connection information to $connectionInfoPath.") System.exit(1) } // Exit on EOF or broken pipe to ensure that this process dies when the Python driver dies: while (System.in.read() != -1) { // Do nothing } logDebug("Exiting due to broken pipe from Python driver") System.exit(0) } }
Example 177
Source File: RBackendAuthHandler.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.{ByteArrayOutputStream, DataOutputStream} import java.nio.charset.StandardCharsets.UTF_8 import io.netty.channel.{Channel, ChannelHandlerContext, SimpleChannelInboundHandler} import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private class RBackendAuthHandler(secret: String) extends SimpleChannelInboundHandler[Array[Byte]] with Logging { override def channelRead0(ctx: ChannelHandlerContext, msg: Array[Byte]): Unit = { // The R code adds a null terminator to serialized strings, so ignore it here. val clientSecret = new String(msg, 0, msg.length - 1, UTF_8) try { require(secret == clientSecret, "Auth secret mismatch.") ctx.pipeline().remove(this) writeReply("ok", ctx.channel()) } catch { case e: Exception => logInfo("Authentication failure.", e) writeReply("err", ctx.channel()) ctx.close() } } private def writeReply(reply: String, chan: Channel): Unit = { val out = new ByteArrayOutputStream() SerDe.writeString(new DataOutputStream(out), reply) chan.writeAndFlush(out.toByteArray()) } }
Example 178
Source File: HBaseDelegationTokenProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.security import scala.reflect.runtime.universe import scala.util.control.NonFatal import org.apache.hadoop.conf.Configuration import org.apache.hadoop.security.Credentials import org.apache.hadoop.security.token.{Token, TokenIdentifier} import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[security] class HBaseDelegationTokenProvider extends HadoopDelegationTokenProvider with Logging { override def serviceName: String = "hbase" override def obtainDelegationTokens( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { try { val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader) val obtainToken = mirror.classLoader. loadClass("org.apache.hadoop.hbase.security.token.TokenUtil"). getMethod("obtainToken", classOf[Configuration]) logDebug("Attempting to fetch HBase security token.") val token = obtainToken.invoke(null, hbaseConf(hadoopConf)) .asInstanceOf[Token[_ <: TokenIdentifier]] logInfo(s"Get token from HBase: ${token.toString}") creds.addToken(token.getService, token) } catch { case NonFatal(e) => logDebug(s"Failed to get token from service $serviceName", e) } None } override def delegationTokensRequired( sparkConf: SparkConf, hadoopConf: Configuration): Boolean = { hbaseConf(hadoopConf).get("hbase.security.authentication") == "kerberos" } private def hbaseConf(conf: Configuration): Configuration = { try { val mirror = universe.runtimeMirror(Utils.getContextOrSparkClassLoader) val confCreate = mirror.classLoader. loadClass("org.apache.hadoop.hbase.HBaseConfiguration"). getMethod("create", classOf[Configuration]) confCreate.invoke(null, conf).asInstanceOf[Configuration] } catch { case NonFatal(e) => logDebug("Fail to invoke HBaseConfiguration", e) conf } } }
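A hedged usage sketch of the two methods above. The class is private[security], hence the package line, and the provider only does real work when the HBase client jars are on the classpath; the configuration value set here is just the condition delegationTokensRequired checks for.

package org.apache.spark.deploy.security // the provider is private[security]

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.security.Credentials
import org.apache.spark.SparkConf

object HBaseTokenSketch {
  def main(args: Array[String]): Unit = {
    val provider = new HBaseDelegationTokenProvider
    val hadoopConf = new Configuration()
    hadoopConf.set("hbase.security.authentication", "kerberos") // illustrative setting
    val sparkConf = new SparkConf()

    if (provider.delegationTokensRequired(sparkConf, hadoopConf)) {
      val creds = new Credentials()
      // Needs the HBase client on the classpath; any obtained token is added to `creds`.
      provider.obtainDelegationTokens(hadoopConf, sparkConf, creds)
    }
  }
}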
Example 179
Source File: SparkCuratorUtil.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.JavaConverters._ import org.apache.curator.framework.{CuratorFramework, CuratorFrameworkFactory} import org.apache.curator.retry.ExponentialBackoffRetry import org.apache.zookeeper.KeeperException import org.apache.spark.SparkConf import org.apache.spark.internal.Logging private[spark] object SparkCuratorUtil extends Logging { private val ZK_CONNECTION_TIMEOUT_MILLIS = 15000 private val ZK_SESSION_TIMEOUT_MILLIS = 60000 private val RETRY_WAIT_MILLIS = 5000 private val MAX_RECONNECT_ATTEMPTS = 3 def newClient( conf: SparkConf, zkUrlConf: String = "spark.deploy.zookeeper.url"): CuratorFramework = { val ZK_URL = conf.get(zkUrlConf) val zk = CuratorFrameworkFactory.newClient(ZK_URL, ZK_SESSION_TIMEOUT_MILLIS, ZK_CONNECTION_TIMEOUT_MILLIS, new ExponentialBackoffRetry(RETRY_WAIT_MILLIS, MAX_RECONNECT_ATTEMPTS)) zk.start() zk } def mkdir(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) == null) { try { zk.create().creatingParentsIfNeeded().forPath(path) } catch { case nodeExist: KeeperException.NodeExistsException => // do nothing, ignore node existing exception. case e: Exception => throw e } } } def deleteRecursive(zk: CuratorFramework, path: String) { if (zk.checkExists().forPath(path) != null) { for (child <- zk.getChildren.forPath(path).asScala) { zk.delete().forPath(path + "/" + child) } zk.delete().forPath(path) } } }
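A usage sketch of the helpers above, assuming a ZooKeeper ensemble is reachable at whatever address spark.deploy.zookeeper.url points to; the address and znode path here are placeholders.

package org.apache.spark.deploy // SparkCuratorUtil is private[spark]

import org.apache.spark.SparkConf

object CuratorSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk1:2181,zk2:2181") // placeholder
    val zk = SparkCuratorUtil.newClient(conf)
    try {
      SparkCuratorUtil.mkdir(zk, "/spark/sketch")           // idempotent create
      SparkCuratorUtil.deleteRecursive(zk, "/spark/sketch") // deletes children first, then the node
    } finally {
      zk.close()
    }
  }
}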
Example 180
Source File: FileSystemPersistenceEngine.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
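A sketch of round-tripping an object through this engine with Spark's JavaSerializer. The directory is a placeholder and the String payload is only an example; in the master the persisted objects are application, worker and driver state.

package org.apache.spark.deploy.master // the engine is private[master]

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

object PersistenceSketch {
  def main(args: Array[String]): Unit = {
    val engine = new FileSystemPersistenceEngine("/tmp/spark-recovery", // placeholder dir
      new JavaSerializer(new SparkConf()))
    engine.persist("app_example", "some serializable state") // one file per name
    val restored: Seq[String] = engine.read[String]("app_")  // everything with the prefix
    println(restored)
    engine.unpersist("app_example")
  }
}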
Example 181
Source File: RecoveryModeFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 182
Source File: MasterArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import scala.annotation.tailrec import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.{IntParam, Utils} private def printUsageAndExit(exitCode: Int) { // scalastyle:off println System.err.println( "Usage: Master [options]\n" + "\n" + "Options:\n" + " -i HOST, --ip HOST Hostname to listen on (deprecated, please use --host or -h) \n" + " -h HOST, --host HOST Hostname to listen on\n" + " -p PORT, --port PORT Port to listen on (default: 7077)\n" + " --webui-port PORT Port for web UI (default: 8080)\n" + " --properties-file FILE Path to a custom Spark properties file.\n" + " Default is conf/spark-defaults.conf.") // scalastyle:on println System.exit(exitCode) } }
Example 183
Source File: MasterWebUI.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import org.apache.spark.deploy.DeployMessages.{MasterStateResponse, RequestMasterState} import org.apache.spark.deploy.master.Master import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def initialize() { val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler( "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST"))) attachHandler(createRedirectHandler( "/driver/kill", "/", masterPage.handleDriverKillRequest, httpMethods = Set("POST"))) } def addProxy(): Unit = { val handler = createProxyHandler(idToUiAddress) attachHandler(handler) } def idToUiAddress(id: String): Option[String] = { val state = masterEndpointRef.askSync[MasterStateResponse](RequestMasterState) val maybeWorkerUiAddress = state.workers.find(_.id == id).map(_.webUiAddress) val maybeAppUiAddress = state.activeApps.find(_.id == id).map(_.desc.appUiUrl) maybeWorkerUiAddress.orElse(maybeAppUiAddress) } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 184
Source File: ZooKeeperLeaderElectionAgent.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.curator.framework.CuratorFramework import org.apache.curator.framework.recipes.leader.{LeaderLatch, LeaderLatchListener} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging private[master] class ZooKeeperLeaderElectionAgent(val masterInstance: LeaderElectable, conf: SparkConf) extends LeaderLatchListener with LeaderElectionAgent with Logging { val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/leader_election" private var zk: CuratorFramework = _ private var leaderLatch: LeaderLatch = _ private var status = LeadershipStatus.NOT_LEADER start() private def start() { logInfo("Starting ZooKeeper LeaderElection agent") zk = SparkCuratorUtil.newClient(conf) leaderLatch = new LeaderLatch(zk, WORKING_DIR) leaderLatch.addListener(this) leaderLatch.start() } override def stop() { leaderLatch.close() zk.close() } override def isLeader() { synchronized { // could have lost leadership by now. if (!leaderLatch.hasLeadership) { return } logInfo("We have gained leadership") updateLeadershipStatus(true) } } override def notLeader() { synchronized { // could have gained leadership by now. if (leaderLatch.hasLeadership) { return } logInfo("We have lost leadership") updateLeadershipStatus(false) } } private def updateLeadershipStatus(isLeader: Boolean) { if (isLeader && status == LeadershipStatus.NOT_LEADER) { status = LeadershipStatus.LEADER masterInstance.electedLeader() } else if (!isLeader && status == LeadershipStatus.LEADER) { status = LeadershipStatus.NOT_LEADER masterInstance.revokedLeadership() } } private object LeadershipStatus extends Enumeration { type LeadershipStatus = Value val LEADER, NOT_LEADER = Value } }
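A sketch of the contract the agent drives, inferred from the calls above: the elected instance only has to react to electedLeader() and revokedLeadership(). It assumes LeaderElectable declares exactly those two members; the implementation below just logs the transitions, and the ZooKeeper URL is a placeholder.

package org.apache.spark.deploy.master // the agent and LeaderElectable are master-internal

import org.apache.spark.SparkConf

// Minimal LeaderElectable for illustration: updateLeadershipStatus() above calls these.
class SketchLeaderElectable extends LeaderElectable {
  override def electedLeader(): Unit = println("now the leader")
  override def revokedLeadership(): Unit = println("leadership revoked")
}

object LeaderElectionSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().set("spark.deploy.zookeeper.url", "zk1:2181") // placeholder
    new ZooKeeperLeaderElectionAgent(new SketchLeaderElectable, conf)        // start() runs in the constructor
  }
}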
Example 185
Source File: ZooKeeperPersistenceEngine.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T]) } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } }
Example 186
Source File: DriverWrapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import java.io.File import org.apache.commons.lang3.StringUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{DependencyUtils, SparkHadoopUtil, SparkSubmit} import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() val host: String = Utils.localHostName() val port: Int = sys.props.getOrElse("spark.driver.port", "0").toInt val rpcEnv = RpcEnv.create("Driver", host, port, conf, new SecurityManager(conf)) logInfo(s"Driver address: ${rpcEnv.address}") rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl)) val currentLoader = Thread.currentThread.getContextClassLoader val userJarUrl = new File(userJar).toURI().toURL() val loader = if (sys.props.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { new ChildFirstURLClassLoader(Array(userJarUrl), currentLoader) } else { new MutableURLClassLoader(Array(userJarUrl), currentLoader) } Thread.currentThread.setContextClassLoader(loader) setupDependencies(loader, userJar) // Delegate to supplied main class val clazz = Utils.classForName(mainClass) val mainMethod = clazz.getMethod("main", classOf[Array[String]]) mainMethod.invoke(null, extraArgs.toArray[String]) rpcEnv.shutdown() case _ => // scalastyle:off println System.err.println("Usage: DriverWrapper <workerUrl> <userJar> <driverMainClass> [options]") // scalastyle:on println System.exit(-1) } } private def setupDependencies(loader: MutableURLClassLoader, userJar: String): Unit = { val sparkConf = new SparkConf() val secMgr = new SecurityManager(sparkConf) val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = Seq( "spark.jars.excludes", "spark.jars.packages", "spark.jars.repositories", "spark.jars.ivy", "spark.jars.ivySettings" ).map(sys.props.get(_).orNull) val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(packagesExclusions, packages, repositories, ivyRepoPath, Option(ivySettingsPath)) val jars = { val jarsProp = sys.props.get("spark.jars").orNull if (!StringUtils.isBlank(resolvedMavenCoordinates)) { SparkSubmit.mergeFileLists(jarsProp, resolvedMavenCoordinates) } else { jarsProp } } val localJars = DependencyUtils.resolveAndDownloadJars(jars, userJar, sparkConf, hadoopConf, secMgr) DependencyUtils.addJarsToClassPath(localJars, loader) } }
Example 187
Source File: CommandUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import java.io.{File, FileOutputStream, InputStream, IOException} import scala.collection.JavaConverters._ import scala.collection.Map import org.apache.spark.SecurityManager import org.apache.spark.deploy.Command import org.apache.spark.internal.Logging import org.apache.spark.launcher.WorkerCommandBuilder import org.apache.spark.util.Utils def redirectStream(in: InputStream, file: File) { val out = new FileOutputStream(file, true) // TODO: It would be nice to add a shutdown hook here that explains why the output is // terminating. Otherwise if the worker dies the executor logs will silently stop. new Thread("redirect output to " + file) { override def run() { try { Utils.copyStream(in, out, true) } catch { case e: IOException => logInfo("Redirection to " + file + " closed: " + e.getMessage) } } }.start() } }
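A sketch of the typical caller pattern for redirectStream, assuming the method lives on the CommandUtils object named in the file header. The child command and log paths are placeholders; the point is that each call spawns a background thread that copies one of the child's streams into a file.

package org.apache.spark.deploy.worker // CommandUtils is deploy-internal

import java.io.File

object RedirectSketch {
  def main(args: Array[String]): Unit = {
    val process = new ProcessBuilder("echo", "hello").start() // placeholder child process
    CommandUtils.redirectStream(process.getInputStream, new File("/tmp/stdout.log"))
    CommandUtils.redirectStream(process.getErrorStream, new File("/tmp/stderr.log"))
    process.waitFor()
  }
}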
Example 188
Source File: WorkerWebUI.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker.ui import java.io.File import javax.servlet.http.HttpServletRequest import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ import org.apache.spark.util.RpcUtils def initialize() { val logPage = new LogPage(this) attachPage(logPage) attachPage(new WorkerPage(this)) attachHandler(createStaticHandler(WorkerWebUI.STATIC_RESOURCE_BASE, "/static")) attachHandler(createServletHandler("/log", (request: HttpServletRequest) => logPage.renderLog(request), worker.securityMgr, worker.conf)) } } private[worker] object WorkerWebUI { val STATIC_RESOURCE_BASE = SparkUI.STATIC_RESOURCE_DIR val DEFAULT_RETAINED_DRIVERS = 1000 val DEFAULT_RETAINED_EXECUTORS = 1000 }
Example 189
Source File: WorkerWatcher.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.worker import org.apache.spark.internal.Logging import org.apache.spark.rpc._ private[spark] class WorkerWatcher( override val rpcEnv: RpcEnv, workerUrl: String, isTesting: Boolean = false) extends RpcEndpoint with Logging { logInfo(s"Connecting to worker $workerUrl") if (!isTesting) { rpcEnv.asyncSetupEndpointRefByURI(workerUrl) } // Used to avoid shutting down JVM during tests // In the normal case, exitNonZero will call `System.exit(-1)` to shutdown the JVM. In the unit // test, the user should call `setTesting(true)` so that `exitNonZero` will set `isShutDown` to // true rather than calling `System.exit`. The user can check `isShutDown` to know if // `exitNonZero` is called. private[deploy] var isShutDown = false // Lets filter events only from the worker's rpc system private val expectedAddress = RpcAddress.fromURIString(workerUrl) private def isWorker(address: RpcAddress) = expectedAddress == address private def exitNonZero() = if (isTesting) isShutDown = true else System.exit(-1) override def receive: PartialFunction[Any, Unit] = { case e => logWarning(s"Received unexpected message: $e") } override def onConnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { logInfo(s"Successfully connected to $workerUrl") } } override def onDisconnected(remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // This log message will never be seen logError(s"Lost connection to worker rpc endpoint $workerUrl. Exiting.") exitNonZero() } } override def onNetworkError(cause: Throwable, remoteAddress: RpcAddress): Unit = { if (isWorker(remoteAddress)) { // These logs may not be seen if the worker (and associated pipe) has died logError(s"Could not initialize connection to worker $workerUrl. Exiting.") logError(s"Error was: $cause") exitNonZero() } } }
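Usage mirrors what DriverWrapper does a few examples above: create an RpcEnv and register the watcher so the JVM exits if the parent worker goes away. A condensed sketch, with the worker URL as a placeholder.

package org.apache.spark.deploy.worker

import org.apache.spark.{SecurityManager, SparkConf}
import org.apache.spark.rpc.RpcEnv

object WatcherSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    val rpcEnv = RpcEnv.create("Driver", "localhost", 0, conf, new SecurityManager(conf))
    val workerUrl = "spark://Worker@worker-host:7078" // placeholder
    rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl))
    rpcEnv.awaitTermination() // exits with -1 if the worker endpoint disconnects
  }
}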
Example 190
Source File: HistoryServerArguments.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import scala.annotation.tailrec import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.util.Utils private[history] class HistoryServerArguments(conf: SparkConf, args: Array[String]) extends Logging { private var propertiesFile: String = null parse(args.toList) @tailrec private def parse(args: List[String]): Unit = { if (args.length == 1) { setLogDirectory(args.head) } else { args match { case ("--dir" | "-d") :: value :: tail => setLogDirectory(value) parse(tail) case ("--help" | "-h") :: tail => printUsageAndExit(0) case ("--properties-file") :: value :: tail => propertiesFile = value parse(tail) case Nil => case _ => printUsageAndExit(1) } } } private def setLogDirectory(value: String): Unit = { logWarning("Setting log directory through the command line is deprecated as of " + "Spark 1.1.0. Please set this through spark.history.fs.logDirectory instead.") conf.set("spark.history.fs.logDirectory", value) } // This mutates the SparkConf, so all accesses to it must be made after this line Utils.loadDefaultSparkProperties(conf, propertiesFile) private def printUsageAndExit(exitCode: Int) { // scalastyle:off println System.err.println( """ |Usage: HistoryServer [options] | |Options: | DIR Deprecated; set spark.history.fs.logDirectory directly | --dir DIR (-d DIR) Deprecated; set spark.history.fs.logDirectory directly | --properties-file FILE Path to a custom Spark properties file. | Default is conf/spark-defaults.conf. | |Configuration options can be set by setting the corresponding JVM system property. |History Server options are always available; additional options depend on the provider. | |History Server options: | | spark.history.ui.port Port where server will listen for connections | (default 18080) | spark.history.acls.enable Whether to enable view acls for all applications | (default false) | spark.history.provider Name of history provider class (defaults to | file system-based provider) | spark.history.retainedApplications Max number of application UIs to keep loaded in memory | (default 50) |FsHistoryProvider options: | | spark.history.fs.logDirectory Directory where app logs are stored | (default: file:/tmp/spark-events) | spark.history.fs.updateInterval How often to reload log data from storage | (in seconds, default: 10) |""".stripMargin) // scalastyle:on println System.exit(exitCode) } }
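A sketch of how the parser mutates the SparkConf it is given. The directory is a placeholder and, as the parser itself warns, the --dir form is deprecated in favour of setting spark.history.fs.logDirectory directly.

package org.apache.spark.deploy.history // the class is private[history]

import org.apache.spark.SparkConf

object HistoryArgsSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    new HistoryServerArguments(conf, Array("--dir", "/tmp/spark-events")) // placeholder path
    println(conf.get("spark.history.fs.logDirectory"))                    // set by the parser
  }
}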
Example 191
Source File: LocalSparkCluster.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkConf import org.apache.spark.deploy.master.Master import org.apache.spark.deploy.worker.Worker import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.Utils for (workerNum <- 1 to numWorkers) { val workerEnv = Worker.startRpcEnvAndEndpoint(localHostname, 0, 0, coresPerWorker, memoryPerWorker, masters, null, Some(workerNum), _conf) workerRpcEnvs += workerEnv } masters } def stop() { logInfo("Shutting down local Spark cluster.") // Stop the workers before the master so they don't get upset that it disconnected workerRpcEnvs.foreach(_.shutdown()) masterRpcEnvs.foreach(_.shutdown()) workerRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.foreach(_.awaitTermination()) masterRpcEnvs.clear() workerRpcEnvs.clear() } }
Example 192
Source File: SparkHadoopMapRedUtil.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mapred import java.io.IOException import org.apache.hadoop.mapreduce.{TaskAttemptContext => MapReduceTaskAttemptContext} import org.apache.hadoop.mapreduce.{OutputCommitter => MapReduceOutputCommitter} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.executor.CommitDeniedException import org.apache.spark.internal.Logging object SparkHadoopMapRedUtil extends Logging { def commitTask( committer: MapReduceOutputCommitter, mrTaskContext: MapReduceTaskAttemptContext, jobId: Int, splitId: Int): Unit = { val mrTaskAttemptID = mrTaskContext.getTaskAttemptID // Called after we have decided to commit def performCommit(): Unit = { try { committer.commitTask(mrTaskContext) logInfo(s"$mrTaskAttemptID: Committed") } catch { case cause: IOException => logError(s"Error committing the output of task: $mrTaskAttemptID", cause) committer.abortTask(mrTaskContext) throw cause } } // First, check whether the task's output has already been committed by some other attempt if (committer.needsTaskCommit(mrTaskContext)) { val shouldCoordinateWithDriver: Boolean = { val sparkConf = SparkEnv.get.conf // We only need to coordinate with the driver if there are concurrent task attempts. // Note that this could happen even when speculation is not enabled (e.g. see SPARK-8029). // This (undocumented) setting is an escape-hatch in case the commit code introduces bugs. sparkConf.getBoolean("spark.hadoop.outputCommitCoordination.enabled", defaultValue = true) } if (shouldCoordinateWithDriver) { val outputCommitCoordinator = SparkEnv.get.outputCommitCoordinator val taskAttemptNumber = TaskContext.get().attemptNumber() val stageId = TaskContext.get().stageId() val canCommit = outputCommitCoordinator.canCommit(stageId, splitId, taskAttemptNumber) if (canCommit) { performCommit() } else { val message = s"$mrTaskAttemptID: Not committed because the driver did not authorize commit" logInfo(message) // We need to abort the task so that the driver can reschedule new attempts, if necessary committer.abortTask(mrTaskContext) throw new CommitDeniedException(message, stageId, splitId, taskAttemptNumber) } } else { // Speculation is disabled or a user has chosen to manually bypass the commit coordination performCommit() } } else { // Some other attempt committed the output, so we do nothing and signal success logInfo(s"No need to commit output of task because needsTaskCommit=false: $mrTaskAttemptID") } } }
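A sketch of the task-side call, assuming a Hadoop OutputCommitter and TaskAttemptContext obtained from the output format; the surrounding write path is elided, the object and method names are made up, and the meaning given to jobId here is an assumption rather than something the snippet above pins down.

import org.apache.hadoop.mapreduce.{OutputCommitter, TaskAttemptContext}
import org.apache.spark.TaskContext
import org.apache.spark.mapred.SparkHadoopMapRedUtil

object CommitSketch {
  // Illustrative: `committer` and `mrTaskContext` would come from the output format in a real writer.
  def commitIfAllowed(committer: OutputCommitter, mrTaskContext: TaskAttemptContext, jobId: Int): Unit = {
    val splitId = TaskContext.get().partitionId()
    // Coordinates with the driver (when enabled) before committing this attempt's output;
    // throws CommitDeniedException if another attempt was authorized instead.
    SparkHadoopMapRedUtil.commitTask(committer, mrTaskContext, jobId, splitId)
  }
}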
Example 193
Source File: JobWaiter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.util.concurrent.atomic.AtomicInteger import scala.concurrent.{Future, Promise} import org.apache.spark.internal.Logging def cancel() { dagScheduler.cancelJob(jobId, None) } override def taskSucceeded(index: Int, result: Any): Unit = { // resultHandler call must be synchronized in case resultHandler itself is not thread safe. synchronized { resultHandler(index, result.asInstanceOf[T]) } if (finishedTasks.incrementAndGet() == totalTasks) { jobPromise.success(()) } } override def jobFailed(exception: Exception): Unit = { if (!jobPromise.tryFailure(exception)) { logWarning("Ignore failure", exception) } } }
Example 194
Source File: SparkUncaughtExceptionHandler.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.internal.Logging private[spark] class SparkUncaughtExceptionHandler(val exitOnUncaughtException: Boolean = true) extends Thread.UncaughtExceptionHandler with Logging { override def uncaughtException(thread: Thread, exception: Throwable) { try { // Make it explicit that uncaught exceptions are thrown when container is shutting down. // It will help users when they analyze the executor logs val inShutdownMsg = if (ShutdownHookManager.inShutdown()) "[Container in shutdown] " else "" val errMsg = "Uncaught exception in thread " logError(inShutdownMsg + errMsg + thread, exception) // We may have been called from a shutdown hook. If so, we must not call System.exit(). // (If we do, we will deadlock.) if (!ShutdownHookManager.inShutdown()) { if (exception.isInstanceOf[OutOfMemoryError]) { System.exit(SparkExitCode.OOM) } else if (exitOnUncaughtException) { System.exit(SparkExitCode.UNCAUGHT_EXCEPTION) } } } catch { case oom: OutOfMemoryError => Runtime.getRuntime.halt(SparkExitCode.OOM) case t: Throwable => Runtime.getRuntime.halt(SparkExitCode.UNCAUGHT_EXCEPTION_TWICE) } } def uncaughtException(exception: Throwable) { uncaughtException(Thread.currentThread(), exception) } }
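Installing the handler is a one-liner; the sketch below sets it as the JVM-wide default, which is effectively what Spark's daemons do at startup. The package line is only there because the class is private[spark].

package org.apache.spark.util // SparkUncaughtExceptionHandler is private[spark]

object HandlerSketch {
  def main(args: Array[String]): Unit = {
    // Any uncaught exception on any thread now goes through the handler above.
    Thread.setDefaultUncaughtExceptionHandler(
      new SparkUncaughtExceptionHandler(exitOnUncaughtException = false))
    new Thread(new Runnable {
      override def run(): Unit = throw new RuntimeException("boom") // logged; JVM stays up
    }).start()
  }
}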
Example 195
Source File: TopologyMapper.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.util.Utils @DeveloperApi class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging { val topologyFile = conf.getOption("spark.storage.replication.topologyFile") require(topologyFile.isDefined, "Please specify topology file via " + "spark.storage.replication.topologyFile for FileBasedTopologyMapper.") val topologyMap = Utils.getPropertiesFromFile(topologyFile.get) override def getTopologyForHost(hostname: String): Option[String] = { val topology = topologyMap.get(hostname) if (topology.isDefined) { logDebug(s"$hostname -> ${topology.get}") } else { logWarning(s"$hostname does not have any topology information") } topology } }
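A usage sketch for FileBasedTopologyMapper. Because the file is read with Utils.getPropertiesFromFile, it is a plain properties file mapping hostname to topology string; the path, hostnames and rack names below are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.storage.FileBasedTopologyMapper

object TopologySketch {
  def main(args: Array[String]): Unit = {
    // /tmp/topology.properties (placeholder) would contain lines such as:
    //   host-a = /rack1
    //   host-b = /rack2
    val conf = new SparkConf()
      .set("spark.storage.replication.topologyFile", "/tmp/topology.properties")
    val mapper = new FileBasedTopologyMapper(conf)
    println(mapper.getTopologyForHost("host-a")) // Some(/rack1), or None with a warning if unknown
  }
}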
Example 196
Source File: BlockManagerSlaveEndpoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{MapOutputTracker, SparkEnv} import org.apache.spark.internal.Logging import org.apache.spark.rpc.{RpcCallContext, RpcEnv, ThreadSafeRpcEndpoint} import org.apache.spark.storage.BlockManagerMessages._ import org.apache.spark.util.{ThreadUtils, Utils} private[storage] class BlockManagerSlaveEndpoint( override val rpcEnv: RpcEnv, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends ThreadSafeRpcEndpoint with Logging { private val asyncThreadPool = ThreadUtils.newDaemonCachedThreadPool("block-manager-slave-async-thread-pool") private implicit val asyncExecutionContext = ExecutionContext.fromExecutorService(asyncThreadPool) // Operations that involve removing blocks may be slow and should be done asynchronously override def receiveAndReply(context: RpcCallContext): PartialFunction[Any, Unit] = { case RemoveBlock(blockId) => doAsync[Boolean]("removing block " + blockId, context) { blockManager.removeBlock(blockId) true } case RemoveRdd(rddId) => doAsync[Int]("removing RDD " + rddId, context) { blockManager.removeRdd(rddId) } case RemoveShuffle(shuffleId) => doAsync[Boolean]("removing shuffle " + shuffleId, context) { if (mapOutputTracker != null) { mapOutputTracker.unregisterShuffle(shuffleId) } SparkEnv.get.shuffleManager.unregisterShuffle(shuffleId) } case RemoveBroadcast(broadcastId, _) => doAsync[Int]("removing broadcast " + broadcastId, context) { blockManager.removeBroadcast(broadcastId, tellMaster = true) } case GetBlockStatus(blockId, _) => context.reply(blockManager.getStatus(blockId)) case GetMatchingBlockIds(filter, _) => context.reply(blockManager.getMatchingBlockIds(filter)) case TriggerThreadDump => context.reply(Utils.getThreadDump()) case ReplicateBlock(blockId, replicas, maxReplicas) => context.reply(blockManager.replicateBlock(blockId, replicas.toSet, maxReplicas)) } private def doAsync[T](actionMessage: String, context: RpcCallContext)(body: => T) { val future = Future { logDebug(actionMessage) body } future.foreach { response => logDebug(s"Done $actionMessage, response is $response") context.reply(response) logDebug(s"Sent response: $response to ${context.senderAddress}") } future.failed.foreach { t => logError(s"Error in $actionMessage", t) context.sendFailure(t) } } override def onStop(): Unit = { asyncThreadPool.shutdownNow() } }
Example 197
Source File: OrderedRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
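filterByRange only prunes partitions when the RDD is already range-partitioned (for example after sortByKey); otherwise the match above falls back to a plain filter over the whole RDD. A short local-mode usage sketch:

import org.apache.spark.{SparkConf, SparkContext}

object FilterByRangeSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sketch").setMaster("local[*]"))
    val sorted = sc.parallelize(Seq(5 -> "e", 1 -> "a", 9 -> "i", 3 -> "c")).sortByKey()
    // Range-partitioned input, so only partitions that can contain keys 2..6 are scanned.
    sorted.filterByRange(2, 6).collect().foreach(println) // (3,c) and (5,e)
    sc.stop()
  }
}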
Example 198
Source File: SequenceFileRDDFunctions.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.hadoop.io.Writable import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.mapred.JobConf import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.internal.Logging def saveAsSequenceFile( path: String, codec: Option[Class[_ <: CompressionCodec]] = None): Unit = self.withScope { def anyToWritable[U <% Writable](u: U): Writable = u // TODO We cannot force the return type of `anyToWritable` be same as keyWritableClass and // valueWritableClass at the compile time. To implement that, we need to add type parameters to // SequenceFileRDDFunctions. however, SequenceFileRDDFunctions is a public class so it will be a // breaking change. val convertKey = self.keyClass != _keyWritableClass val convertValue = self.valueClass != _valueWritableClass logInfo("Saving as sequence file of type " + s"(${_keyWritableClass.getSimpleName},${_valueWritableClass.getSimpleName})" ) val format = classOf[SequenceFileOutputFormat[Writable, Writable]] val jobConf = new JobConf(self.context.hadoopConfiguration) if (!convertKey && !convertValue) { self.saveAsHadoopFile(path, _keyWritableClass, _valueWritableClass, format, jobConf, codec) } else if (!convertKey && convertValue) { self.map(x => (x._1, anyToWritable(x._2))).saveAsHadoopFile( path, _keyWritableClass, _valueWritableClass, format, jobConf, codec) } else if (convertKey && !convertValue) { self.map(x => (anyToWritable(x._1), x._2)).saveAsHadoopFile( path, _keyWritableClass, _valueWritableClass, format, jobConf, codec) } else if (convertKey && convertValue) { self.map(x => (anyToWritable(x._1), anyToWritable(x._2))).saveAsHadoopFile( path, _keyWritableClass, _valueWritableClass, format, jobConf, codec) } } }
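saveAsSequenceFile is available on pair RDDs whose key and value types convert to Hadoop Writables; a short local-mode sketch with placeholder output paths, including the optional compression codec argument shown above.

import org.apache.hadoop.io.compress.GzipCodec
import org.apache.spark.{SparkConf, SparkContext}

object SequenceFileSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("sketch").setMaster("local[*]"))
    val pairs = sc.parallelize(Seq(1 -> "a", 2 -> "b"))
    pairs.saveAsSequenceFile("/tmp/seq-plain")                        // keys/values become IntWritable/Text
    pairs.saveAsSequenceFile("/tmp/seq-gz", Some(classOf[GzipCodec])) // compressed variant
    sc.stop()
  }
}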
Example 199
Source File: SparkFunSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark // scalastyle:off import java.io.File import org.scalatest.{BeforeAndAfterAll, FunSuite, Outcome} import org.apache.spark.internal.Logging import org.apache.spark.util.AccumulatorContext final protected override def withFixture(test: NoArgTest): Outcome = { val testName = test.text val suiteName = this.getClass.getName val shortSuiteName = suiteName.replaceAll("org.apache.spark", "o.a.s") try { logInfo(s"\n\n===== TEST OUTPUT FOR $shortSuiteName: '$testName' =====\n") test() } finally { logInfo(s"\n\n===== FINISHED $shortSuiteName: '$testName' =====\n") } } }
Example 200
Source File: StarryClosureCleaner.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.util import org.apache.spark.internal.Logging import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable object StarryClosureCleaner extends Logging { val serializableMap: LRUCache[String, Boolean] = new LRUCache[String, Boolean](100000) // Check whether a class represents a Scala closure private def isClosure(cls: Class[_]): Boolean = { cls.getName.contains("$anonfun$") } def clean( closure: AnyRef, checkSerializable: Boolean = true, cleanTransitively: Boolean = true): Unit = { clean(closure, checkSerializable, cleanTransitively, mutable.Map.empty) } private def clean( func: AnyRef, checkSerializable: Boolean, cleanTransitively: Boolean, accessedFields: mutable.Map[Class[_], mutable.Set[String]]): Unit = { if (func == null) { return } if (!isClosure(func.getClass)) { logWarning("Expected a closure; got " + func.getClass.getName) return } if (checkSerializable) { ensureSerializable(func) } } private def ensureSerializable(func: AnyRef) { if (!serializableMap.containsKey(func.getClass.getCanonicalName)) { try { if (SparkEnv.get != null) { SparkEnv.get.closureSerializer.newInstance().serialize(func) serializableMap.put(func.getClass.getCanonicalName, true) } } catch { case ex: Exception => throw new SparkException("Task not serializable", ex) } } } case class LRUCache[K, V](cacheSize: Int) extends util.LinkedHashMap[K, V] { override def removeEldestEntry(eldest: util.Map.Entry[K, V]): Boolean = size > cacheSize } }
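The object above mostly verifies that a closure is serializable and caches the verdict by class name, so calling it looks like the sketch below. The serializability check only runs when a live SparkEnv exists, and whether the lambda passes the "$anonfun$" name check depends on how the Scala version compiles closures; both caveats are worth keeping in mind when reading this.

import org.apache.spark.util.StarryClosureCleaner

object CleanerSketch {
  def main(args: Array[String]): Unit = {
    val factor = 3
    val f = (x: Int) => x * factor // a Scala closure capturing `factor`
    // Throws SparkException("Task not serializable") if the closure cannot be serialized.
    StarryClosureCleaner.clean(f)
    println(f(2))
  }
}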