org.apache.spark.annotation.DeveloperApi Scala Examples
The following examples show how to use org.apache.spark.annotation.DeveloperApi.
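In Spark, @DeveloperApi marks lower-level APIs that are public but may change or be removed in minor releases, so the annotation documents stability rather than altering behavior. As a minimal sketch before the real-world examples below, applying it to your own code looks like this (the class and method names are hypothetical, not taken from any of the projects listed):

import org.apache.spark.annotation.DeveloperApi

// Hypothetical extension point: the annotation only signals that this API is
// low-level and may change between Spark releases; it has no runtime effect.
@DeveloperApi
class CustomMetricSink {

  // Individual members can carry the annotation as well, as several examples below do
  // for methods such as transformSchema or mapPartitionsWithInputSplit.
  @DeveloperApi
  def record(name: String, value: Long): Unit = {
    println(s"$name = $value")
  }
}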
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 2
Source File: OapListener.scala From OAP with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.oap.listener

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.oap.OapRuntime

@DeveloperApi
case class SparkListenerCustomInfoUpdate(
    hostName: String,
    executorId: String,
    clazzName: String,
    customizedInfo: String) extends SparkListenerEvent {
  override def logEvent: Boolean = false
}

class OapListener extends SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case customInfo: SparkListenerCustomInfoUpdate =>
      if (customInfo.clazzName.contains("OapFiberCacheHeartBeatMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateLocations(customInfo)
      } else if (customInfo.clazzName.contains("FiberCacheManagerMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateMetrics(customInfo)
      }
    case _ =>
  }
}
Example 3
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
Example 4
Source File: AttributeType.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi

  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
}
Example 5
Source File: Transformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val transformUDF = udf(this.createTransformFunc, outputDataType)
    dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
}
Example 6
Source File: LogLoss.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
Example 7
Source File: Predict.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}

@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
}
Example 8
Source File: Entropy.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.impurity

import org.apache.spark.annotation.{DeveloperApi, Since}

  override def prob(label: Double): Double = {
    val lbl = label.toInt
    require(lbl < stats.length,
      s"EntropyCalculator.prob given invalid label: $lbl (should be < ${stats.length})")
    require(lbl >= 0, "Entropy does not support negative labels")
    val cnt = count
    if (cnt == 0) {
      0
    } else {
      stats(lbl) / cnt
    }
  }

  override def toString: String = s"EntropyCalculator(stats = [${stats.mkString(", ")}])"
}
Example 9
Source File: Gini.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.impurity

import org.apache.spark.annotation.{DeveloperApi, Since}

  override def prob(label: Double): Double = {
    val lbl = label.toInt
    require(lbl < stats.length,
      s"GiniCalculator.prob given invalid label: $lbl (should be < ${stats.length})")
    require(lbl >= 0, "GiniImpurity does not support negative labels")
    val cnt = count
    if (cnt == 0) {
      0
    } else {
      stats(lbl) / cnt
    }
  }

  override def toString: String = s"GiniCalculator(stats = [${stats.mkString(", ")}])"
}
Example 10
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors}
import org.apache.spark.mllib.util.MLUtils

  private def calculateCovarianceConstants: (DBM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = MLUtils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
Example 11
Source File: Updater.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization

import scala.math._

import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV}

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.mllib.linalg.{Vector, Vectors}

@DeveloperApi
class SquaredL2Updater extends Updater {
  override def compute(
      weightsOld: Vector,
      gradient: Vector,
      stepSize: Double,
      iter: Int,
      regParam: Double): (Vector, Double) = {
    // add up both updates from the gradient of the loss (= step) as well as
    // the gradient of the regularizer (= regParam * weightsOld)
    // w' = w - thisIterStepSize * (gradient + regParam * w)
    // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient
    val thisIterStepSize = stepSize / math.sqrt(iter)
    val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector
    brzWeights :*= (1.0 - thisIterStepSize * regParam)
    brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights)
    val norm = brzNorm(brzWeights, 2.0)

    (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm)
  }
}
Example 12
Source File: MFDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import java.{util => ju}

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.{BLAS, DenseMatrix}
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object MFDataGenerator {
  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: MFDataGenerator " +
        "<master> <outputDir> [m] [n] [rank] [trainSampFact] [noise] [sigma] [test] [testSampFact]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val m: Int = if (args.length > 2) args(2).toInt else 100
    val n: Int = if (args.length > 3) args(3).toInt else 100
    val rank: Int = if (args.length > 4) args(4).toInt else 10
    val trainSampFact: Double = if (args.length > 5) args(5).toDouble else 1.0
    val noise: Boolean = if (args.length > 6) args(6).toBoolean else false
    val sigma: Double = if (args.length > 7) args(7).toDouble else 0.1
    val test: Boolean = if (args.length > 8) args(8).toBoolean else false
    val testSampFact: Double = if (args.length > 9) args(9).toDouble else 0.1

    val sc = new SparkContext(sparkMaster, "MFDataGenerator")

    val random = new ju.Random(42L)

    val A = DenseMatrix.randn(m, rank, random)
    val B = DenseMatrix.randn(rank, n, random)
    val z = 1 / math.sqrt(rank)
    val fullData = DenseMatrix.zeros(m, n)
    BLAS.gemm(z, A, B, 1.0, fullData)

    val df = rank * (m + n - rank)
    val sampSize = math.min(math.round(trainSampFact * df), math.round(.99 * m * n)).toInt
    val rand = new Random()
    val mn = m * n
    val shuffled = rand.shuffle((0 until mn).toList)

    val omega = shuffled.slice(0, sampSize)
    val ordered = omega.sortWith(_ < _).toArray
    val trainData: RDD[(Int, Int, Double)] = sc.parallelize(ordered)
      .map(x => (x % m, x / m, fullData.values(x)))

    // optionally add gaussian noise
    if (noise) {
      trainData.map(x => (x._1, x._2, x._3 + rand.nextGaussian * sigma))
    }

    trainData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)

    // optionally generate testing data
    if (test) {
      val testSampSize = math.min(math.round(sampSize * testSampFact).toInt, mn - sampSize)
      val testOmega = shuffled.slice(sampSize, sampSize + testSampSize)
      val testOrdered = testOmega.sortWith(_ < _).toArray
      val testData: RDD[(Int, Int, Double)] = sc.parallelize(testOrdered)
        .map(x => (x % m, x / m, fullData.values(x)))
      testData.map(x => x._1 + "," + x._2 + "," + x._3).saveAsTextFile(outputPath)
    }

    sc.stop()
  }
}
Example 13
Source File: DataValidators.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
Example 14
Source File: KMeansDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.rdd.RDD @Since("0.8.0") def generateKMeansRDD( sc: SparkContext, numPoints: Int, k: Int, d: Int, r: Double, numPartitions: Int = 2) : RDD[Array[Double]] = { // First, generate some centers val rand = new Random(42) val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r)) // Then generate points around each center sc.parallelize(0 until numPoints, numPartitions).map { idx => val center = centers(idx % k) val rand2 = new Random(42 + idx) Array.tabulate(d)(i => center(i) + rand2.nextGaussian()) } } @Since("0.8.0") def main(args: Array[String]) { if (args.length < 6) { // scalastyle:off println println("Usage: KMeansGenerator " + "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]") // scalastyle:on println System.exit(1) } val sparkMaster = args(0) val outputPath = args(1) val numPoints = args(2).toInt val k = args(3).toInt val d = args(4).toInt val r = args(5).toDouble val parts = if (args.length >= 7) args(6).toInt else 2 val sc = new SparkContext(sparkMaster, "KMeansDataGenerator") val data = generateKMeansRDD(sc, numPoints, k, d, r, parts) data.map(_.mkString(" ")).saveAsTextFile(outputPath) System.exit(0) } }
Example 15
Source File: LogisticRegressionDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import scala.util.Random

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  @Since("0.8.0")
  def generateLogisticRDD(
      sc: SparkContext,
      nexamples: Int,
      nfeatures: Int,
      eps: Double,
      nparts: Int = 2,
      probOne: Double = 0.5): RDD[LabeledPoint] = {
    val data = sc.parallelize(0 until nexamples, nparts).map { idx =>
      val rnd = new Random(42 + idx)

      val y = if (idx % 2 == 0) 0.0 else 1.0
      val x = Array.fill[Double](nfeatures) {
        rnd.nextGaussian() + (y * eps)
      }
      LabeledPoint(y, Vectors.dense(x))
    }
    data
  }

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length != 5) {
      // scalastyle:off println
      println("Usage: LogisticRegressionGenerator " +
        "<master> <output_dir> <num_examples> <num_features> <num_partitions>")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2
    val eps = 3

    val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator")
    val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts)

    data.saveAsTextFile(outputPath)
    sc.stop()
  }
}
Example 16
Source File: SVMDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import scala.util.Random

import com.github.fommil.netlib.BLAS.{getInstance => blas}

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object SVMDataGenerator {

  @Since("0.8.0")
  def main(args: Array[String]) {
    if (args.length < 2) {
      // scalastyle:off println
      println("Usage: SVMGenerator " +
        "<master> <output_dir> [num_examples] [num_features] [num_partitions]")
      // scalastyle:on println
      System.exit(1)
    }

    val sparkMaster: String = args(0)
    val outputPath: String = args(1)
    val nexamples: Int = if (args.length > 2) args(2).toInt else 1000
    val nfeatures: Int = if (args.length > 3) args(3).toInt else 2
    val parts: Int = if (args.length > 4) args(4).toInt else 2

    val sc = new SparkContext(sparkMaster, "SVMGenerator")

    val globalRnd = new Random(94720)
    val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian())

    val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx =>
      val rnd = new Random(42 + idx)

      val x = Array.fill[Double](nfeatures) {
        rnd.nextDouble() * 2.0 - 1.0
      }
      val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1
      val y = if (yD < 0) 0.0 else 1.0
      LabeledPoint(y, Vectors.dense(x))
    }

    data.saveAsTextFile(outputPath)

    sc.stop()
  }
}
Example 17
Source File: SparkCommandLine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {

  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
}
Example 18
Source File: SparkPlanInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo
import org.apache.spark.util.Utils

@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {
  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      plan.metadata, metrics)
  }
}
Example 19
Source File: StreamingListener.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.Queue

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Distribution

@DeveloperApi
class StatsReportListener(numBatchInfos: Int = 10) extends StreamingListener {
  // Queue containing latest completed batches
  val batchInfos = new Queue[BatchInfo]()

  override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted) {
    batchInfos.enqueue(batchStarted.batchInfo)
    if (batchInfos.size > numBatchInfos) batchInfos.dequeue()
    printStats()
  }

  def printStats() {
    showMillisDistribution("Total delay: ", _.totalDelay)
    showMillisDistribution("Processing time: ", _.processingDelay)
  }

  def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]) {
    org.apache.spark.scheduler.StatsReportListener.showMillisDistribution(
      heading, extractDistribution(getMetric))
  }

  def extractDistribution(getMetric: BatchInfo => Option[Long]): Option[Distribution] = {
    Distribution(batchInfos.flatMap(getMetric(_)).map(_.toDouble))
  }
}
Example 20
Source File: ReceiverInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
  ) {
}
Example 21
Source File: SerializableWritable.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 22
Source File: JavaNewHadoopRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 23
Source File: JavaHadoopRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 24
Source File: RecoveryModeFactory.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
}
Example 25
Source File: EnvironmentTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}

@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
}
Example 26
Source File: StorageTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.storage._
import org.apache.spark.ui._

  private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = {
    val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet
    val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) }
    StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList)
  }

  override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized {
    val rddInfos = stageSubmitted.stageInfo.rddInfos
    rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name }
  }

  override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized {
    // Remove all partitions that are no longer cached in current completed stage
    val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet
    _rddInfoMap.retain { case (id, info) =>
      !completedRddIds.contains(id) || info.numCachedPartitions > 0
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    _rddInfoMap.remove(unpersistRDD.rddId)
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    super.onBlockUpdated(blockUpdated)
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateRDDInfo(Seq((blockId, blockStatus)))
  }
}
Example 27
Source File: JavaSerializer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import java.io._
import java.nio.ByteBuffer

import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.{ByteBufferInputStream, ByteBufferOutputStream, Utils}

private[spark] class JavaSerializationStream(
    out: OutputStream, counterReset: Int, extraDebugInfo: Boolean)
  extends SerializationStream {
  private val objOut = new ObjectOutputStream(out)
  private var counter = 0

@DeveloperApi
class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable {
  private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100)
  private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true)

  protected def this() = this(new SparkConf())  // For deserialization only

  override def newInstance(): SerializerInstance = {
    val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader)
    new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader)
  }

  override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
    out.writeInt(counterReset)
    out.writeBoolean(extraDebugInfo)
  }

  override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
    counterReset = in.readInt()
    extraDebugInfo = in.readBoolean()
  }
}
Example 28
Source File: StageInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.RDDInfo

  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences)
  }
}
Example 29
Source File: AccumulableInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi

object AccumulableInfo {

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(
      id: Long,
      name: String,
      update: Option[String],
      value: String,
      internal: Boolean): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), update, Option(value), internal = false, countFailedValues = false)
  }

  @deprecated("do not create AccumulableInfo", "2.0.0")
  def apply(id: Long, name: String, value: String): AccumulableInfo = {
    new AccumulableInfo(
      id, Option(name), None, Option(value), internal = false, countFailedValues = false)
  }
}
Example 30
Source File: SplitInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import collection.mutable.ArrayBuffer

import org.apache.spark.annotation.DeveloperApi

// information about a specific split instance : handles both split instances.
// So that we do not need to worry about the differences.
@DeveloperApi
class SplitInfo(
    val inputFormatClazz: Class[_],
    val hostLocation: String,
    val path: String,
    val length: Long,
    val underlyingSplit: Any) {
  override def toString(): String = {
    "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz +
      ", hostLocation : " + hostLocation + ", path : " + path +
      ", length : " + length + ", underlyingSplit " + underlyingSplit
  }

  override def hashCode(): Int = {
    var hashCode = inputFormatClazz.hashCode
    hashCode = hashCode * 31 + hostLocation.hashCode
    hashCode = hashCode * 31 + path.hashCode
    // ignore overflow ? It is hashcode anyway !
    hashCode = hashCode * 31 + (length & 0x7fffffff).toInt
    hashCode
  }

  // This is practically useless since most of the Split impl's don't seem to implement equals :-(
  // So unless there is identity equality between underlyingSplits, it will always fail even if it
  // is pointing to same block.
  override def equals(other: Any): Boolean = other match {
    case that: SplitInfo =>
      this.hostLocation == that.hostLocation &&
        this.inputFormatClazz == that.inputFormatClazz &&
        this.path == that.path &&
        this.length == that.length &&
        // other split specific checks (like start for FileSplit)
        this.underlyingSplit == that.underlyingSplit
    case _ => false
  }
}

object SplitInfo {
  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapredSplit.getLength
    for (host <- mapredSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit)
    }
    retval
  }

  def toSplitInfo(inputFormatClazz: Class[_], path: String,
                  mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = {
    val retval = new ArrayBuffer[SplitInfo]()
    val length = mapreduceSplit.getLength
    for (host <- mapreduceSplit.getLocations) {
      retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit)
    }
    retval
  }
}
Example 31
Source File: TaskInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.ListBuffer

import org.apache.spark.TaskState
import org.apache.spark.TaskState.TaskState
import org.apache.spark.annotation.DeveloperApi

  var finishTime: Long = 0

  var failed = false

  var killed = false

  private[spark] def markGettingResult(time: Long = System.currentTimeMillis) {
    gettingResultTime = time
  }

  private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) {
    finishTime = time
    if (state == TaskState.FAILED) {
      failed = true
    } else if (state == TaskState.KILLED) {
      killed = true
    }
  }

  def gettingResult: Boolean = gettingResultTime != 0

  def finished: Boolean = finishTime != 0

  def successful: Boolean = finished && !failed && !killed

  def running: Boolean = !finished

  def status: String = {
    if (running) {
      if (gettingResult) {
        "GET RESULT"
      } else {
        "RUNNING"
      }
    } else if (failed) {
      "FAILED"
    } else if (killed) {
      "KILLED"
    } else if (successful) {
      "SUCCESS"
    } else {
      "UNKNOWN"
    }
  }

  def id: String = s"$index.$attemptNumber"

  def duration: Long = {
    if (!finished) {
      throw new UnsupportedOperationException("duration() called on unfinished task")
    } else {
      finishTime - launchTime
    }
  }

  private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime
}
Example 32
Source File: ExecutorInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class ExecutorInfo(
    val executorHost: String,
    val totalCores: Int,
    val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
}
Example 33
Source File: taskListeners.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.util.EventListener

import org.apache.spark.TaskContext
import org.apache.spark.annotation.DeveloperApi

private[spark] class TaskCompletionListenerException(
    errorMessages: Seq[String],
    val previousError: Option[Throwable] = None)
  extends RuntimeException {

  override def getMessage: String = {
    if (errorMessages.size == 1) {
      errorMessages.head
    } else {
      errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n")
    } +
    previousError.map { e =>
      "\n\nPrevious exception in task: " + e.getMessage + "\n" +
        e.getStackTrace.mkString("\t", "\n\t", "")
    }.getOrElse("")
  }
}
Example 34
Source File: Dependency.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import scala.reflect.ClassTag

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle

@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): List[Int] = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
}
Example 35
Source File: StorageStatusListener.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._

  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus

      // Try to remove the dead storage status if same executor register the block manager twice.
      deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId)
        .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2))
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId).foreach { status =>
        deadExecutorStorageStatus += status
      }
      if (deadExecutorStorageStatus.size > retainedDeadExecutors) {
        deadExecutorStorageStatus.trimStart(1)
      }
    }
  }

  override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = {
    val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId
    val blockId = blockUpdated.blockUpdatedInfo.blockId
    val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel
    val memSize = blockUpdated.blockUpdatedInfo.memSize
    val diskSize = blockUpdated.blockUpdatedInfo.diskSize
    val blockStatus = BlockStatus(storageLevel, memSize, diskSize)
    updateStorageStatus(executorId, Seq((blockId, blockStatus)))
  }
}
Example 36
Source File: RDDInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    var name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val callSite: String = "",
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length,
      rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope)
  }
}
Example 37
Source File: BlockUpdatedInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo

@DeveloperApi
case class BlockUpdatedInfo(
    blockManagerId: BlockManagerId,
    blockId: BlockId,
    storageLevel: StorageLevel,
    memSize: Long,
    diskSize: Long)

private[spark] object BlockUpdatedInfo {

  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
    BlockUpdatedInfo(
      updateBlockInfo.blockManagerId,
      updateBlockInfo.blockId,
      updateBlockInfo.storageLevel,
      updateBlockInfo.memSize,
      updateBlockInfo.diskSize)
  }
}
Example 38
Source File: BlockId.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import java.util.UUID

import org.apache.spark.annotation.DeveloperApi

  def apply(id: String): BlockId = id match {
    case RDD(rddId, splitIndex) =>
      RDDBlockId(rddId.toInt, splitIndex.toInt)
    case SHUFFLE(shuffleId, mapId, reduceId) =>
      ShuffleBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case SHUFFLE_DATA(shuffleId, mapId, reduceId) =>
      ShuffleDataBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case SHUFFLE_INDEX(shuffleId, mapId, reduceId) =>
      ShuffleIndexBlockId(shuffleId.toInt, mapId.toInt, reduceId.toInt)
    case BROADCAST(broadcastId, field) =>
      BroadcastBlockId(broadcastId.toLong, field.stripPrefix("_"))
    case TASKRESULT(taskId) =>
      TaskResultBlockId(taskId.toLong)
    case STREAM(streamId, uniqueId) =>
      StreamBlockId(streamId.toInt, uniqueId.toLong)
    case TEST(value) =>
      TestBlockId(value)
    case _ =>
      throw new IllegalStateException("Unrecognized BlockId: " + id)
  }
}
Example 39
Source File: BlockManagerId.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

  def apply(
      execId: String,
      host: String,
      port: Int,
      topologyInfo: Option[String] = None): BlockManagerId =
    getCachedBlockManagerId(new BlockManagerId(execId, host, port, topologyInfo))

  def apply(in: ObjectInput): BlockManagerId = {
    val obj = new BlockManagerId()
    obj.readExternal(in)
    getCachedBlockManagerId(obj)
  }

  val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()

  def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = {
    blockManagerIdCache.putIfAbsent(id, id)
    blockManagerIdCache.get(id)
  }
}
Example 40
Source File: TopologyMapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

@DeveloperApi
class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging {
  val topologyFile = conf.getOption("spark.storage.replication.topologyFile")
  require(topologyFile.isDefined, "Please specify topology file via " +
    "spark.storage.replication.topologyFile for FileBasedTopologyMapper.")
  val topologyMap = Utils.getPropertiesFromFile(topologyFile.get)

  override def getTopologyForHost(hostname: String): Option[String] = {
    val topology = topologyMap.get(hostname)
    if (topology.isDefined) {
      logDebug(s"$hostname -> ${topology.get}")
    } else {
      logWarning(s"$hostname does not have any topology information")
    }
    topology
  }
}
Example 41
Source File: BlockReplicationPolicy.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import scala.collection.mutable
import scala.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

  private def getSampleIds(n: Int, m: Int, r: Random): List[Int] = {
    val indices = (n - m + 1 to n).foldLeft(Set.empty[Int]) { case (set, i) =>
      val t = r.nextInt(i) + 1
      if (set.contains(t)) set + i else set + t
    }
    // we shuffle the result to ensure a random arrangement within the sample
    // to avoid any bias from set implementations
    r.shuffle(indices.map(_ - 1).toList)
  }
}
Example 42
Source File: ShuffleWriteMetrics.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.LongAccumulator

  def writeTime: Long = _writeTime.sum

  private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v)
  private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v)
  private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v)
  private[spark] def decBytesWritten(v: Long): Unit = {
    _bytesWritten.setValue(bytesWritten - v)
  }
  private[spark] def decRecordsWritten(v: Long): Unit = {
    _recordsWritten.setValue(recordsWritten - v)
  }

  // Legacy methods for backward compatibility.
  // TODO: remove these once we make this class private.
  @deprecated("use bytesWritten instead", "2.0.0")
  def shuffleBytesWritten: Long = bytesWritten
  @deprecated("use writeTime instead", "2.0.0")
  def shuffleWriteTime: Long = writeTime
  @deprecated("use recordsWritten instead", "2.0.0")
  def shuffleRecordsWritten: Long = recordsWritten
}
Example 43
Source File: InterruptibleIterator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
}
Example 44
Source File: ShuffledRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx
  override def hashCode(): Int = index
  override def equals(other: Any): Boolean = super.equals(other)
}

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
Example 45
Source File: OrderedRDDFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) =>
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      case _ =>
        self
    }
    rddToFilter.filter { case (k, v) => inRange(k) }
  }
}
Example 46
Source File: UnionRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport}
import scala.concurrent.forkjoin.ForkJoinPool
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient private val rdd: RDD[T],
    val parentRddIndex: Int,
    @transient private val parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

object UnionRDD {
  private[spark] lazy val partitionEvalTaskSupport =
    new ForkJoinTaskSupport(new ForkJoinPool(8))
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  // visible for testing
  private[spark] val isPartitionListingParallel: Boolean =
    rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10)

  override def getPartitions: Array[Partition] = {
    val parRDDs = if (isPartitionListingParallel) {
      val parArray = rdds.par
      parArray.tasksupport = UnionRDD.partitionEvalTaskSupport
      parArray
    } else {
      rdds
    }
    val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
Example 47
Source File: HBaseSQLTableScan.scala From Backup-Repo with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hbase.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical.RangePartitioning
import org.apache.spark.sql.execution.LeafNode
import org.apache.spark.sql.hbase._

@DeveloperApi
case class HBaseSQLTableScan(
    relation: HBaseRelation,
    output: Seq[Attribute],
    result: RDD[Row]) extends LeafNode {

  override def outputPartitioning = {
    var ordering = List[SortOrder]()
    for (key <- relation.partitionKeys) {
      ordering = ordering :+ SortOrder(key, Ascending)
    }
    RangePartitioning(ordering.toSeq, relation.partitions.size)
  }

  override protected def doExecute(): RDD[Row] = result
}
Example 48
Source File: interfaces.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions._

@DeveloperApi
trait PushDownAggregateScan {
  var aggregateExpressions: Seq[NamedExpression] = null
  var groupingExpressions: Seq[Expression] = null
  var orders: Seq[SortOrder] = Seq.empty[SortOrder]
  var limit: Int = 20

  def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row]

  def setAggregateExpressions(ae: Seq[NamedExpression]): Unit = {
    aggregateExpressions = ae
  }

  def setGroupingExpressions(ge: Seq[Expression]): Unit = {
    groupingExpressions = ge
  }

  def setOrders(o: Seq[SortOrder]): Unit = {
    orders = o
  }

  def setLimit(l: Int): Unit = {
    limit = l
  }
}
Example 49
Source File: SparkPlanInfo.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.sql.execution.exchange.ReusedExchangeExec
import org.apache.spark.sql.execution.metric.SQLMetricInfo

@DeveloperApi
class SparkPlanInfo(
    val nodeName: String,
    val simpleString: String,
    val children: Seq[SparkPlanInfo],
    val metadata: Map[String, String],
    val metrics: Seq[SQLMetricInfo]) {

  override def hashCode(): Int = {
    // hashCode of simpleString should be good enough to distinguish the plans from each other
    // within a plan
    simpleString.hashCode
  }

  override def equals(other: Any): Boolean = other match {
    case o: SparkPlanInfo =>
      nodeName == o.nodeName && simpleString == o.simpleString && children == o.children
    case _ => false
  }
}

private[execution] object SparkPlanInfo {

  def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = {
    val children = plan match {
      case ReusedExchangeExec(_, child) => child :: Nil
      case _ => plan.children ++ plan.subqueries
    }
    val metrics = plan.metrics.toSeq.map { case (key, metric) =>
      new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType)
    }

    // dump the file scan metadata (e.g file path) to event log
    val metadata = plan match {
      case fileScan: FileSourceScanExec => fileScan.metadata
      case _ => Map[String, String]()
    }
    new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan),
      metadata, metrics)
  }
}
Example 50
Source File: SQLListener.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.ui import com.fasterxml.jackson.databind.JavaType import com.fasterxml.jackson.databind.`type`.TypeFactory import com.fasterxml.jackson.databind.annotation.JsonDeserialize import com.fasterxml.jackson.databind.util.Converter import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.sql.execution.SparkPlanInfo import org.apache.spark.sql.execution.metric._ @DeveloperApi case class SparkListenerSQLExecutionStart( executionId: Long, description: String, details: String, physicalPlanDescription: String, sparkPlanInfo: SparkPlanInfo, time: Long) extends SparkListenerEvent @DeveloperApi case class SparkListenerSQLExecutionEnd(executionId: Long, time: Long) extends SparkListenerEvent private class LongLongTupleConverter extends Converter[(Object, Object), (Long, Long)] { override def convert(in: (Object, Object)): (Long, Long) = { def toLong(a: Object): Long = a match { case i: java.lang.Integer => i.intValue() case l: java.lang.Long => l.longValue() } (toLong(in._1), toLong(in._2)) } override def getInputType(typeFactory: TypeFactory): JavaType = { val objectType = typeFactory.uncheckedSimpleType(classOf[Object]) typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(objectType, objectType)) } override def getOutputType(typeFactory: TypeFactory): JavaType = { val longType = typeFactory.uncheckedSimpleType(classOf[Long]) typeFactory.constructSimpleType(classOf[(_, _)], classOf[(_, _)], Array(longType, longType)) } }
Example 51
Source File: ExtendableHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.ParserDialect import org.apache.spark.sql.catalyst.analysis.{Analyzer, _} import org.apache.spark.sql.catalyst.optimizer.Optimizer import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.execution.{CacheManager, ExtractPythonUDFs} import org.apache.spark.sql.extension._ import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} import org.apache.spark.sql.sources.commands.hive.HiveEmulationCatalog @transient override protected[sql] lazy val analyzer: Analyzer = new Analyzer(catalog, functionRegistry, conf) { override val extendedResolutionRules = resolutionRules(this) ++ (catalog.ParquetConversions :: catalog.CreateTables :: catalog.PreInsertionCasts :: ExtractPythonUDFs :: ResolveHiveWindowFunction :: PreInsertCastAndRename :: Nil) override val extendedCheckRules = ExtendableHiveContext.this.extendedCheckRules(this) } @transient override protected[sql] lazy val optimizer: Optimizer = OptimizerFactory.produce( earlyBatches = optimizerEarlyBatches, mainBatchRules = optimizerMainBatchRules, postBatches = optimizerPostBatches ) @transient override protected[sql] val planner: SparkPlanner with HiveStrategies = new SparkPlanner with HiveStrategies with ExtendedPlanner { def baseStrategies(hiveContext: HiveContext): Seq[Strategy] = Seq( DataSourceStrategy, HiveCommandStrategy(self), HiveDDLStrategy, DDLStrategy, TakeOrderedAndProject, InMemoryScans, HiveTableScans, DataSinks, Scripts, Aggregation, LeftSemiJoin, EquiJoinSelection, BasicOperators, BroadcastNestedLoop, CartesianProduct, DefaultJoin ) override def strategies: Seq[Strategy] = self.strategies(this) ++ experimental.extraStrategies ++ baseStrategies(self) override val hiveContext = self } }
Example 52
Source File: StringMap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ private val className = classOf[StringMap].getName override def load(path: String): StringMap = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head() val labels = data.getAs[Map[String, Double]](0) val handleInvalid = HandleInvalid.fromString(data.getAs[String](1)) val defaultValue = data.getAs[Double](2) val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue) val transformer = new StringMap(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 53
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation} import org.apache.hadoop.fs.Path import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType} import org.apache.spark.sql.functions.udf private val className = classOf[MathUnary].getName override def load(path: String): MathUnary = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath).select("operation").head() val operation = data.getAs[String](0) val model = MathUnaryModel(UnaryOperation.forName(operation)) val transformer = new MathUnary(metadata.uid, model) metadata.getAndSetParams(transformer) transformer } } }
Example 54
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 55
Source File: AttributeType.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
object AttributeType {

  // Numeric, Nominal, Binary and Unresolved are defined earlier in this
  // companion object and elided from this listing.

  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
}
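A minimal usage sketch (not from the original project) that round-trips an attribute type through its string name:

import org.apache.spark.ml.attribute.AttributeType

// Resolve a type from its name; unknown names throw IllegalArgumentException.
val numeric = AttributeType.fromName("numeric")
assert(numeric == AttributeType.Numeric)
// AttributeType.fromName("text")  // would throw: Cannot recognize type text.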
Example 56
Source File: Transformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import scala.annotation.varargs import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ protected def validateInputType(inputType: DataType): Unit = {} override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false) StructType(outputFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(this.createTransformFunc, outputDataType) dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } override def copy(extra: ParamMap): T = defaultCopy(extra) }
Example 57
Source File: LogLoss.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.util.MLUtils

@Since("1.2.0")
@DeveloperApi
object LogLoss extends Loss {

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[spark] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
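A quick sketch (assuming the public gradient method above) of what the gradient evaluates to at a few margins:

import org.apache.spark.mllib.tree.loss.LogLoss

// At prediction 0 with label +1: -4 / (1 + e^0) = -2.0
val atZero = LogLoss.gradient(prediction = 0.0, label = 1.0)
// For a confident, correct prediction the gradient shrinks toward 0.
val confident = LogLoss.gradient(prediction = 10.0, label = 1.0)
println(s"$atZero $confident")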
Example 58
Source File: Predict.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}

@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
}
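A small usage sketch (not from the original project) illustrating the equality semantics defined above:

import org.apache.spark.mllib.tree.model.Predict

// Two predictions are equal only when both the value and its probability match.
val a = new Predict(1.0, prob = 0.8)
val b = new Predict(1.0, prob = 0.8)
val c = new Predict(1.0, prob = 0.3)

assert(a == b && a.hashCode == b.hashCode)
assert(a != c)
println(a)  // "1.0 (prob = 0.8)"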
Example 59
Source File: DataValidators.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

@DeveloperApi
@Since("0.8.0")
object DataValidators extends Logging {

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
Example 60
Source File: KMeansDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.rdd.RDD @Since("0.8.0") def generateKMeansRDD( sc: SparkContext, numPoints: Int, k: Int, d: Int, r: Double, numPartitions: Int = 2) : RDD[Array[Double]] = { // First, generate some centers val rand = new Random(42) val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r)) // Then generate points around each center sc.parallelize(0 until numPoints, numPartitions).map { idx => val center = centers(idx % k) val rand2 = new Random(42 + idx) Array.tabulate(d)(i => center(i) + rand2.nextGaussian()) } } @Since("0.8.0") def main(args: Array[String]) { if (args.length < 6) { // scalastyle:off println println("Usage: KMeansGenerator " + "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]") // scalastyle:on println System.exit(1) } val sparkMaster = args(0) val outputPath = args(1) val numPoints = args(2).toInt val k = args(3).toInt val d = args(4).toInt val r = args(5).toDouble val parts = if (args.length >= 7) args(6).toInt else 2 val sc = new SparkContext(sparkMaster, "KMeansDataGenerator") val data = generateKMeansRDD(sc, numPoints, k, d, r, parts) data.map(_.mkString(" ")).saveAsTextFile(outputPath) System.exit(0) } }
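Besides the command-line entry point above, generateKMeansRDD can be called directly. A minimal sketch (parameter values are arbitrary):

import org.apache.spark.SparkContext
import org.apache.spark.mllib.util.KMeansDataGenerator

// 1,000 points in 10 dimensions drawn around 5 Gaussian centers scaled by r = 2.0.
val sc = new SparkContext("local[2]", "kmeans-data-demo")
val points = KMeansDataGenerator.generateKMeansRDD(
  sc, numPoints = 1000, k = 5, d = 10, r = 2.0, numPartitions = 4)
println(points.first().mkString(" "))
sc.stop()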
Example 61
Source File: LogisticRegressionDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 62
Source File: SVMDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 63
Source File: SparkCommandLine.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
  extends CompilerCommand(args, settings) {

  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
}
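A minimal sketch of parsing compiler flags through this class; it assumes the standard CompilerCommand members (ok, settings) and standard scalac flags:

import org.apache.spark.repl.SparkCommandLine

// Parse REPL-style compiler arguments into Settings using the default error handler.
val command = new SparkCommandLine(List("-usejavacp", "-deprecation"))
if (command.ok) {
  println(command.settings.usejavacp.value)   // true
  println(command.settings.deprecation.value) // true
}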
Example 64
Source File: SparkPlanInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo import org.apache.spark.util.Utils @DeveloperApi class SparkPlanInfo( val nodeName: String, val simpleString: String, val children: Seq[SparkPlanInfo], val metadata: Map[String, String], val metrics: Seq[SQLMetricInfo]) { override def hashCode(): Int = { // hashCode of simpleString should be good enough to distinguish the plans from each other // within a plan simpleString.hashCode } override def equals(other: Any): Boolean = other match { case o: SparkPlanInfo => nodeName == o.nodeName && simpleString == o.simpleString && children == o.children case _ => false } } private[execution] object SparkPlanInfo { def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = { val children = plan match { case ReusedExchangeExec(_, child) => child :: Nil case _ => plan.children ++ plan.subqueries } val metrics = plan.metrics.toSeq.map { case (key, metric) => new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType) } new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan), plan.metadata, metrics) } }
Example 65
Source File: ReceiverInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
  ) {
}
Example 66
Source File: SerializableWritable.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {

  def value: T = t

  override def toString: String = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration(false))
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
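A short usage sketch (not from the original project): Hadoop Writables are not java.io.Serializable, so they cannot be captured by task closures directly, but the wrapper can.

import org.apache.hadoop.io.Text
import org.apache.spark.SerializableWritable

// Wrap a Writable so a closure that references it serializes cleanly.
val wrapped = new SerializableWritable(new Text("hello"))
// rdd.map { x => wrapped.value.toString + x }  // safe to ship to executors
println(wrapped.value)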
Example 67
Source File: JavaNewHadoopRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapreduce.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.NewHadoopRDD

@DeveloperApi
class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 68
Source File: JavaHadoopRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.hadoop.mapred.InputSplit

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.api.java.JavaSparkContext._
import org.apache.spark.api.java.function.{Function2 => JFunction2}
import org.apache.spark.rdd.HadoopRDD

@DeveloperApi
class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V])
    (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V])
  extends JavaPairRDD[K, V](rdd) {

  @DeveloperApi
  def mapPartitionsWithInputSplit[R](
      f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]],
      preservesPartitioning: Boolean = false): JavaRDD[R] = {
    new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala,
      preservesPartitioning)(fakeClassTag))(fakeClassTag)
  }
}
Example 69
Source File: RecoveryModeFactory.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 70
Source File: EnvironmentTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.env import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.ui._ private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") { val listener = parent.environmentListener attachPage(new EnvironmentPage(this)) } @DeveloperApi class EnvironmentListener extends SparkListener { var jvmInformation = Seq[(String, String)]() var sparkProperties = Seq[(String, String)]() var systemProperties = Seq[(String, String)]() var classpathEntries = Seq[(String, String)]() override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) { synchronized { val environmentDetails = environmentUpdate.environmentDetails jvmInformation = environmentDetails("JVM Information") sparkProperties = environmentDetails("Spark Properties") systemProperties = environmentDetails("System Properties") classpathEntries = environmentDetails("Classpath Entries") } } }
Example 71
Source File: StorageTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.storage import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage._ import org.apache.spark.ui._ private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = { val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) } StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList) } override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { val rddInfos = stageSubmitted.stageInfo.rddInfos rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name } } override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized { // Remove all partitions that are no longer cached in current completed stage val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet _rddInfoMap.retain { case (id, info) => !completedRddIds.contains(id) || info.numCachedPartitions > 0 } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { _rddInfoMap.remove(unpersistRDD.rddId) } override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { super.onBlockUpdated(blockUpdated) val blockId = blockUpdated.blockUpdatedInfo.blockId val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel val memSize = blockUpdated.blockUpdatedInfo.memSize val diskSize = blockUpdated.blockUpdatedInfo.diskSize val blockStatus = BlockStatus(storageLevel, memSize, diskSize) updateRDDInfo(Seq((blockId, blockStatus))) } }
Example 72
Source File: StageInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskMetrics, taskLocalityPreferences) } }
Example 73
Source File: AccumulableInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.annotation.DeveloperApi object AccumulableInfo { @deprecated("do not create AccumulableInfo", "2.0.0") def apply( id: Long, name: String, update: Option[String], value: String, internal: Boolean): AccumulableInfo = { new AccumulableInfo( id, Option(name), update, Option(value), internal, countFailedValues = false) } @deprecated("do not create AccumulableInfo", "2.0.0") def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = { new AccumulableInfo( id, Option(name), update, Option(value), internal = false, countFailedValues = false) } @deprecated("do not create AccumulableInfo", "2.0.0") def apply(id: Long, name: String, value: String): AccumulableInfo = { new AccumulableInfo( id, Option(name), None, Option(value), internal = false, countFailedValues = false) } }
Example 74
Source File: SplitInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi // information about a specific split instance : handles both split instances. // So that we do not need to worry about the differences. @DeveloperApi class SplitInfo( val inputFormatClazz: Class[_], val hostLocation: String, val path: String, val length: Long, val underlyingSplit: Any) { override def toString(): String = { "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + ", length : " + length + ", underlyingSplit " + underlyingSplit } override def hashCode(): Int = { var hashCode = inputFormatClazz.hashCode hashCode = hashCode * 31 + hostLocation.hashCode hashCode = hashCode * 31 + path.hashCode // ignore overflow ? It is hashcode anyway ! hashCode = hashCode * 31 + (length & 0x7fffffff).toInt hashCode } // This is practically useless since most of the Split impl's don't seem to implement equals :-( // So unless there is identity equality between underlyingSplits, it will always fail even if it // is pointing to same block. override def equals(other: Any): Boolean = other match { case that: SplitInfo => this.hostLocation == that.hostLocation && this.inputFormatClazz == that.inputFormatClazz && this.path == that.path && this.length == that.length && // other split specific checks (like start for FileSplit) this.underlyingSplit == that.underlyingSplit case _ => false } } object SplitInfo { def toSplitInfo(inputFormatClazz: Class[_], path: String, mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapredSplit.getLength for (host <- mapredSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit) } retval } def toSplitInfo(inputFormatClazz: Class[_], path: String, mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapreduceSplit.getLength for (host <- mapreduceSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit) } retval } }
Example 75
Source File: TaskInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.ListBuffer import org.apache.spark.TaskState import org.apache.spark.TaskState.TaskState import org.apache.spark.annotation.DeveloperApi var finishTime: Long = 0 var failed = false var killed = false private[spark] def markGettingResult(time: Long = System.currentTimeMillis) { gettingResultTime = time } private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) { finishTime = time if (state == TaskState.FAILED) { failed = true } else if (state == TaskState.KILLED) { killed = true } } def gettingResult: Boolean = gettingResultTime != 0 def finished: Boolean = finishTime != 0 def successful: Boolean = finished && !failed && !killed def running: Boolean = !finished def status: String = { if (running) { if (gettingResult) { "GET RESULT" } else { "RUNNING" } } else if (failed) { "FAILED" } else if (killed) { "KILLED" } else if (successful) { "SUCCESS" } else { "UNKNOWN" } } def id: String = s"$index.$attemptNumber" def duration: Long = { if (!finished) { throw new UnsupportedOperationException("duration() called on unfinished task") } else { finishTime - launchTime } } private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime }
Example 76
Source File: ExecutorInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class ExecutorInfo(
    val executorHost: String,
    val totalCores: Int,
    val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
}
Example 77
Source File: taskListeners.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.util.EventListener

import org.apache.spark.TaskContext
import org.apache.spark.annotation.DeveloperApi

private[spark] class TaskCompletionListenerException(
    errorMessages: Seq[String],
    val previousError: Option[Throwable] = None)
  extends RuntimeException {

  override def getMessage: String = {
    if (errorMessages.size == 1) {
      errorMessages.head
    } else {
      errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n")
    } +
    previousError.map { e =>
      "\n\nPrevious exception in task: " + e.getMessage + "\n" +
        e.getStackTrace.mkString("\t", "\n\t", "")
    }.getOrElse("")
  }
}
Example 78
Source File: StorageStatusListener.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.collection.mutable import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ private def updateStorageStatus(unpersistedRDDId: Int) { storageStatusList.foreach { storageStatus => storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) => storageStatus.removeBlock(blockId) } } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { updateStorageStatus(unpersistRDD.rddId) } override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) { synchronized { val blockManagerId = blockManagerAdded.blockManagerId val executorId = blockManagerId.executorId val maxMem = blockManagerAdded.maxMem val storageStatus = new StorageStatus(blockManagerId, maxMem) executorIdToStorageStatus(executorId) = storageStatus // Try to remove the dead storage status if same executor register the block manager twice. deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId) .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2)) } } override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) { synchronized { val executorId = blockManagerRemoved.blockManagerId.executorId executorIdToStorageStatus.remove(executorId).foreach { status => deadExecutorStorageStatus += status } if (deadExecutorStorageStatus.size > retainedDeadExecutors) { deadExecutorStorageStatus.trimStart(1) } } } override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId val blockId = blockUpdated.blockUpdatedInfo.blockId val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel val memSize = blockUpdated.blockUpdatedInfo.memSize val diskSize = blockUpdated.blockUpdatedInfo.diskSize val blockStatus = BlockStatus(storageLevel, memSize, diskSize) updateStorageStatus(executorId, Seq((blockId, blockStatus))) } }
Example 79
Source File: RDDInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDD, RDDOperationScope}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    var name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val callSite: String = "",
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length,
      rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope)
  }
}
Example 80
Source File: BlockUpdatedInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo

@DeveloperApi
case class BlockUpdatedInfo(
    blockManagerId: BlockManagerId,
    blockId: BlockId,
    storageLevel: StorageLevel,
    memSize: Long,
    diskSize: Long)

private[spark] object BlockUpdatedInfo {

  private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = {
    BlockUpdatedInfo(
      updateBlockInfo.blockManagerId,
      updateBlockInfo.blockId,
      updateBlockInfo.storageLevel,
      updateBlockInfo.memSize,
      updateBlockInfo.diskSize)
  }
}
Example 81
Source File: TopologyMapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

@DeveloperApi
class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging {
  val topologyFile = conf.getOption("spark.storage.replication.topologyFile")
  require(topologyFile.isDefined, "Please specify topology file via " +
    "spark.storage.replication.topologyFile for FileBasedTopologyMapper.")
  val topologyMap = Utils.getPropertiesFromFile(topologyFile.get)

  override def getTopologyForHost(hostname: String): Option[String] = {
    val topology = topologyMap.get(hostname)
    if (topology.isDefined) {
      logDebug(s"$hostname -> ${topology.get}")
    } else {
      logWarning(s"$hostname does not have any topology information")
    }
    topology
  }
}
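A minimal usage sketch (the file path below is hypothetical and must exist, since the mapper loads it eagerly at construction):

import org.apache.spark.SparkConf
import org.apache.spark.storage.FileBasedTopologyMapper

// The mapper reads "hostname topologyInfo" pairs from a properties file and
// resolves topology (e.g. rack) information for block replication.
val conf = new SparkConf()
  .set("spark.storage.replication.topologyFile", "/etc/spark/topology.properties")
val mapper = new FileBasedTopologyMapper(conf)
println(mapper.getTopologyForHost("host-1.example.com"))  // Some(rack) or None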
Example 82
Source File: ShuffleWriteMetrics.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.LongAccumulator def writeTime: Long = _writeTime.sum private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v) private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v) private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v) private[spark] def decBytesWritten(v: Long): Unit = { _bytesWritten.setValue(bytesWritten - v) } private[spark] def decRecordsWritten(v: Long): Unit = { _recordsWritten.setValue(recordsWritten - v) } // Legacy methods for backward compatibility. // TODO: remove these once we make this class private. @deprecated("use bytesWritten instead", "2.0.0") def shuffleBytesWritten: Long = bytesWritten @deprecated("use writeTime instead", "2.0.0") def shuffleWriteTime: Long = writeTime @deprecated("use recordsWritten instead", "2.0.0") def shuffleRecordsWritten: Long = recordsWritten }
Example 83
Source File: InterruptibleIterator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
}
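A sketch of the typical use: a custom RDD wraps its row iterator so task kills are honoured on every hasNext call. The toy RDD below is hypothetical, not from the original source:

import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.rdd.RDD

// Single-partition RDD whose compute() output respects task interruption.
class TenNumbersRDD(sc: SparkContext) extends RDD[Int](sc, Nil) {
  override protected def getPartitions: Array[Partition] =
    Array(new Partition { override val index: Int = 0 })

  override def compute(split: Partition, context: TaskContext): Iterator[Int] =
    new InterruptibleIterator(context, (1 to 10).iterator)
}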
Example 84
Source File: ShuffledRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = index override def equals(other: Any): Boolean = super.equals(other) } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { val serializer = userSpecifiedSerializer.getOrElse { val serializerManager = SparkEnv.get.serializerManager if (mapSideCombine) { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]]) } else { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]]) } } List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override protected def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] tracker.getPreferredLocationsForShuffle(dep, partition.index) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 85
Source File: OrderedRDDFunctions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
Example 86
Source File: UnionRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
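A brief usage sketch (not from the original project): a union keeps every partition of every parent and involves no shuffle.

import org.apache.spark.SparkContext
import org.apache.spark.rdd.UnionRDD

val sc = new SparkContext("local[2]", "union-demo")
val a = sc.parallelize(1 to 5, numSlices = 2)
val b = sc.parallelize(6 to 10, numSlices = 3)

val unioned = new UnionRDD(sc, Seq(a, b))
println(unioned.partitions.length)  // 5 (2 + 3 parent partitions)
println(unioned.collect().toSeq)    // elements 1 through 10, in parent order
sc.stop()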
Example 87
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 88
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberXGBoostModel import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint} import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)]) extends ForecastBaseModel[XGBoostBigModel[I]] with HasLabelCol with HasIdCol { def setLabelcol(label: String): this.type = set(labelCol, label) def setIdcol(id: String): this.type = set(idCol, id) override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) ) } .join(prediction) .map { case (id, (features, predictValue)) => Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } protected def predict(dataSet: Dataset[_]) = { val features = dataSet.rdd.map { case (row: Row) => val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) val id = row.getAs[I]($(idCol)) SparkLabeledPoint(DataTransformer.toFloat(id), features) }.cache val (_, model) = models.head UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema) protected def getPredictionSchema: Array[StructField] = { Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) } }
Example 89
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.core.data.DataTransformer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} class VectorizeEncoder(override val uid: String) extends Transformer with HasIdCol with HasTimeCol with HasInputCols with HasLabelCol with HasGroupByCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("vectorizer")) def setIdCol(input: String) = set(idCol, input) def setLabelCol(input: String) = set(labelCol, input) def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy)) def setInputCol(input: Array[String]) = set(inputCols, input) def setTimeCol(time: String) = set(timeCol, Some(time)) def setOutputCol(output: String) = set(outputCol, output) override def transform(dataSet: Dataset[_]): DataFrame = { val context = dataSet.sqlContext.sparkContext val input = context.broadcast($(inputCols)) val allColumnNames = dataSet.schema.map(_.name) val nonInputColumnIndexes = context.broadcast( allColumnNames.zipWithIndex.filter( f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol) || f._1 == $(timeCol).getOrElse(""))) val result = dataSet.rdd.map { case (row: Row) => val rowSeq = row.toSeq val nonInputColumns = nonInputColumnIndexes.value.map { case (_, index) => rowSeq(index) } val size = input.value.length val (values, indices) = input.value .filter(col => row.getAs(col) != null) .map { column => DataTransformer.toDouble(row.getAs(column)) } .zipWithIndex .filter(f => f._1 != 0d) .unzip Row( nonInputColumns :+ org.apache.spark.ml.linalg.Vectors .sparse(size, indices.toArray, values.toArray): _* ) } val newSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(result, newSchema) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType( schema.filter( col => !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("") ) ).add(StructField($(outputCol), new VectorUDT)) }
Example 90
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import java.sql.Timestamp import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasTimeCol import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModelTimeSeries[I](override val uid: String, override val models: Seq[(ParamMap, XGBoostModel)]) extends XGBoostBigModel[I](uid, models) with HasTimeCol{ def setTimecol(time: String): this.type = set(timeCol, Some(time)) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME), row.getAs[java.sql.Timestamp]($(timeCol).get))) } .join(prediction) .map { case (id, ((features, time), predictValue)) => Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField($(timeCol).get, TimestampType), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) ) }
Example 91
Source File: SerializableWritable.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.ObjectWritable
import org.apache.hadoop.io.Writable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

@DeveloperApi
class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable {
  def value = t

  override def toString = t.toString

  private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException {
    out.defaultWriteObject()
    new ObjectWritable(t).write(out)
  }

  private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    val ow = new ObjectWritable()
    ow.setConf(new Configuration())
    ow.readFields(in)
    t = ow.get().asInstanceOf[T]
  }
}
Example 92
Source File: JavaNewHadoopRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.hadoop.mapreduce.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.NewHadoopRDD @DeveloperApi class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 93
Source File: JavaHadoopRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.hadoop.mapred.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.HadoopRDD @DeveloperApi class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 94
Source File: DriverInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.util.Date

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.deploy.DriverDescription
import org.apache.spark.util.Utils

private[spark] class DriverInfo(
    val startTime: Long,
    val id: String,
    val desc: DriverDescription,
    val submitDate: Date)
  extends Serializable {

  @transient var state: DriverState.Value = DriverState.SUBMITTED
  // Set when the driver terminates with an exception; referenced by init() below.
  @transient var exception: Option[Exception] = None
  @transient var worker: Option[WorkerInfo] = None

  init()

  private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException {
    in.defaultReadObject()
    init()
  }

  private def init(): Unit = {
    state = DriverState.SUBMITTED
    worker = None
    exception = None
  }
}
Example 95
Source File: WorkerInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import scala.collection.mutable import akka.actor.ActorRef import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class WorkerInfo( val id: String, val host: String, val port: Int, val cores: Int, val memory: Int, val actor: ActorRef, val webUiPort: Int, val publicAddress: String) extends Serializable { Utils.checkHost(host, "Expected hostname") assert (port > 0) @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info @transient var drivers: mutable.HashMap[String, DriverInfo] = _ // driverId => info @transient var state: WorkerState.Value = _ @transient var coresUsed: Int = _ @transient var memoryUsed: Int = _ @transient var lastHeartbeat: Long = _ init() def coresFree: Int = cores - coresUsed def memoryFree: Int = memory - memoryUsed private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() init() } private def init() { executors = new mutable.HashMap drivers = new mutable.HashMap state = WorkerState.ALIVE coresUsed = 0 memoryUsed = 0 lastHeartbeat = System.currentTimeMillis() } def hostPort: String = { assert (port > 0) host + ":" + port } def addExecutor(exec: ExecutorDesc) { executors(exec.fullId) = exec coresUsed += exec.cores memoryUsed += exec.memory } def removeExecutor(exec: ExecutorDesc) { if (executors.contains(exec.fullId)) { executors -= exec.fullId coresUsed -= exec.cores memoryUsed -= exec.memory } } def hasExecutor(app: ApplicationInfo): Boolean = { executors.values.exists(_.application == app) } def addDriver(driver: DriverInfo) { drivers(driver.id) = driver memoryUsed += driver.desc.mem coresUsed += driver.desc.cores } def removeDriver(driver: DriverInfo) { drivers -= driver.id memoryUsed -= driver.desc.mem coresUsed -= driver.desc.cores } def webUiAddress : String = { "http://" + this.publicAddress + ":" + this.webUiPort } def setState(state: WorkerState.Value) = { this.state = state } }
Example 96
Source File: ExecutorsTab.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.exec import scala.collection.mutable.HashMap import org.apache.spark.ExceptionFailure import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage.StorageStatusListener import org.apache.spark.ui.{SparkUI, SparkUITab} private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") { val listener = parent.executorsListener val sc = parent.sc val threadDumpEnabled = sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true) attachPage(new ExecutorsPage(this, threadDumpEnabled)) if (threadDumpEnabled) { attachPage(new ExecutorThreadDumpPage(this)) } } @DeveloperApi class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener { val executorToTasksActive = HashMap[String, Int]() val executorToTasksComplete = HashMap[String, Int]() val executorToTasksFailed = HashMap[String, Int]() val executorToDuration = HashMap[String, Long]() val executorToInputBytes = HashMap[String, Long]() val executorToInputRecords = HashMap[String, Long]() val executorToOutputBytes = HashMap[String, Long]() val executorToOutputRecords = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() val executorToLogUrls = HashMap[String, Map[String, String]]() def storageStatusList = storageStatusListener.storageStatusList override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded) = synchronized { val eid = executorAdded.executorId executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap } override def onTaskStart(taskStart: SparkListenerTaskStart) = synchronized { val eid = taskStart.taskInfo.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1 } override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized { val info = taskEnd.taskInfo if (info != null) { val eid = info.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1 executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration taskEnd.reason match { case e: ExceptionFailure => executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1 case _ => executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1 } // Update shuffle read/write val metrics = taskEnd.taskMetrics if (metrics != null) { metrics.inputMetrics.foreach { inputMetrics => executorToInputBytes(eid) = executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead executorToInputRecords(eid) = executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead } metrics.outputMetrics.foreach { outputMetrics => executorToOutputBytes(eid) = executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten executorToOutputRecords(eid) = executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten } metrics.shuffleReadMetrics.foreach { shuffleRead => executorToShuffleRead(eid) = executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead } metrics.shuffleWriteMetrics.foreach { shuffleWrite => executorToShuffleWrite(eid) = executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten } } } } }
Example 97
Source File: EnvironmentTab.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.env

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._
import org.apache.spark.ui._

private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") {
  val listener = parent.environmentListener
  attachPage(new EnvironmentPage(this))
}

@DeveloperApi
class EnvironmentListener extends SparkListener {
  var jvmInformation = Seq[(String, String)]()
  var sparkProperties = Seq[(String, String)]()
  var systemProperties = Seq[(String, String)]()
  var classpathEntries = Seq[(String, String)]()

  override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) {
    synchronized {
      val environmentDetails = environmentUpdate.environmentDetails
      jvmInformation = environmentDetails("JVM Information")
      sparkProperties = environmentDetails("Spark Properties")
      systemProperties = environmentDetails("System Properties")
      classpathEntries = environmentDetails("Classpath Entries")
    }
  }
}
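A minimal sketch of wiring a listener of this kind into an application, assuming an active SparkContext named sc; the listener class name is illustrative:

// Hedged sketch: log Spark properties whenever an environment update event arrives.
import org.apache.spark.scheduler.{SparkListener, SparkListenerEnvironmentUpdate}

class EnvLoggingListener extends SparkListener {
  override def onEnvironmentUpdate(update: SparkListenerEnvironmentUpdate): Unit = {
    update.environmentDetails("Spark Properties").foreach { case (k, v) => println(s"$k = $v") }
  }
}

sc.addSparkListener(new EnvLoggingListener)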
Example 98
Source File: StorageTab.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.storage import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ui._ import org.apache.spark.scheduler._ import org.apache.spark.storage._ override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized { val metrics = taskEnd.taskMetrics if (metrics != null && metrics.updatedBlocks.isDefined) { updateRDDInfo(metrics.updatedBlocks.get) } } override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted) = synchronized { val rddInfos = stageSubmitted.stageInfo.rddInfos rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) } } override def onStageCompleted(stageCompleted: SparkListenerStageCompleted) = synchronized { // Remove all partitions that are no longer cached in current completed stage val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet _rddInfoMap.retain { case (id, info) => !completedRddIds.contains(id) || info.numCachedPartitions > 0 } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD) = synchronized { _rddInfoMap.remove(unpersistRDD.rddId) } }
Example 99
Source File: JavaSerializer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import java.io._ import java.nio.ByteBuffer import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.ByteBufferInputStream import org.apache.spark.util.Utils private[spark] class JavaSerializationStream( out: OutputStream, counterReset: Int, extraDebugInfo: Boolean) extends SerializationStream { private val objOut = new ObjectOutputStream(out) private var counter = 0 @DeveloperApi class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable { private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100) private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true) override def newInstance(): SerializerInstance = { val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader) new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader) } override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { out.writeInt(counterReset) out.writeBoolean(extraDebugInfo) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { counterReset = in.readInt() extraDebugInfo = in.readBoolean() } }
Example 100
Source File: Serializer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import java.io._ import java.nio.ByteBuffer import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.{Utils, ByteBufferInputStream, NextIterator} def asIterator: Iterator[Any] = new NextIterator[Any] { override protected def getNext() = { try { readObject[Any]() } catch { case eof: EOFException => finished = true } } override protected def close() { DeserializationStream.this.close() } } }
Example 101
Source File: StageInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, stage.attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.details) } }
Example 102
Source File: AccumulableInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class AccumulableInfo (
    val id: Long,
    val name: String,
    val update: Option[String], // represents a partial update within a task
    val value: String) {

  override def equals(other: Any): Boolean = other match {
    case acc: AccumulableInfo =>
      this.id == acc.id && this.name == acc.name &&
        this.update == acc.update && this.value == acc.value
    case _ => false
  }
}

object AccumulableInfo {
  def apply(id: Long, name: String, update: Option[String], value: String) =
    new AccumulableInfo(id, name, update, value)

  def apply(id: Long, name: String, value: String) = new AccumulableInfo(id, name, None, value)
}
Example 103
Source File: SplitInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi // information about a specific split instance : handles both split instances. // So that we do not need to worry about the differences. @DeveloperApi class SplitInfo( val inputFormatClazz: Class[_], val hostLocation: String, val path: String, val length: Long, val underlyingSplit: Any) { override def toString(): String = { "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + ", length : " + length + ", underlyingSplit " + underlyingSplit } override def hashCode(): Int = { var hashCode = inputFormatClazz.hashCode hashCode = hashCode * 31 + hostLocation.hashCode hashCode = hashCode * 31 + path.hashCode // ignore overflow ? It is hashcode anyway ! hashCode = hashCode * 31 + (length & 0x7fffffff).toInt hashCode } // This is practically useless since most of the Split impl's dont seem to implement equals :-( // So unless there is identity equality between underlyingSplits, it will always fail even if it // is pointing to same block. override def equals(other: Any): Boolean = other match { case that: SplitInfo => { this.hostLocation == that.hostLocation && this.inputFormatClazz == that.inputFormatClazz && this.path == that.path && this.length == that.length && // other split specific checks (like start for FileSplit) this.underlyingSplit == that.underlyingSplit } case _ => false } } object SplitInfo { def toSplitInfo(inputFormatClazz: Class[_], path: String, mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapredSplit.getLength for (host <- mapredSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit) } retval } def toSplitInfo(inputFormatClazz: Class[_], path: String, mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapreduceSplit.getLength for (host <- mapreduceSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit) } retval } }
Example 104
Source File: TaskInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.ListBuffer import org.apache.spark.annotation.DeveloperApi var finishTime: Long = 0 var failed = false private[spark] def markGettingResult(time: Long = System.currentTimeMillis) { gettingResultTime = time } private[spark] def markSuccessful(time: Long = System.currentTimeMillis) { finishTime = time } private[spark] def markFailed(time: Long = System.currentTimeMillis) { finishTime = time failed = true } def gettingResult: Boolean = gettingResultTime != 0 def finished: Boolean = finishTime != 0 def successful: Boolean = finished && !failed def running: Boolean = !finished def status: String = { if (running) { if (gettingResult) { "GET RESULT" } else { "RUNNING" } } else if (failed) { "FAILED" } else if (successful) { "SUCCESS" } else { "UNKNOWN" } } def id: String = s"$index.$attempt" def duration: Long = { if (!finished) { throw new UnsupportedOperationException("duration() called on unfinished task") } else { finishTime - launchTime } } private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime }
Example 105
Source File: ExecutorInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class ExecutorInfo(
    val executorHost: String,
    val totalCores: Int,
    val logUrlMap: Map[String, String]) {

  def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo]

  override def equals(other: Any): Boolean = other match {
    case that: ExecutorInfo =>
      (that canEqual this) &&
        executorHost == that.executorHost &&
        totalCores == that.totalCores &&
        logUrlMap == that.logUrlMap
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq(executorHost, totalCores, logUrlMap)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }
}
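A minimal sketch of consuming ExecutorInfo from a listener callback, assuming an active SparkContext named sc; the listener name is illustrative:

// Hedged sketch: print host and core count whenever an executor registers.
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded}

class ExecutorLogger extends SparkListener {
  override def onExecutorAdded(event: SparkListenerExecutorAdded): Unit = {
    val info = event.executorInfo
    println(s"Executor ${event.executorId} added on ${info.executorHost} " +
      s"with ${info.totalCores} cores")
  }
}

sc.addSparkListener(new ExecutorLogger)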
Example 106
Source File: Aggregator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap} @DeveloperApi case class Aggregator[K, V, C] ( createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) { // When spilling is enabled sorting will happen externally, but not necessarily with an // ExternalSorter. private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true) @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0") def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] = combineValuesByKey(iter, null) def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]], context: TaskContext): Iterator[(K, C)] = { if (!isSpillEnabled) { val combiners = new AppendOnlyMap[K,C] var kv: Product2[K, V] = null val update = (hadValue: Boolean, oldValue: C) => { if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2) } while (iter.hasNext) { kv = iter.next() combiners.changeValue(kv._1, update) } combiners.iterator } else { val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) combiners.insertAll(iter) // Update task metrics if context is not null // TODO: Make context non optional in a future release Option(context).foreach { c => c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) } combiners.iterator } } @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0") def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] = combineCombinersByKey(iter, null) def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext) : Iterator[(K, C)] = { if (!isSpillEnabled) { val combiners = new AppendOnlyMap[K,C] var kc: Product2[K, C] = null val update = (hadValue: Boolean, oldValue: C) => { if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2 } while (iter.hasNext) { kc = iter.next() combiners.changeValue(kc._1, update) } combiners.iterator } else { val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) while (iter.hasNext) { val pair = iter.next() combiners.insert(pair._1, pair._2) } // Update task metrics if context is not null // TODO: Make context non-optional in a future release Option(context).foreach { c => c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) } combiners.iterator } } }
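A minimal sketch of building an Aggregator for a per-key average, assuming an active SparkContext (the class reads SparkEnv at construction time); the combiner type (sum, count) and names are illustrative. Such an aggregator is typically handed to a ShuffledRDD rather than used directly:

// Hedged sketch: combiner functions for averaging Double values per key.
import org.apache.spark.Aggregator

val avgAggregator = new Aggregator[String, Double, (Double, Long)](
  createCombiner = v => (v, 1L),                               // first value seen for a key
  mergeValue = (c, v) => (c._1 + v, c._2 + 1L),                // fold another value into a combiner
  mergeCombiners = (c1, c2) => (c1._1 + c2._1, c1._2 + c2._2)  // merge combiners across partitions
)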
Example 107
Source File: Dependency.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle

@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int) = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
}
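A minimal sketch of the partition mapping this dependency encodes, assuming a parent RDD named parentRdd; the offsets are illustrative:

// Hedged sketch: a RangeDependency placing parentRdd's 4 partitions at child offset 10.
import org.apache.spark.RangeDependency

val dep = new RangeDependency(parentRdd, inStart = 0, outStart = 10, length = 4)
dep.getParents(11)  // List(1): child partition 11 maps back to parent partition 1
dep.getParents(3)   // Nil: this parent does not contribute to child partition 3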
Example 108
Source File: StorageStatusListener.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.collection.mutable import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ private def updateStorageStatus(unpersistedRDDId: Int) { storageStatusList.foreach { storageStatus => storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) => storageStatus.removeBlock(blockId) } } } override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized { val info = taskEnd.taskInfo val metrics = taskEnd.taskMetrics if (info != null && metrics != null) { val updatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]()) if (updatedBlocks.length > 0) { updateStorageStatus(info.executorId, updatedBlocks) } } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD) = synchronized { updateStorageStatus(unpersistRDD.rddId) } override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) { synchronized { val blockManagerId = blockManagerAdded.blockManagerId val executorId = blockManagerId.executorId val maxMem = blockManagerAdded.maxMem val storageStatus = new StorageStatus(blockManagerId, maxMem) executorIdToStorageStatus(executorId) = storageStatus } } override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) { synchronized { val executorId = blockManagerRemoved.blockManagerId.executorId executorIdToStorageStatus.remove(executorId) } } }
Example 109
Source File: RDDInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.util.Utils @DeveloperApi class RDDInfo( val id: Int, val name: String, val numPartitions: Int, var storageLevel: StorageLevel) extends Ordered[RDDInfo] { var numCachedPartitions = 0 var memSize = 0L var diskSize = 0L var tachyonSize = 0L def isCached: Boolean = (memSize + diskSize + tachyonSize > 0) && numCachedPartitions > 0 override def toString = { import Utils.bytesToString ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " + "MemorySize: %s; TachyonSize: %s; DiskSize: %s").format( name, id, storageLevel.toString, numCachedPartitions, numPartitions, bytesToString(memSize), bytesToString(tachyonSize), bytesToString(diskSize)) } override def compare(that: RDDInfo) = { this.id - that.id } } private[spark] object RDDInfo { def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(rdd.id.toString) new RDDInfo(rdd.id, rddName, rdd.partitions.size, rdd.getStorageLevel) } }
Example 110
Source File: BlockManagerId.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput} import java.util.concurrent.ConcurrentHashMap import org.apache.spark.SparkContext import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils def apply(execId: String, host: String, port: Int) = getCachedBlockManagerId(new BlockManagerId(execId, host, port)) def apply(in: ObjectInput) = { val obj = new BlockManagerId() obj.readExternal(in) getCachedBlockManagerId(obj) } val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]() def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = { blockManagerIdCache.putIfAbsent(id, id) blockManagerIdCache.get(id) } }
Example 111
Source File: InterruptibleIterator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
}
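A minimal sketch of wrapping a per-partition iterator inside a task so that a kill request surfaces promptly; the helper name is illustrative:

// Hedged sketch: a killed task throws TaskKilledException on the next hasNext call
// instead of silently running the partition to completion.
import org.apache.spark.{InterruptibleIterator, TaskContext}

def interruptible[T](data: Iterator[T]): Iterator[T] =
  new InterruptibleIterator[T](TaskContext.get(), data)

// e.g. inside mapPartitions: rdd.mapPartitions(iter => interruptible(iter).map(expensiveStep))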
Example 112
Source File: ShuffledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index = idx override def hashCode(): Int = idx } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
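A minimal sketch of constructing a ShuffledRDD directly rather than through a higher-level operator such as reduceByKey, assuming an active SparkContext named sc:

// Hedged sketch: repartition key/value pairs by hash into 4 shuffle partitions.
import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.ShuffledRDD

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))
val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(4))
  .setMapSideCombine(false)
shuffled.collect()  // same pairs, now laid out in the partitions chosen by the partitioner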
Example 113
Source File: UnionRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient rdd: RDD[T], val parentRddIndex: Int, @transient parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations() = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdds.map(_.partitions.size).sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.size) pos += rdd.partitions.size } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
Example 114
Source File: package.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{AttributeSet, Expression, PredicateHelper, ScalaUDF} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.carbondata.mv.plans.modular.ModularPlan import org.apache.carbondata.mv.plans.util.{CheckSPJG, LogicalPlanSignatureGenerator, Signature} def canEvaluate(exp: ScalaUDF, exprList: Seq[Expression]): Boolean = { var canBeDerived = false exprList.forall { case udf: ScalaUDF => if (udf.children.length == exp.children.length) { if (udf.children.zip(exp.children).forall(e => e._1.sql.equalsIgnoreCase(e._2.sql))) { canBeDerived = true } } canBeDerived case _ => canBeDerived } } def canEvaluate(expr: Expression, exprList: Seq[Expression]): Boolean = { expr match { case exp: ScalaUDF => canEvaluate(exp, exprList) case _ => expr.references.subsetOf(AttributeSet(exprList)) } } } def supports(supported: Boolean, message: Any) { if (!supported) { throw new UnsupportedOperationException(s"unsupported operation: $message") } } }
Example 115
Source File: ArrayParam.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package org.apache.spark.ml.param import org.apache.spark.annotation.DeveloperApi import org.json4s.{DefaultFormats, _} import org.json4s.jackson.JsonMethods.{compact, parse, render} import scala.collection.JavaConverters._ def w(value: java.util.List[_]): ParamPair[Array[_]] = w(value.asScala.toArray) override def jsonEncode(value: Array[_]): String = { import org.json4s.JsonDSL._ value match { case intArr: Array[Int] => compact(render(intArr.toSeq)) case dbArr: Array[Double] => compact(render(dbArr.toSeq)) case strArr: Array[String] => compact(render(strArr.toSeq)) case blArr: Array[Boolean] => compact(render(blArr.toSeq)) case intArr: Array[Integer] => compact(render(intArr.map(_.toLong).toSeq)) case _ => throw new IllegalArgumentException("Internal type not json serializable") } } override def jsonDecode(json: String): Array[_] = { implicit val formats: DefaultFormats.type = DefaultFormats parse(json).extract[Seq[_]].toArray } }
Example 116
Source File: Featurize.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.featurize import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.ml.{Estimator, Pipeline, PipelineModel} import org.apache.spark.sql._ import org.apache.spark.sql.types._ private[spark] object FeaturizeUtilities { // 2^18 features by default val NumFeaturesDefault = 262144 // 2^12 features for tree-based or NN-based learners val NumFeaturesTreeOrNNBased = 4096 } object Featurize extends DefaultParamsReadable[Featurize] override def fit(dataset: Dataset[_]): PipelineModel = { val pipeline = assembleFeaturesEstimators(getFeatureColumns) pipeline.fit(dataset) } private def assembleFeaturesEstimators(featureColumns: Map[String, Seq[String]]): Pipeline = { val assembleFeaturesEstimators = featureColumns.map(newColToFeatures => { new AssembleFeatures() .setColumnsToFeaturize(newColToFeatures._2.toArray) .setFeaturesCol(newColToFeatures._1) .setNumberOfFeatures(getNumberOfFeatures) .setOneHotEncodeCategoricals(getOneHotEncodeCategoricals) .setAllowImages(getAllowImages) }).toArray new Pipeline().setStages(assembleFeaturesEstimators) } override def copy(extra: ParamMap): Estimator[PipelineModel] = { new Featurize() } @DeveloperApi override def transformSchema(schema: StructType): StructType = assembleFeaturesEstimators(getFeatureColumns).transformSchema(schema) }
Example 117
Source File: ProtoParquetRDD.scala From sparksql-protobuf with Apache License 2.0 | 5 votes |
package com.github.saurfang.parquet.proto.spark import com.github.saurfang.parquet.proto.ProtoMessageParquetInputFormat import com.google.protobuf.AbstractMessage import org.apache.hadoop.conf.Configuration import org.apache.hadoop.mapred.{FileInputFormat, JobConf} import org.apache.parquet.proto.ProtoReadSupport import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.{NewHadoopRDD, RDD} import org.apache.spark.{Partition, SparkContext, TaskContext} import scala.reflect.ClassTag class ProtoParquetRDD[T <: AbstractMessage : ClassTag]( sc: SparkContext, input: String, protoClass: Class[T], @transient conf: Configuration ) extends RDD[T](sc, Nil) { def this(sc: SparkContext, input: String, protoClass: Class[T]) = { this(sc, input, protoClass, sc.hadoopConfiguration) } lazy private[this] val rdd = { val jconf = new JobConf(conf) FileInputFormat.setInputPaths(jconf, input) ProtoReadSupport.setProtobufClass(jconf, protoClass.getName) new NewHadoopRDD(sc, classOf[ProtoMessageParquetInputFormat[T]], classOf[Void], protoClass, jconf) } @DeveloperApi override def compute(split: Partition, context: TaskContext): Iterator[T] = rdd.compute(split, context).map(_._2) override protected def getPartitions: Array[Partition] = rdd.getPartitions }
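A minimal usage sketch, assuming an active SparkContext named sc; MyEvent stands in for a protobuf-generated message class and the input path is illustrative:

// Hedged sketch: read Parquet files back as protobuf messages.
import com.github.saurfang.parquet.proto.spark.ProtoParquetRDD

// MyEvent is a placeholder for your generated protobuf class (extends AbstractMessage).
val events = new ProtoParquetRDD(sc, "hdfs:///data/events.parquet", classOf[MyEvent])
events.take(5).foreach(println)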
Example 118
Source File: AttributeType.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.attribute import org.apache.spark.annotation.DeveloperApi def fromName(name: String): AttributeType = { if (name == Numeric.name) { Numeric } else if (name == Nominal.name) { Nominal } else if (name == Binary.name) { Binary } else if (name == Unresolved.name) { Unresolved } else { throw new IllegalArgumentException(s"Cannot recognize type $name.") } } }
Example 119
Source File: Transformer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import scala.annotation.varargs import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ protected def validateInputType(inputType: DataType): Unit = {} override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false) StructType(outputFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val transformUDF = udf(this.createTransformFunc, outputDataType) dataset.withColumn($(outputCol), transformUDF(dataset($(inputCol)))) } override def copy(extra: ParamMap): T = defaultCopy(extra) }
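A minimal sketch of a concrete transformer built on this UnaryTransformer pattern; the class and column names are illustrative:

// Hedged sketch: a UnaryTransformer that upper-cases a string column.
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

class UpperCaser(override val uid: String)
    extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == StringType, s"Input type must be StringType but got $inputType")
}

// Usage: new UpperCaser().setInputCol("text").setOutputCol("textUpper").transform(df)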
Example 120
Source File: LogLoss.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.util.MLUtils @Since("1.2.0") override def gradient(prediction: Double, label: Double): Double = { - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction)) } override private[spark] def computeError(prediction: Double, label: Double): Double = { val margin = 2.0 * label * prediction // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable. 2.0 * MLUtils.log1pExp(-margin) } }
Example 121
Source File: Predict.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}

@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
}
Example 122
Source File: DataValidators.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.internal.Logging import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("1.3.0") def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
Example 123
Source File: KMeansDataGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.rdd.RDD @Since("0.8.0") def generateKMeansRDD( sc: SparkContext, numPoints: Int, k: Int, d: Int, r: Double, numPartitions: Int = 2) : RDD[Array[Double]] = { // First, generate some centers val rand = new Random(42) val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r)) // Then generate points around each center sc.parallelize(0 until numPoints, numPartitions).map { idx => val center = centers(idx % k) val rand2 = new Random(42 + idx) Array.tabulate(d)(i => center(i) + rand2.nextGaussian()) } } @Since("0.8.0") def main(args: Array[String]) { if (args.length < 6) { // scalastyle:off println println("Usage: KMeansGenerator " + "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]") // scalastyle:on println System.exit(1) } val sparkMaster = args(0) val outputPath = args(1) val numPoints = args(2).toInt val k = args(3).toInt val d = args(4).toInt val r = args(5).toDouble val parts = if (args.length >= 7) args(6).toInt else 2 val sc = new SparkContext(sparkMaster, "KMeansDataGenerator") val data = generateKMeansRDD(sc, numPoints, k, d, r, parts) data.map(_.mkString(" ")).saveAsTextFile(outputPath) System.exit(0) } }
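A minimal sketch of calling the generator programmatically instead of through main, assuming an active SparkContext named sc:

// Hedged sketch: 10,000 synthetic 3-dimensional points drawn around 5 random centers.
import org.apache.spark.mllib.util.KMeansDataGenerator

val points = KMeansDataGenerator.generateKMeansRDD(
  sc, numPoints = 10000, k = 5, d = 3, r = 2.0, numPartitions = 4)
points.take(3).foreach(p => println(p.mkString(", ")))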
Example 124
Source File: LogisticRegressionDataGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 125
Source File: SVMDataGenerator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 126
Source File: SparkCommandLine.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.repl

import scala.tools.nsc.{Settings, CompilerCommand}
import scala.Predef._
import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class SparkCommandLine(args: List[String], override val settings: Settings)
    extends CompilerCommand(args, settings) {

  def this(args: List[String], error: String => Unit) {
    this(args, new SparkRunnerSettings(error))
  }

  def this(args: List[String]) {
    // scalastyle:off println
    this(args, str => Console.println("Error: " + str))
    // scalastyle:on println
  }
}
Example 127
Source File: SparkPlanInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.metric.SQLMetricInfo import org.apache.spark.util.Utils @DeveloperApi class SparkPlanInfo( val nodeName: String, val simpleString: String, val children: Seq[SparkPlanInfo], val metadata: Map[String, String], val metrics: Seq[SQLMetricInfo]) { override def hashCode(): Int = { // hashCode of simpleString should be good enough to distinguish the plans from each other // within a plan simpleString.hashCode } override def equals(other: Any): Boolean = other match { case o: SparkPlanInfo => nodeName == o.nodeName && simpleString == o.simpleString && children == o.children case _ => false } } private[execution] object SparkPlanInfo { def fromSparkPlan(plan: SparkPlan): SparkPlanInfo = { val children = plan match { case ReusedExchangeExec(_, child, _) => child :: Nil case _ => plan.children ++ plan.subqueries } val metrics = plan.metrics.toSeq.map { case (key, metric) => new SQLMetricInfo(metric.name.getOrElse(key), metric.id, metric.metricType) } new SparkPlanInfo(plan.nodeName, plan.simpleString, children.map(fromSparkPlan), plan.metadata, metrics) } }
Example 128
Source File: ReceiverInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
case class ReceiverInfo(
    streamId: Int,
    name: String,
    active: Boolean,
    location: String,
    executorId: String,
    lastErrorMessage: String = "",
    lastError: String = "",
    lastErrorTime: Long = -1L
  ) {
}
Example 129
Source File: SerializableWritable.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.ObjectWritable import org.apache.hadoop.io.Writable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils @DeveloperApi class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable { def value: T = t override def toString: String = t.toString private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() new ObjectWritable(t).write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() val ow = new ObjectWritable() ow.setConf(new Configuration(false)) ow.readFields(in) t = ow.get().asInstanceOf[T] } }
Example 130
Source File: JavaNewHadoopRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.mapreduce.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.NewHadoopRDD @DeveloperApi class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala, preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 131
Source File: JavaHadoopRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.hadoop.mapred.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.HadoopRDD @DeveloperApi class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, b.asJava).asScala, preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 132
Source File: RecoveryModeFactory.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 133
Source File: EnvironmentTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.env import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.ui._ private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") { val listener = parent.environmentListener attachPage(new EnvironmentPage(this)) } @DeveloperApi class EnvironmentListener extends SparkListener { var jvmInformation = Seq[(String, String)]() var sparkProperties = Seq[(String, String)]() var systemProperties = Seq[(String, String)]() var classpathEntries = Seq[(String, String)]() override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) { synchronized { val environmentDetails = environmentUpdate.environmentDetails jvmInformation = environmentDetails("JVM Information") sparkProperties = environmentDetails("Spark Properties") systemProperties = environmentDetails("System Properties") classpathEntries = environmentDetails("Classpath Entries") } } }
Example 134
Source File: StorageTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.storage import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage._ import org.apache.spark.ui._ private def updateRDDInfo(updatedBlocks: Seq[(BlockId, BlockStatus)]): Unit = { val rddIdsToUpdate = updatedBlocks.flatMap { case (bid, _) => bid.asRDDId.map(_.rddId) }.toSet val rddInfosToUpdate = _rddInfoMap.values.toSeq.filter { s => rddIdsToUpdate.contains(s.id) } StorageUtils.updateRddInfo(rddInfosToUpdate, activeStorageStatusList) } override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { val rddInfos = stageSubmitted.stageInfo.rddInfos rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info).name = info.name } } override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized { // Remove all partitions that are no longer cached in current completed stage val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet _rddInfoMap.retain { case (id, info) => !completedRddIds.contains(id) || info.numCachedPartitions > 0 } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { _rddInfoMap.remove(unpersistRDD.rddId) } override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { super.onBlockUpdated(blockUpdated) val blockId = blockUpdated.blockUpdatedInfo.blockId val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel val memSize = blockUpdated.blockUpdatedInfo.memSize val diskSize = blockUpdated.blockUpdatedInfo.diskSize val blockStatus = BlockStatus(storageLevel, memSize, diskSize) updateRDDInfo(Seq((blockId, blockStatus))) } }
Example 135
Source File: StageInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskMetrics, taskLocalityPreferences) } }
Example 136
Source File: AccumulableInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.annotation.DeveloperApi object AccumulableInfo { @deprecated("do not create AccumulableInfo", "2.0.0") def apply( id: Long, name: String, update: Option[String], value: String, internal: Boolean): AccumulableInfo = { new AccumulableInfo( id, Option(name), update, Option(value), internal, countFailedValues = false) } @deprecated("do not create AccumulableInfo", "2.0.0") def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = { new AccumulableInfo( id, Option(name), update, Option(value), internal = false, countFailedValues = false) } @deprecated("do not create AccumulableInfo", "2.0.0") def apply(id: Long, name: String, value: String): AccumulableInfo = { new AccumulableInfo( id, Option(name), None, Option(value), internal = false, countFailedValues = false) } }
Example 137
Source File: SplitInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi // information about a specific split instance : handles both split instances. // So that we do not need to worry about the differences. @DeveloperApi class SplitInfo( val inputFormatClazz: Class[_], val hostLocation: String, val path: String, val length: Long, val underlyingSplit: Any) { override def toString(): String = { "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + ", length : " + length + ", underlyingSplit " + underlyingSplit } override def hashCode(): Int = { var hashCode = inputFormatClazz.hashCode hashCode = hashCode * 31 + hostLocation.hashCode hashCode = hashCode * 31 + path.hashCode // ignore overflow ? It is hashcode anyway ! hashCode = hashCode * 31 + (length & 0x7fffffff).toInt hashCode } // This is practically useless since most of the Split impl's don't seem to implement equals :-( // So unless there is identity equality between underlyingSplits, it will always fail even if it // is pointing to same block. override def equals(other: Any): Boolean = other match { case that: SplitInfo => this.hostLocation == that.hostLocation && this.inputFormatClazz == that.inputFormatClazz && this.path == that.path && this.length == that.length && // other split specific checks (like start for FileSplit) this.underlyingSplit == that.underlyingSplit case _ => false } } object SplitInfo { def toSplitInfo(inputFormatClazz: Class[_], path: String, mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapredSplit.getLength for (host <- mapredSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit) } retval } def toSplitInfo(inputFormatClazz: Class[_], path: String, mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapreduceSplit.getLength for (host <- mapreduceSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit) } retval } }
Example 138
Source File: TaskInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.ListBuffer import org.apache.spark.TaskState import org.apache.spark.TaskState.TaskState import org.apache.spark.annotation.DeveloperApi var finishTime: Long = 0 var failed = false var killed = false private[spark] def markGettingResult(time: Long = System.currentTimeMillis) { gettingResultTime = time } private[spark] def markFinished(state: TaskState, time: Long = System.currentTimeMillis) { finishTime = time if (state == TaskState.FAILED) { failed = true } else if (state == TaskState.KILLED) { killed = true } } def gettingResult: Boolean = gettingResultTime != 0 def finished: Boolean = finishTime != 0 def successful: Boolean = finished && !failed && !killed def running: Boolean = !finished def status: String = { if (running) { if (gettingResult) { "GET RESULT" } else { "RUNNING" } } else if (failed) { "FAILED" } else if (killed) { "KILLED" } else if (successful) { "SUCCESS" } else { "UNKNOWN" } } def id: String = s"$index.$attemptNumber" def duration: Long = { if (!finished) { throw new UnsupportedOperationException("duration() called on unfinished task") } else { finishTime - launchTime } } private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime }
Example 139
Source File: ExecutorInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.annotation.DeveloperApi @DeveloperApi class ExecutorInfo( val executorHost: String, val totalCores: Int, val logUrlMap: Map[String, String]) { def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo] override def equals(other: Any): Boolean = other match { case that: ExecutorInfo => (that canEqual this) && executorHost == that.executorHost && totalCores == that.totalCores && logUrlMap == that.logUrlMap case _ => false } override def hashCode(): Int = { val state = Seq(executorHost, totalCores, logUrlMap) state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) } }
Example 140
Source File: taskListeners.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.util.EventListener import org.apache.spark.TaskContext import org.apache.spark.annotation.DeveloperApi private[spark] class TaskCompletionListenerException( errorMessages: Seq[String], val previousError: Option[Throwable] = None) extends RuntimeException { override def getMessage: String = { if (errorMessages.size == 1) { errorMessages.head } else { errorMessages.zipWithIndex.map { case (msg, i) => s"Exception $i: $msg" }.mkString("\n") } + previousError.map { e => "\n\nPrevious exception in task: " + e.getMessage + "\n" + e.getStackTrace.mkString("\t", "\n\t", "") }.getOrElse("") } }
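A minimal sketch of registering a completion callback from inside a task, which is the use case these listener types support, assuming an active SparkContext named sc:

// Hedged sketch: run per-partition cleanup/logging when each task finishes.
import org.apache.spark.TaskContext

sc.parallelize(1 to 100, 4).mapPartitions { iter =>
  val ctx = TaskContext.get()
  ctx.addTaskCompletionListener { _ =>
    println(s"partition ${ctx.partitionId()} finished")
  }
  iter.map(_ * 2)
}.count()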
Example 141
Source File: StorageStatusListener.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import scala.collection.mutable import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ private def updateStorageStatus(unpersistedRDDId: Int) { storageStatusList.foreach { storageStatus => storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) => storageStatus.removeBlock(blockId) } } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { updateStorageStatus(unpersistRDD.rddId) } override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) { synchronized { val blockManagerId = blockManagerAdded.blockManagerId val executorId = blockManagerId.executorId val maxMem = blockManagerAdded.maxMem val storageStatus = new StorageStatus(blockManagerId, maxMem) executorIdToStorageStatus(executorId) = storageStatus // Try to remove the dead storage status if same executor register the block manager twice. deadExecutorStorageStatus.zipWithIndex.find(_._1.blockManagerId.executorId == executorId) .foreach(toRemoveExecutor => deadExecutorStorageStatus.remove(toRemoveExecutor._2)) } } override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) { synchronized { val executorId = blockManagerRemoved.blockManagerId.executorId executorIdToStorageStatus.remove(executorId).foreach { status => deadExecutorStorageStatus += status } if (deadExecutorStorageStatus.size > retainedDeadExecutors) { deadExecutorStorageStatus.trimStart(1) } } } override def onBlockUpdated(blockUpdated: SparkListenerBlockUpdated): Unit = { val executorId = blockUpdated.blockUpdatedInfo.blockManagerId.executorId val blockId = blockUpdated.blockUpdatedInfo.blockId val storageLevel = blockUpdated.blockUpdatedInfo.storageLevel val memSize = blockUpdated.blockUpdatedInfo.memSize val diskSize = blockUpdated.blockUpdatedInfo.diskSize val blockStatus = BlockStatus(storageLevel, memSize, diskSize) updateStorageStatus(executorId, Seq((blockId, blockStatus))) } }
Example 142
Source File: RDDInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.{RDD, RDDOperationScope} import org.apache.spark.util.Utils @DeveloperApi class RDDInfo( val id: Int, var name: String, val numPartitions: Int, var storageLevel: StorageLevel, val parentIds: Seq[Int], val callSite: String = "", val scope: Option[RDDOperationScope] = None) extends Ordered[RDDInfo] { var numCachedPartitions = 0 var memSize = 0L var diskSize = 0L var externalBlockStoreSize = 0L def isCached: Boolean = (memSize + diskSize > 0) && numCachedPartitions > 0 override def toString: String = { import Utils.bytesToString ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " + "MemorySize: %s; DiskSize: %s").format( name, id, storageLevel.toString, numCachedPartitions, numPartitions, bytesToString(memSize), bytesToString(diskSize)) } override def compare(that: RDDInfo): Int = { this.id - that.id } } private[spark] object RDDInfo { def fromRdd(rdd: RDD[_]): RDDInfo = { val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd)) val parentIds = rdd.dependencies.map(_.rdd.id) new RDDInfo(rdd.id, rddName, rdd.partitions.length, rdd.getStorageLevel, parentIds, rdd.creationSite.shortForm, rdd.scope) } }
Example 143
Source File: BlockUpdatedInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.BlockManagerMessages.UpdateBlockInfo @DeveloperApi case class BlockUpdatedInfo( blockManagerId: BlockManagerId, blockId: BlockId, storageLevel: StorageLevel, memSize: Long, diskSize: Long) private[spark] object BlockUpdatedInfo { private[spark] def apply(updateBlockInfo: UpdateBlockInfo): BlockUpdatedInfo = { BlockUpdatedInfo( updateBlockInfo.blockManagerId, updateBlockInfo.blockId, updateBlockInfo.storageLevel, updateBlockInfo.memSize, updateBlockInfo.diskSize) } }
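BlockUpdatedInfo normally arrives through SparkListenerBlockUpdated events rather than being constructed by hand; a sketch of a listener that logs each update, assuming an existing SparkContext sc:

import org.apache.spark.scheduler.{SparkListener, SparkListenerBlockUpdated}

class BlockUpdateLogger extends SparkListener {
  override def onBlockUpdated(event: SparkListenerBlockUpdated): Unit = {
    val info = event.blockUpdatedInfo
    println(s"${info.blockId} on ${info.blockManagerId.executorId}: " +
      s"level=${info.storageLevel}, mem=${info.memSize}, disk=${info.diskSize}")
  }
}

sc.addSparkListener(new BlockUpdateLogger())   // sc: an existing SparkContext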
Example 144
Source File: TopologyMapper.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.storage import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.util.Utils @DeveloperApi class FileBasedTopologyMapper(conf: SparkConf) extends TopologyMapper(conf) with Logging { val topologyFile = conf.getOption("spark.storage.replication.topologyFile") require(topologyFile.isDefined, "Please specify topology file via " + "spark.storage.replication.topologyFile for FileBasedTopologyMapper.") val topologyMap = Utils.getPropertiesFromFile(topologyFile.get) override def getTopologyForHost(hostname: String): Option[String] = { val topology = topologyMap.get(hostname) if (topology.isDefined) { logDebug(s"$hostname -> ${topology.get}") } else { logWarning(s"$hostname does not have any topology information") } topology } }
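A sketch of wiring the mapper up, assuming a hypothetical properties file that maps host names to topology strings (for example a line such as host-a=/rack-1):

import org.apache.spark.SparkConf
import org.apache.spark.storage.FileBasedTopologyMapper

val conf = new SparkConf()
  .set("spark.storage.replication.topologyFile", "/etc/spark/topology.properties")  // hypothetical path
val mapper = new FileBasedTopologyMapper(conf)
mapper.getTopologyForHost("host-a")   // Some("/rack-1") if the host is listed, None (with a warning) otherwise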
Example 145
Source File: ShuffleWriteMetrics.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.executor import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.LongAccumulator def writeTime: Long = _writeTime.sum private[spark] def incBytesWritten(v: Long): Unit = _bytesWritten.add(v) private[spark] def incRecordsWritten(v: Long): Unit = _recordsWritten.add(v) private[spark] def incWriteTime(v: Long): Unit = _writeTime.add(v) private[spark] def decBytesWritten(v: Long): Unit = { _bytesWritten.setValue(bytesWritten - v) } private[spark] def decRecordsWritten(v: Long): Unit = { _recordsWritten.setValue(recordsWritten - v) } // Legacy methods for backward compatibility. // TODO: remove these once we make this class private. @deprecated("use bytesWritten instead", "2.0.0") def shuffleBytesWritten: Long = bytesWritten @deprecated("use writeTime instead", "2.0.0") def shuffleWriteTime: Long = writeTime @deprecated("use recordsWritten instead", "2.0.0") def shuffleRecordsWritten: Long = recordsWritten }
Example 146
Source File: InterruptibleIterator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import org.apache.spark.annotation.DeveloperApi @DeveloperApi class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T]) extends Iterator[T] { def hasNext: Boolean = { // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which // introduces an expensive read fence. if (context.isInterrupted) { throw new TaskKilledException } else { delegate.hasNext } } def next(): T = delegate.next() }
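A sketch of the usual wrapping pattern inside a task, assuming rdd is an existing RDD; TaskContext.get() returns the context of the running task, and a kill request then surfaces as TaskKilledException on the next hasNext:

import org.apache.spark.{InterruptibleIterator, TaskContext}

val interruptible = rdd.mapPartitions { iter =>
  // Wrap the partition iterator so task kills are honoured between records.
  new InterruptibleIterator(TaskContext.get(), iter)
}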
Example 147
Source File: ShuffledRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.hadoop.security.UserGroupInformation import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = index override def equals(other: Any): Boolean = super.equals(other) } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { val serializer = userSpecifiedSerializer.getOrElse { val serializerManager = SparkEnv.get(user).serializerManager if (mapSideCombine) { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]]) } else { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]]) } } List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override protected def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get(user).mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] tracker.getPreferredLocationsForShuffle(dep, partition.index) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv .get(user) .shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 148
Source File: OrderedRDDFunctions.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Partitioner, RangePartitioner} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging def filterByRange(lower: K, upper: K): RDD[P] = self.withScope { def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper) val rddToFilter: RDD[P] = self.partitioner match { case Some(rp: RangePartitioner[K, V]) => val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match { case (l, u) => Math.min(l, u) to Math.max(l, u) } PartitionPruningRDD.create(self, partitionIndicies.contains) case _ => self } rddToFilter.filter { case (k, v) => inRange(k) } } }
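filterByRange is most useful on an RDD that already carries a RangePartitioner, because whole partitions outside the bounds can be pruned; a small sketch assuming an existing SparkContext sc:

val pairs = sc.parallelize(Seq((3, "c"), (9, "i"), (1, "a"), (6, "f")))
// sortByKey installs a RangePartitioner, so the bounds prune partitions before filtering rows.
val ranged = pairs.sortByKey().filterByRange(2, 7)
ranged.collect()   // Array((3,c), (6,f))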
Example 149
Source File: UnionRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import scala.collection.mutable.ArrayBuffer import scala.collection.parallel.{ForkJoinTaskSupport, ThreadPoolTaskSupport} import scala.concurrent.forkjoin.ForkJoinPool import scala.reflect.ClassTag import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class UnionPartition[T: ClassTag]( idx: Int, @transient private val rdd: RDD[T], val parentRddIndex: Int, @transient private val parentRddPartitionIndex: Int) extends Partition { var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex) def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { // Update the reference to parent split at the time of task serialization parentPartition = rdd.partitions(parentRddPartitionIndex) oos.defaultWriteObject() } } object UnionRDD { private[spark] lazy val partitionEvalTaskSupport = new ForkJoinTaskSupport(new ForkJoinPool(8)) } @DeveloperApi class UnionRDD[T: ClassTag]( sc: SparkContext, var rdds: Seq[RDD[T]]) extends RDD[T](sc, Nil) { // Nil since we implement getDependencies // visible for testing private[spark] val isPartitionListingParallel: Boolean = rdds.length > conf.getInt("spark.rdd.parallelListingThreshold", 10) override def getPartitions: Array[Partition] = { val parRDDs = if (isPartitionListingParallel) { val parArray = rdds.par parArray.tasksupport = UnionRDD.partitionEvalTaskSupport parArray } else { rdds } val array = new Array[Partition](parRDDs.map(_.partitions.length).seq.sum) var pos = 0 for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) { array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index) pos += 1 } array } override def getDependencies: Seq[Dependency[_]] = { val deps = new ArrayBuffer[Dependency[_]] var pos = 0 for (rdd <- rdds) { deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length) pos += rdd.partitions.length } deps } override def compute(s: Partition, context: TaskContext): Iterator[T] = { val part = s.asInstanceOf[UnionPartition[T]] parent[T](part.parentRddIndex).iterator(part.parentPartition, context) } override def getPreferredLocations(s: Partition): Seq[String] = s.asInstanceOf[UnionPartition[T]].preferredLocations() override def clearDependencies() { super.clearDependencies() rdds = null } }
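A short sketch, assuming an existing SparkContext sc; sc.union builds the same UnionRDD internally, and the result keeps one partition per parent partition with only narrow (range) dependencies:

import org.apache.spark.rdd.UnionRDD

val a = sc.parallelize(1 to 3, numSlices = 2)
val b = sc.parallelize(4 to 6, numSlices = 3)
val union = new UnionRDD(sc, Seq(a, b))
union.partitions.length   // 5 = 2 + 3
union.collect()           // Array(1, 2, 3, 4, 5, 6)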
Example 150
Source File: AttributeType.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.attribute import org.apache.spark.annotation.DeveloperApi def fromName(name: String): AttributeType = { if (name == Numeric.name) { Numeric } else if (name == Nominal.name) { Nominal } else if (name == Binary.name) { Binary } else if (name == Unresolved.name) { Unresolved } else { throw new IllegalArgumentException(s"Cannot recognize type $name.") } } }
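The factory simply maps the canonical names back to the singleton attribute types, for instance:

import org.apache.spark.ml.attribute.AttributeType

AttributeType.fromName("numeric") == AttributeType.Numeric   // true
AttributeType.fromName("nominal").name                       // "nominal"
// AttributeType.fromName("something-else") throws IllegalArgumentException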
Example 151
Source File: Transformer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import scala.annotation.varargs import org.apache.spark.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ protected def validateInputType(inputType: DataType): Unit = {} override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType validateInputType(inputType) if (schema.fieldNames.contains($(outputCol))) { throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.") } val outputFields = schema.fields :+ StructField($(outputCol), outputDataType, nullable = false) StructType(outputFields) } override def transform(dataset: DataFrame): DataFrame = { transformSchema(dataset.schema, logging = true) dataset.withColumn($(outputCol), callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol)))) } override def copy(extra: ParamMap): T = defaultCopy(extra) }
Example 152
Source File: Predict.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.model import org.apache.spark.annotation.DeveloperApi @DeveloperApi class Predict( val predict: Double, val prob: Double = 0.0) extends Serializable { override def toString: String = s"$predict (prob = $prob)" override def equals(other: Any): Boolean = { other match { case p: Predict => predict == p.predict && prob == p.prob case _ => false } } override def hashCode: Int = { com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double) } }
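A tiny sketch; equality compares both the predicted label and its probability:

import org.apache.spark.mllib.tree.model.Predict

val p = new Predict(1.0, prob = 0.85)
p.toString                      // "1.0 (prob = 0.85)"
p == new Predict(1.0, 0.85)     // true
p == new Predict(1.0, 0.5)      // false: probabilities differ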
Example 153
Source File: DataValidators.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.annotation.DeveloperApi import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data => val numInvalid = data.filter(x => x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count() if (numInvalid != 0) { logError("Classification labels should be in {0 to " + (k - 1) + "}. " + "Found " + numInvalid + " invalid labels") } numInvalid == 0 } }
Example 154
Source File: KMeansDataGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD def generateKMeansRDD( sc: SparkContext, numPoints: Int, k: Int, d: Int, r: Double, numPartitions: Int = 2) : RDD[Array[Double]] = { // First, generate some centers val rand = new Random(42) val centers = Array.fill(k)(Array.fill(d)(rand.nextGaussian() * r)) // Then generate points around each center sc.parallelize(0 until numPoints, numPartitions).map { idx => val center = centers(idx % k) val rand2 = new Random(42 + idx) Array.tabulate(d)(i => center(i) + rand2.nextGaussian()) } } def main(args: Array[String]) { if (args.length < 6) { println("Usage: KMeansGenerator " + "<master> <output_dir> <num_points> <k> <d> <r> [<num_partitions>]") System.exit(1) } val sparkMaster = args(0) val outputPath = args(1) val numPoints = args(2).toInt val k = args(3).toInt val d = args(4).toInt val r = args(5).toDouble val parts = if (args.length >= 7) args(6).toInt else 2 val sc = new SparkContext(sparkMaster, "KMeansDataGenerator") val data = generateKMeansRDD(sc, numPoints, k, d, r, parts) data.map(_.mkString(" ")).saveAsTextFile(outputPath) System.exit(0) } }
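A sketch of driving the generator programmatically instead of through main, assuming an existing SparkContext sc:

import org.apache.spark.mllib.util.KMeansDataGenerator

// 1000 points in 3 dimensions, drawn around 5 Gaussian centers scaled by r = 2.0, in 4 partitions.
val points = KMeansDataGenerator.generateKMeansRDD(sc, numPoints = 1000, k = 5, d = 3, r = 2.0, numPartitions = 4)
points.first().length   // 3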
Example 155
Source File: LogisticRegressionDataGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.linalg.Vectors def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } def main(args: Array[String]) { if (args.length != 5) { println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 156
Source File: SVMDataGenerator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint @DeveloperApi object SVMDataGenerator { def main(args: Array[String]) { if (args.length < 2) { println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures + 1)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 157
Source File: SparkCommandLine.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.repl import scala.tools.nsc.{Settings, CompilerCommand} import scala.Predef._ import org.apache.spark.annotation.DeveloperApi @DeveloperApi class SparkCommandLine(args: List[String], override val settings: Settings) extends CompilerCommand(args, settings) { def this(args: List[String], error: String => Unit) { this(args, new SparkRunnerSettings(error)) } def this(args: List[String]) { this(args, str => Console.println("Error: " + str)) } }
Example 158
Source File: DescribeHiveTableCommand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 159
Source File: package.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.rules import org.apache.spark.util.Utils @DeveloperApi object DumpByteCode { import scala.sys.process._ val dumpDirectory = Utils.createTempDir() dumpDirectory.mkdir() def apply(obj: Any): Unit = { val generatedClass = obj.getClass val classLoader = generatedClass .getClassLoader .asInstanceOf[scala.tools.nsc.interpreter.AbstractFileClassLoader] val generatedBytes = classLoader.classBytes(generatedClass.getName) val packageDir = new java.io.File(dumpDirectory, generatedClass.getPackage.getName) if (!packageDir.exists()) { packageDir.mkdir() } val classFile = new java.io.File(packageDir, generatedClass.getName.split("\\.").last + ".class") val outfile = new java.io.FileOutputStream(classFile) outfile.write(generatedBytes) outfile.close() println( s"javap -p -v -classpath ${dumpDirectory.getCanonicalPath} ${generatedClass.getName}".!!) } } }
Example 160
Source File: AnalysisException.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.annotation.DeveloperApi @DeveloperApi class AnalysisException protected[sql] ( val message: String, val line: Option[Int] = None, val startPosition: Option[Int] = None) extends Exception with Serializable { def withPosition(line: Option[Int], startPosition: Option[Int]): AnalysisException = { val newException = new AnalysisException(message, line, startPosition) newException.setStackTrace(getStackTrace) newException } override def getMessage: String = { val lineAnnotation = line.map(l => s" line $l").getOrElse("") val positionAnnotation = startPosition.map(p => s" pos $p").getOrElse("") s"$message;$lineAnnotation$positionAnnotation" } }
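A hedged usage sketch: callers normally meet this exception when analysis of a query fails, so the table and column below are deliberately made up and sqlContext is assumed to exist:

import org.apache.spark.sql.AnalysisException

try {
  sqlContext.sql("SELECT no_such_column FROM some_table")
} catch {
  case e: AnalysisException =>
    // line and startPosition are only filled in when the parser/analyzer knows them
    println(s"${e.message} (line=${e.line}, pos=${e.startPosition})")
}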
Example 161
Source File: LeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.catalyst.plans.physical.ClusteredDistribution import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class LeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && hashSet.contains(joinKeys.currentValue) }) } } }
Example 162
Source File: BroadcastHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.rdd.RDD import org.apache.spark.util.ThreadUtils import scala.concurrent._ import scala.concurrent.duration._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Row, Expression} import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning, UnspecifiedDistribution} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { val timeout: Duration = { val timeoutValue = sqlContext.conf.broadcastTimeout if (timeoutValue < 0) { Duration.Inf } else { timeoutValue.seconds } } override def outputPartitioning: Partitioning = streamedPlan.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = UnspecifiedDistribution :: UnspecifiedDistribution :: Nil @transient lazy val broadcastFuture = future { // Note that we use .execute().collect() because we don't want to convert data to Scala types val input: Array[Row] = buildPlan.execute().map(_.copy()).collect() val hashed = HashedRelation(input.iterator, buildSideKeyGenerator, input.length) sparkContext.broadcast(hashed) }(BroadcastHashJoin.broadcastHashJoinExecutionContext) protected override def doExecute(): RDD[Row] = { val broadcastRelation = Await.result(broadcastFuture, timeout) streamedPlan.execute().mapPartitions { streamedIter => hashJoin(streamedIter, broadcastRelation.value) } } } object BroadcastHashJoin { private[sql] val broadcastHashJoinExecutionContext = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("broadcast-hash-join", 128)) }
Example 163
Source File: BroadcastLeftSemiJoinHash.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, Row} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class BroadcastLeftSemiJoinHash( leftKeys: Seq[Expression], rightKeys: Seq[Expression], left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override val buildSide: BuildSide = BuildRight override def output: Seq[Attribute] = left.output protected override def doExecute(): RDD[Row] = { val buildIter = buildPlan.execute().map(_.copy()).collect().toIterator val hashSet = new java.util.HashSet[Row]() var currentRow: Row = null // Create a Hash set of buildKeys while (buildIter.hasNext) { currentRow = buildIter.next() val rowKey = buildSideKeyGenerator(currentRow) if (!rowKey.anyNull) { val keyExists = hashSet.contains(rowKey) if (!keyExists) { hashSet.add(rowKey) } } } val broadcastedRelation = sparkContext.broadcast(hashSet) streamedPlan.execute().mapPartitions { streamIter => val joinKeys = streamSideKeyGenerator() streamIter.filter(current => { !joinKeys(current).anyNull && broadcastedRelation.value.contains(joinKeys.currentValue) }) } } }
Example 164
Source File: LeftSemiJoinBNL.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.Partitioning import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} override def right: SparkPlan = broadcast @transient private lazy val boundCondition = newPredicate(condition.getOrElse(Literal(true)), left.output ++ right.output) protected override def doExecute(): RDD[Row] = { val broadcastedRelation = sparkContext.broadcast(broadcast.execute().map(_.copy()).collect().toIndexedSeq) streamed.execute().mapPartitions { streamedIter => val joinedRow = new JoinedRow streamedIter.filter(streamedRow => { var i = 0 var matched = false while (i < broadcastedRelation.value.size && !matched) { val broadcastedRow = broadcastedRelation.value(i) if (boundCondition(joinedRow(streamedRow, broadcastedRow))) { matched = true } i += 1 } matched }) } } }
Example 165
Source File: ShuffledHashJoin.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.physical.{ClusteredDistribution, Partitioning} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class ShuffledHashJoin( leftKeys: Seq[Expression], rightKeys: Seq[Expression], buildSide: BuildSide, left: SparkPlan, right: SparkPlan) extends BinaryNode with HashJoin { override def outputPartitioning: Partitioning = left.outputPartitioning override def requiredChildDistribution: Seq[ClusteredDistribution] = ClusteredDistribution(leftKeys) :: ClusteredDistribution(rightKeys) :: Nil protected override def doExecute(): RDD[Row] = { buildPlan.execute().zipPartitions(streamedPlan.execute()) { (buildIter, streamIter) => val hashed = HashedRelation(buildIter, buildSideKeyGenerator) hashJoin(streamIter, hashed) } } }
Example 166
Source File: CartesianProduct.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.joins import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.{Attribute, JoinedRow} import org.apache.spark.sql.execution.{BinaryNode, SparkPlan} @DeveloperApi case class CartesianProduct(left: SparkPlan, right: SparkPlan) extends BinaryNode { override def output: Seq[Attribute] = left.output ++ right.output protected override def doExecute(): RDD[Row] = { val leftResults = left.execute().map(_.copy()) val rightResults = right.execute().map(_.copy()) leftResults.cartesian(rightResults).mapPartitions { iter => val joinedRow = new JoinedRow iter.map(r => joinedRow(r._1, r._2)) } } }
Example 167
Source File: Expand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.errors._ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical.{UnknownPartitioning, Partitioning} @DeveloperApi case class Expand( projections: Seq[GroupExpression], output: Seq[Attribute], child: SparkPlan) extends UnaryNode { // The GroupExpressions can output data with arbitrary partitioning, so set it // as UNKNOWN partitioning override def outputPartitioning: Partitioning = UnknownPartitioning(0) protected override def doExecute(): RDD[Row] = attachTree(this, "execute") { child.execute().mapPartitions { iter => // TODO Move out projection objects creation and transfer to // workers via closure. However we can't assume the Projection // is serializable because of the code gen, so we have to // create the projections within each of the partition processing. val groups = projections.map(ee => newProjection(ee.children, child.output)).toArray new Iterator[Row] { private[this] var result: Row = _ private[this] var idx = -1 // -1 means the initial state private[this] var input: Row = _ override final def hasNext: Boolean = (-1 < idx && idx < groups.length) || iter.hasNext override final def next(): Row = { if (idx <= 0) { // in the initial (-1) or beginning(0) of a new input row, fetch the next input tuple input = iter.next() idx = 0 } result = groups(idx)(input) idx += 1 if (idx == groups.length && iter.hasNext) { idx = 0 } result } } } } }
Example 168
Source File: Generate.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions._ @DeveloperApi case class Generate( generator: Generator, join: Boolean, outer: Boolean, output: Seq[Attribute], child: SparkPlan) extends UnaryNode { val boundGenerator = BindReferences.bindReference(generator, child.output) protected override def doExecute(): RDD[Row] = { // boundGenerator.terminate() should be triggered after all of the rows in the partition if (join) { child.execute().mapPartitions { iter => val generatorNullRow = Row.fromSeq(Seq.fill[Any](generator.elementTypes.size)(null)) val joinedRow = new JoinedRow iter.flatMap { row => // we should always set the left (child output) joinedRow.withLeft(row) val outputRows = boundGenerator.eval(row) if (outer && outputRows.isEmpty) { joinedRow.withRight(generatorNullRow) :: Nil } else { outputRows.map(or => joinedRow.withRight(or)) } } ++ LazyIterator(() => boundGenerator.terminate()).map { row => // we leave the left side as the last element of its child output // keep it the same as Hive does joinedRow.withRight(row) } } } else { child.execute().mapPartitions { iter => iter.flatMap(row => boundGenerator.eval(row)) ++ LazyIterator(() => boundGenerator.terminate()) } } } }
Example 169
Source File: ExistingRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} private[sql] case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext) extends LogicalPlan with MultiInstanceRelation { override def children: Seq[LogicalPlan] = Nil override def newInstance(): this.type = LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type] override def sameResult(plan: LogicalPlan): Boolean = plan match { case LogicalRDD(_, otherRDD) => rows == rows case _ => false } @transient override lazy val statistics: Statistics = Statistics( // TODO: Improve the statistics estimation. // This is made small enough so it can be broadcasted. sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1 ) }
Example 170
Source File: StreamingListener.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.Queue import org.apache.spark.util.Distribution import org.apache.spark.annotation.DeveloperApi @DeveloperApi class StatsReportListener(numBatchInfos: Int = 10) extends StreamingListener { // Queue containing latest completed batches val batchInfos = new Queue[BatchInfo]() override def onBatchCompleted(batchStarted: StreamingListenerBatchCompleted) { batchInfos.enqueue(batchStarted.batchInfo) if (batchInfos.size > numBatchInfos) batchInfos.dequeue() printStats() } def printStats() { showMillisDistribution("Total delay: ", _.totalDelay) showMillisDistribution("Processing time: ", _.processingDelay) } def showMillisDistribution(heading: String, getMetric: BatchInfo => Option[Long]) { org.apache.spark.scheduler.StatsReportListener.showMillisDistribution( heading, extractDistribution(getMetric)) } def extractDistribution(getMetric: BatchInfo => Option[Long]): Option[Distribution] = { Distribution(batchInfos.flatMap(getMetric(_)).map(_.toDouble)) } }
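Registering the listener is one line against a StreamingContext (ssc below is assumed to exist); it then prints delay distributions as batches complete:

import org.apache.spark.streaming.scheduler.StatsReportListener

ssc.addStreamingListener(new StatsReportListener(numBatchInfos = 20))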
Example 171
Source File: ReceiverInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rpc.RpcEndpointRef @DeveloperApi case class ReceiverInfo( streamId: Int, name: String, private[streaming] val endpoint: RpcEndpointRef, active: Boolean, location: String, lastErrorMessage: String = "", lastError: String = "", lastErrorTime: Long = -1L ) { }
Example 172
Source File: SerializableWritable.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark import java.io._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.ObjectWritable import org.apache.hadoop.io.Writable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils @DeveloperApi class SerializableWritable[T <: Writable](@transient var t: T) extends Serializable { def value: T = t override def toString: String = t.toString private def writeObject(out: ObjectOutputStream): Unit = Utils.tryOrIOException { out.defaultWriteObject() new ObjectWritable(t).write(out) } private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() val ow = new ObjectWritable() ow.setConf(new Configuration()) ow.readFields(in) t = ow.get().asInstanceOf[T] } }
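The classic use is shipping a Hadoop Configuration (a Writable, but not Serializable) to executors; a sketch assuming an existing SparkContext sc and RDD rdd:

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SerializableWritable

val wrappedConf = sc.broadcast(new SerializableWritable(new Configuration()))
rdd.foreachPartition { _ =>
  val conf: Configuration = wrappedConf.value.value   // unwrap on the executor side
  // use conf to open HDFS readers, create output committers, etc.
}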
Example 173
Source File: JavaNewHadoopRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.hadoop.mapreduce.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.NewHadoopRDD @DeveloperApi class JavaNewHadoopRDD[K, V](rdd: NewHadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 174
Source File: JavaHadoopRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.api.java import scala.collection.JavaConversions._ import scala.reflect.ClassTag import org.apache.hadoop.mapred.InputSplit import org.apache.spark.annotation.DeveloperApi import org.apache.spark.api.java.JavaSparkContext._ import org.apache.spark.api.java.function.{Function2 => JFunction2} import org.apache.spark.rdd.HadoopRDD @DeveloperApi class JavaHadoopRDD[K, V](rdd: HadoopRDD[K, V]) (implicit override val kClassTag: ClassTag[K], implicit override val vClassTag: ClassTag[V]) extends JavaPairRDD[K, V](rdd) { @DeveloperApi def mapPartitionsWithInputSplit[R]( f: JFunction2[InputSplit, java.util.Iterator[(K, V)], java.util.Iterator[R]], preservesPartitioning: Boolean = false): JavaRDD[R] = { new JavaRDD(rdd.mapPartitionsWithInputSplit((a, b) => f.call(a, asJavaIterator(b)), preservesPartitioning)(fakeClassTag))(fakeClassTag) } }
Example 175
Source File: DriverInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.util.Date import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.DriverDescription import org.apache.spark.util.Utils private[deploy] class DriverInfo( val startTime: Long, val id: String, val desc: DriverDescription, val submitDate: Date) extends Serializable { @transient var state: DriverState.Value = DriverState.SUBMITTED @transient var worker: Option[WorkerInfo] = None init() private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() init() } private def init(): Unit = { state = DriverState.SUBMITTED worker = None exception = None } }
package org.apache.spark.deploy.master import java.util.Date import org.apache.spark.annotation.DeveloperApi import org.apache.spark.deploy.DriverDescription import org.apache.spark.util.Utils private[deploy] class DriverInfo( val startTime: Long, val id: String, val desc: DriverDescription, val submitDate: Date) extends Serializable { @transient var state: DriverState.Value = DriverState.SUBMITTED @transient var exception: Option[Exception] = None @transient var worker: Option[WorkerInfo] = None init() private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() init() } private def init(): Unit = { state = DriverState.SUBMITTED worker = None exception = None } }
Example 176
Source File: WorkerInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import scala.collection.mutable import akka.actor.ActorRef import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.Utils private[spark] class WorkerInfo( val id: String, val host: String, val port: Int, val cores: Int, val memory: Int, val actor: ActorRef, val webUiPort: Int, val publicAddress: String) extends Serializable { Utils.checkHost(host, "Expected hostname") assert (port > 0) @transient var executors: mutable.HashMap[String, ExecutorDesc] = _ // executorId => info @transient var drivers: mutable.HashMap[String, DriverInfo] = _ // driverId => info @transient var state: WorkerState.Value = _ @transient var coresUsed: Int = _ @transient var memoryUsed: Int = _ @transient var lastHeartbeat: Long = _ init() def coresFree: Int = cores - coresUsed def memoryFree: Int = memory - memoryUsed private def readObject(in: java.io.ObjectInputStream): Unit = Utils.tryOrIOException { in.defaultReadObject() init() } private def init() { executors = new mutable.HashMap drivers = new mutable.HashMap state = WorkerState.ALIVE coresUsed = 0 memoryUsed = 0 lastHeartbeat = System.currentTimeMillis() } def hostPort: String = { assert (port > 0) host + ":" + port } def addExecutor(exec: ExecutorDesc) { executors(exec.fullId) = exec coresUsed += exec.cores memoryUsed += exec.memory } def removeExecutor(exec: ExecutorDesc) { if (executors.contains(exec.fullId)) { executors -= exec.fullId coresUsed -= exec.cores memoryUsed -= exec.memory } } def hasExecutor(app: ApplicationInfo): Boolean = { executors.values.exists(_.application == app) } def addDriver(driver: DriverInfo) { drivers(driver.id) = driver memoryUsed += driver.desc.mem coresUsed += driver.desc.cores } def removeDriver(driver: DriverInfo) { drivers -= driver.id memoryUsed -= driver.desc.mem coresUsed -= driver.desc.cores } def webUiAddress : String = { "http://" + this.publicAddress + ":" + this.webUiPort } def setState(state: WorkerState.Value): Unit = { this.state = state } }
Example 177
Source File: ExecutorsTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.exec import scala.collection.mutable.HashMap import org.apache.spark.ExceptionFailure import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage.{StorageStatus, StorageStatusListener} import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.ui.jobs.UIData.ExecutorUIData private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") { val listener = parent.executorsListener val sc = parent.sc val threadDumpEnabled = sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true) attachPage(new ExecutorsPage(this, threadDumpEnabled)) if (threadDumpEnabled) { attachPage(new ExecutorThreadDumpPage(this)) } } @DeveloperApi class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener { val executorToTasksActive = HashMap[String, Int]() val executorToTasksComplete = HashMap[String, Int]() val executorToTasksFailed = HashMap[String, Int]() val executorToDuration = HashMap[String, Long]() val executorToInputBytes = HashMap[String, Long]() val executorToInputRecords = HashMap[String, Long]() val executorToOutputBytes = HashMap[String, Long]() val executorToOutputRecords = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() val executorToLogUrls = HashMap[String, Map[String, String]]() val executorIdToData = HashMap[String, ExecutorUIData]() def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = synchronized { val eid = executorAdded.executorId executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap executorIdToData(eid) = ExecutorUIData(executorAdded.time) } override def onExecutorRemoved( executorRemoved: SparkListenerExecutorRemoved): Unit = synchronized { val eid = executorRemoved.executorId val uiData = executorIdToData(eid) uiData.finishTime = Some(executorRemoved.time) uiData.finishReason = Some(executorRemoved.reason) } override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized { val eid = taskStart.taskInfo.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1 } override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { val info = taskEnd.taskInfo if (info != null) { val eid = info.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1 executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration taskEnd.reason match { case e: ExceptionFailure => executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1 case _ => executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1 } // Update shuffle read/write val metrics = taskEnd.taskMetrics if (metrics != null) { metrics.inputMetrics.foreach { inputMetrics => executorToInputBytes(eid) = executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead executorToInputRecords(eid) = executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead } metrics.outputMetrics.foreach { outputMetrics => executorToOutputBytes(eid) = executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten executorToOutputRecords(eid) = executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten } metrics.shuffleReadMetrics.foreach { shuffleRead => executorToShuffleRead(eid) = executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead } metrics.shuffleWriteMetrics.foreach { shuffleWrite => executorToShuffleWrite(eid) = executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten } } } } }
Example 178
Source File: EnvironmentTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.env import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.ui._ private[ui] class EnvironmentTab(parent: SparkUI) extends SparkUITab(parent, "environment") { val listener = parent.environmentListener attachPage(new EnvironmentPage(this)) } @DeveloperApi class EnvironmentListener extends SparkListener { var jvmInformation = Seq[(String, String)]() var sparkProperties = Seq[(String, String)]() var systemProperties = Seq[(String, String)]() var classpathEntries = Seq[(String, String)]() override def onEnvironmentUpdate(environmentUpdate: SparkListenerEnvironmentUpdate) { synchronized { val environmentDetails = environmentUpdate.environmentDetails jvmInformation = environmentDetails("JVM Information") sparkProperties = environmentDetails("Spark Properties") systemProperties = environmentDetails("System Properties") classpathEntries = environmentDetails("Classpath Entries") } } }
Example 179
Source File: StorageTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.storage import scala.collection.mutable import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ui._ import org.apache.spark.scheduler._ import org.apache.spark.storage._ override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { val metrics = taskEnd.taskMetrics if (metrics != null && metrics.updatedBlocks.isDefined) { updateRDDInfo(metrics.updatedBlocks.get) } } override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = synchronized { val rddInfos = stageSubmitted.stageInfo.rddInfos rddInfos.foreach { info => _rddInfoMap.getOrElseUpdate(info.id, info) } } override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = synchronized { // Remove all partitions that are no longer cached in current completed stage val completedRddIds = stageCompleted.stageInfo.rddInfos.map(r => r.id).toSet _rddInfoMap.retain { case (id, info) => !completedRddIds.contains(id) || info.numCachedPartitions > 0 } } override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized { _rddInfoMap.remove(unpersistRDD.rddId) } }
Example 180
Source File: JavaSerializer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import java.io._ import java.nio.ByteBuffer import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.ByteBufferInputStream import org.apache.spark.util.Utils private[spark] class JavaSerializationStream( out: OutputStream, counterReset: Int, extraDebugInfo: Boolean) extends SerializationStream { private val objOut = new ObjectOutputStream(out) private var counter = 0 @DeveloperApi class JavaSerializer(conf: SparkConf) extends Serializer with Externalizable { private var counterReset = conf.getInt("spark.serializer.objectStreamReset", 100) private var extraDebugInfo = conf.getBoolean("spark.serializer.extraDebugInfo", true) protected def this() = this(new SparkConf()) // For deserialization only override def newInstance(): SerializerInstance = { val classLoader = defaultClassLoader.getOrElse(Thread.currentThread.getContextClassLoader) new JavaSerializerInstance(counterReset, extraDebugInfo, classLoader) } override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { out.writeInt(counterReset) out.writeBoolean(extraDebugInfo) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { counterReset = in.readInt() extraDebugInfo = in.readBoolean() } }
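A round-trip sketch; newInstance returns a thread-confined SerializerInstance whose serialize/deserialize work on any Java-serializable value:

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

val ser = new JavaSerializer(new SparkConf()).newInstance()
val bytes = ser.serialize(Map("a" -> 1, "b" -> 2))
val back = ser.deserialize[Map[String, Int]](bytes)   // Map(a -> 1, b -> 2)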
Example 181
Source File: StageInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, stage.attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details) } }
Example 182
Source File: AccumulableInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.annotation.DeveloperApi @DeveloperApi class AccumulableInfo ( val id: Long, val name: String, val update: Option[String], // represents a partial update within a task val value: String) { override def equals(other: Any): Boolean = other match { case acc: AccumulableInfo => this.id == acc.id && this.name == acc.name && this.update == acc.update && this.value == acc.value case _ => false } } object AccumulableInfo { def apply(id: Long, name: String, update: Option[String], value: String): AccumulableInfo = { new AccumulableInfo(id, name, update, value) } def apply(id: Long, name: String, value: String): AccumulableInfo = { new AccumulableInfo(id, name, None, value) } }
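The two factory methods only differ in whether a partial per-task update is attached; equality compares all four fields:

import org.apache.spark.scheduler.AccumulableInfo

val withUpdate = AccumulableInfo(1L, "records read", update = Some("25"), value = "100")
val totalOnly = AccumulableInfo(1L, "records read", "100")   // update defaults to None
withUpdate == totalOnly   // false: the update fields differ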
Example 183
Source File: SplitInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import collection.mutable.ArrayBuffer import org.apache.spark.annotation.DeveloperApi // information about a specific split instance : handles both split instances. // So that we do not need to worry about the differences. @DeveloperApi class SplitInfo( val inputFormatClazz: Class[_], val hostLocation: String, val path: String, val length: Long, val underlyingSplit: Any) { override def toString(): String = { "SplitInfo " + super.toString + " .. inputFormatClazz " + inputFormatClazz + ", hostLocation : " + hostLocation + ", path : " + path + ", length : " + length + ", underlyingSplit " + underlyingSplit } override def hashCode(): Int = { var hashCode = inputFormatClazz.hashCode hashCode = hashCode * 31 + hostLocation.hashCode hashCode = hashCode * 31 + path.hashCode // ignore overflow ? It is hashcode anyway ! hashCode = hashCode * 31 + (length & 0x7fffffff).toInt hashCode } // This is practically useless since most of the Split impl's dont seem to implement equals :-( // So unless there is identity equality between underlyingSplits, it will always fail even if it // is pointing to same block. override def equals(other: Any): Boolean = other match { case that: SplitInfo => { this.hostLocation == that.hostLocation && this.inputFormatClazz == that.inputFormatClazz && this.path == that.path && this.length == that.length && // other split specific checks (like start for FileSplit) this.underlyingSplit == that.underlyingSplit } case _ => false } } object SplitInfo { def toSplitInfo(inputFormatClazz: Class[_], path: String, mapredSplit: org.apache.hadoop.mapred.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapredSplit.getLength for (host <- mapredSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapredSplit) } retval } def toSplitInfo(inputFormatClazz: Class[_], path: String, mapreduceSplit: org.apache.hadoop.mapreduce.InputSplit): Seq[SplitInfo] = { val retval = new ArrayBuffer[SplitInfo]() val length = mapreduceSplit.getLength for (host <- mapreduceSplit.getLocations) { retval += new SplitInfo(inputFormatClazz, host, path, length, mapreduceSplit) } retval } }
Example 184
Source File: TaskInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.ListBuffer import org.apache.spark.annotation.DeveloperApi var finishTime: Long = 0 var failed = false private[spark] def markGettingResult(time: Long = System.currentTimeMillis) { gettingResultTime = time } private[spark] def markSuccessful(time: Long = System.currentTimeMillis) { finishTime = time } private[spark] def markFailed(time: Long = System.currentTimeMillis) { finishTime = time failed = true } def gettingResult: Boolean = gettingResultTime != 0 def finished: Boolean = finishTime != 0 def successful: Boolean = finished && !failed def running: Boolean = !finished def status: String = { if (running) { if (gettingResult) { "GET RESULT" } else { "RUNNING" } } else if (failed) { "FAILED" } else if (successful) { "SUCCESS" } else { "UNKNOWN" } } @deprecated("Use attemptNumber", "1.6.0") def attempt: Int = attemptNumber def id: String = s"$index.$attemptNumber" def duration: Long = { if (!finished) { throw new UnsupportedOperationException("duration() called on unfinished task") } else { finishTime - launchTime } } private[spark] def timeRunning(currentTime: Long): Long = currentTime - launchTime }
Example 185
Source File: ExecutorInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.annotation.DeveloperApi @DeveloperApi class ExecutorInfo( val executorHost: String, val totalCores: Int, val logUrlMap: Map[String, String]) { def canEqual(other: Any): Boolean = other.isInstanceOf[ExecutorInfo] override def equals(other: Any): Boolean = other match { case that: ExecutorInfo => (that canEqual this) && executorHost == that.executorHost && totalCores == that.totalCores && logUrlMap == that.logUrlMap case _ => false } override def hashCode(): Int = { val state = Seq(executorHost, totalCores, logUrlMap) state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b) } }
Example 186
Source File: Aggregator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark import org.apache.spark.annotation.DeveloperApi import org.apache.spark.util.collection.{AppendOnlyMap, ExternalAppendOnlyMap} @DeveloperApi case class Aggregator[K, V, C] ( createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiners: (C, C) => C) { // When spilling is enabled sorting will happen externally, but not necessarily with an // ExternalSorter. private val isSpillEnabled = SparkEnv.get.conf.getBoolean("spark.shuffle.spill", true) @deprecated("use combineValuesByKey with TaskContext argument", "0.9.0") def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]]): Iterator[(K, C)] = combineValuesByKey(iter, null) def combineValuesByKey(iter: Iterator[_ <: Product2[K, V]], context: TaskContext): Iterator[(K, C)] = { if (!isSpillEnabled) { val combiners = new AppendOnlyMap[K, C] var kv: Product2[K, V] = null val update = (hadValue: Boolean, oldValue: C) => { if (hadValue) mergeValue(oldValue, kv._2) else createCombiner(kv._2) } while (iter.hasNext) { kv = iter.next() combiners.changeValue(kv._1, update) } combiners.iterator } else { val combiners = new ExternalAppendOnlyMap[K, V, C](createCombiner, mergeValue, mergeCombiners) combiners.insertAll(iter) // Update task metrics if context is not null // TODO: Make context non optional in a future release Option(context).foreach { c => c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) } combiners.iterator } } @deprecated("use combineCombinersByKey with TaskContext argument", "0.9.0") def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]]) : Iterator[(K, C)] = combineCombinersByKey(iter, null) def combineCombinersByKey(iter: Iterator[_ <: Product2[K, C]], context: TaskContext) : Iterator[(K, C)] = { if (!isSpillEnabled) { val combiners = new AppendOnlyMap[K, C] var kc: Product2[K, C] = null val update = (hadValue: Boolean, oldValue: C) => { if (hadValue) mergeCombiners(oldValue, kc._2) else kc._2 } while (iter.hasNext) { kc = iter.next() combiners.changeValue(kc._1, update) } combiners.iterator } else { val combiners = new ExternalAppendOnlyMap[K, C, C](identity, mergeCombiners, mergeCombiners) combiners.insertAll(iter) // Update task metrics if context is not null // TODO: Make context non-optional in a future release Option(context).foreach { c => c.taskMetrics.incMemoryBytesSpilled(combiners.memoryBytesSpilled) c.taskMetrics.incDiskBytesSpilled(combiners.diskBytesSpilled) } combiners.iterator } } }
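A word-count style sketch; note that instantiating Aggregator reads SparkEnv.get.conf, so this assumes a SparkContext sc is already running, and in user code the same three functions are usually passed to combineByKey instead of building an Aggregator directly:

import org.apache.spark.Aggregator

val agg = Aggregator[String, Int, Int](
  createCombiner = (v: Int) => v,
  mergeValue = (c: Int, v: Int) => c + v,
  mergeCombiners = (c1: Int, c2: Int) => c1 + c2)

// Public counterpart: combineByKey wires the same functions into a shuffle.
val counts = sc.parallelize(Seq("a", "b", "a")).map((_, 1))
  .combineByKey((v: Int) => v, (c: Int, v: Int) => c + v, (c1: Int, c2: Int) => c1 + c2)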
Example 187
Source File: Dependency.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle

@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int): List[Int] = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
}
Example 188
Source File: StorageStatusListener.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import scala.collection.mutable

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler._

  private def updateStorageStatus(unpersistedRDDId: Int) {
    storageStatusList.foreach { storageStatus =>
      storageStatus.rddBlocksById(unpersistedRDDId).foreach { case (blockId, _) =>
        storageStatus.removeBlock(blockId)
      }
    }
  }

  override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized {
    val info = taskEnd.taskInfo
    val metrics = taskEnd.taskMetrics
    if (info != null && metrics != null) {
      val updatedBlocks = metrics.updatedBlocks.getOrElse(Seq[(BlockId, BlockStatus)]())
      if (updatedBlocks.length > 0) {
        updateStorageStatus(info.executorId, updatedBlocks)
      }
    }
  }

  override def onUnpersistRDD(unpersistRDD: SparkListenerUnpersistRDD): Unit = synchronized {
    updateStorageStatus(unpersistRDD.rddId)
  }

  override def onBlockManagerAdded(blockManagerAdded: SparkListenerBlockManagerAdded) {
    synchronized {
      val blockManagerId = blockManagerAdded.blockManagerId
      val executorId = blockManagerId.executorId
      val maxMem = blockManagerAdded.maxMem
      val storageStatus = new StorageStatus(blockManagerId, maxMem)
      executorIdToStorageStatus(executorId) = storageStatus
    }
  }

  override def onBlockManagerRemoved(blockManagerRemoved: SparkListenerBlockManagerRemoved) {
    synchronized {
      val executorId = blockManagerRemoved.blockManagerId.executorId
      executorIdToStorageStatus.remove(executorId)
    }
  }
}
Example 189
Source File: RDDInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.{RDDOperationScope, RDD}
import org.apache.spark.util.Utils

@DeveloperApi
class RDDInfo(
    val id: Int,
    val name: String,
    val numPartitions: Int,
    var storageLevel: StorageLevel,
    val parentIds: Seq[Int],
    val scope: Option[RDDOperationScope] = None)
  extends Ordered[RDDInfo] {

  var numCachedPartitions = 0
  var memSize = 0L
  var diskSize = 0L
  var externalBlockStoreSize = 0L

  def isCached: Boolean =
    (memSize + diskSize + externalBlockStoreSize > 0) && numCachedPartitions > 0

  override def toString: String = {
    import Utils.bytesToString
    ("RDD \"%s\" (%d) StorageLevel: %s; CachedPartitions: %d; TotalPartitions: %d; " +
      "MemorySize: %s; ExternalBlockStoreSize: %s; DiskSize: %s").format(
        name, id, storageLevel.toString, numCachedPartitions, numPartitions,
        bytesToString(memSize), bytesToString(externalBlockStoreSize), bytesToString(diskSize))
  }

  override def compare(that: RDDInfo): Int = {
    this.id - that.id
  }
}

private[spark] object RDDInfo {
  def fromRdd(rdd: RDD[_]): RDDInfo = {
    val rddName = Option(rdd.name).getOrElse(Utils.getFormattedClassName(rdd))
    val parentIds = rdd.dependencies.map(_.rdd.id)
    new RDDInfo(rdd.id, rddName, rdd.partitions.length, rdd.getStorageLevel, parentIds, rdd.scope)
  }
}
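A minimal sketch of reading RDDInfo from application code, assuming a running SparkContext `sc`: SparkContext.getRDDStorageInfo (also a DeveloperApi in stock Spark) returns one RDDInfo per RDD, populated the same way as fromRdd above.

val cached = sc.parallelize(1 to 1000).cache()
cached.count()  // materialize the cache

// Print the toString defined above for every cached RDD.
sc.getRDDStorageInfo.filter(_.isCached).foreach(info => println(info))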
Example 190
Source File: BlockManagerId.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import java.io.{Externalizable, IOException, ObjectInput, ObjectOutput}
import java.util.concurrent.ConcurrentHashMap

import org.apache.spark.SparkContext
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

  def apply(execId: String, host: String, port: Int): BlockManagerId =
    getCachedBlockManagerId(new BlockManagerId(execId, host, port))

  def apply(in: ObjectInput): BlockManagerId = {
    val obj = new BlockManagerId()
    obj.readExternal(in)
    getCachedBlockManagerId(obj)
  }

  val blockManagerIdCache = new ConcurrentHashMap[BlockManagerId, BlockManagerId]()

  def getCachedBlockManagerId(id: BlockManagerId): BlockManagerId = {
    blockManagerIdCache.putIfAbsent(id, id)
    blockManagerIdCache.get(id)
  }
}
Example 191
Source File: InterruptibleIterator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi

@DeveloperApi
class InterruptibleIterator[+T](val context: TaskContext, val delegate: Iterator[T])
  extends Iterator[T] {

  def hasNext: Boolean = {
    // TODO(aarondav/rxin): Check Thread.interrupted instead of context.interrupted if interrupt
    // is allowed. The assumption is that Thread.interrupted does not have a memory fence in read
    // (just a volatile field in C), while context.interrupted is a volatile in the JVM, which
    // introduces an expensive read fence.
    if (context.isInterrupted) {
      throw new TaskKilledException
    } else {
      delegate.hasNext
    }
  }

  def next(): T = delegate.next()
}
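A minimal usage sketch, assuming a pair RDD `rdd` inside a running job: wrapping each partition's iterator makes a kill request surface as TaskKilledException on the next hasNext call instead of the task running to completion. TaskContext.get() is a stock Spark API that returns the context of the currently running task.

import org.apache.spark.{InterruptibleIterator, TaskContext}

val interruptible = rdd.mapPartitions { iter =>
  // TaskContext.get() is only non-null on executors, inside a task.
  new InterruptibleIterator(TaskContext.get(), iter)
}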
Example 192
Source File: ShuffledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx
  override def hashCode(): Int = idx
}

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
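A minimal sketch of constructing a ShuffledRDD directly, assuming a running SparkContext `sc` and the two-argument constructor (parent pair RDD, partitioner) that the class exposes in stock Spark; this is essentially what PairRDDFunctions.partitionBy does when the partitioner changes.

import org.apache.spark.HashPartitioner
import org.apache.spark.rdd.ShuffledRDD

val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("c", 3)))

// Repartition by key hash into 4 shuffle partitions; getPartitions above yields 4
// ShuffledRDDPartitions and compute reads each one back through the shuffle manager.
val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(4))
shuffled.partitions.length  // 4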
Example 193
Source File: OrderedRDDFunctions.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark.{Logging, Partitioner, RangePartitioner}
import org.apache.spark.annotation.DeveloperApi

  def filterByRange(lower: K, upper: K): RDD[P] = self.withScope {
    def inRange(k: K): Boolean = ordering.gteq(k, lower) && ordering.lteq(k, upper)

    val rddToFilter: RDD[P] = self.partitioner match {
      case Some(rp: RangePartitioner[K, V]) => {
        val partitionIndicies = (rp.getPartition(lower), rp.getPartition(upper)) match {
          case (l, u) => Math.min(l, u) to Math.max(l, u)
        }
        PartitionPruningRDD.create(self, partitionIndicies.contains)
      }
      case _ =>
        self
    }

    rddToFilter.filter { case (k, v) => inRange(k) }
  }
}
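A minimal usage sketch, assuming a running SparkContext `sc`: after a sort produces a range-partitioned RDD, filterByRange can prune whole partitions that cannot contain keys in the requested interval before applying the per-record filter.

val sorted = sc.parallelize(Seq(9 -> "i", 1 -> "a", 5 -> "e", 3 -> "c")).sortByKey()

sorted.filterByRange(3, 7).collect()  // Array((3,"c"), (5,"e"))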
Example 194
Source File: UnionRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.io.{IOException, ObjectOutputStream}

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.{Dependency, Partition, RangeDependency, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.util.Utils

private[spark] class UnionPartition[T: ClassTag](
    idx: Int,
    @transient rdd: RDD[T],
    val parentRddIndex: Int,
    @transient parentRddPartitionIndex: Int)
  extends Partition {

  var parentPartition: Partition = rdd.partitions(parentRddPartitionIndex)

  def preferredLocations(): Seq[String] = rdd.preferredLocations(parentPartition)

  override val index: Int = idx

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    // Update the reference to parent split at the time of task serialization
    parentPartition = rdd.partitions(parentRddPartitionIndex)
    oos.defaultWriteObject()
  }
}

@DeveloperApi
class UnionRDD[T: ClassTag](
    sc: SparkContext,
    var rdds: Seq[RDD[T]])
  extends RDD[T](sc, Nil) {  // Nil since we implement getDependencies

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](rdds.map(_.partitions.length).sum)
    var pos = 0
    for ((rdd, rddIndex) <- rdds.zipWithIndex; split <- rdd.partitions) {
      array(pos) = new UnionPartition(pos, rdd, rddIndex, split.index)
      pos += 1
    }
    array
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val deps = new ArrayBuffer[Dependency[_]]
    var pos = 0
    for (rdd <- rdds) {
      deps += new RangeDependency(rdd, 0, pos, rdd.partitions.length)
      pos += rdd.partitions.length
    }
    deps
  }

  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
    val part = s.asInstanceOf[UnionPartition[T]]
    parent[T](part.parentRddIndex).iterator(part.parentPartition, context)
  }

  override def getPreferredLocations(s: Partition): Seq[String] =
    s.asInstanceOf[UnionPartition[T]].preferredLocations()

  override def clearDependencies() {
    super.clearDependencies()
    rdds = null
  }
}
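A minimal usage sketch, assuming a running SparkContext `sc`: RDD.union builds exactly this UnionRDD, whose RangeDependencies map each output partition back to one parent partition, so the partition counts simply add up.

import org.apache.spark.rdd.UnionRDD

val left = sc.parallelize(1 to 3, 2)
val right = sc.parallelize(4 to 6, 3)

left.union(right).partitions.length          // 5 = 2 + 3
new UnionRDD(sc, Seq(left, right)).collect() // Array(1, 2, 3, 4, 5, 6)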
Example 195
Source File: EstimatorModelWrapperFixtures.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.estimators

import scala.language.reflectiveCalls

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml
import org.apache.spark.ml.param.{ParamMap, Param => SparkParam}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.report.Report
import ai.deepsense.deeplang.doperables.serialization.SerializableSparkModel
import ai.deepsense.deeplang.doperables.{SparkEstimatorWrapper, SparkModelWrapper}
import ai.deepsense.deeplang.params.wrappers.spark.SingleColumnCreatorParamWrapper
import ai.deepsense.deeplang.params.{Param, Params}
import ai.deepsense.sparkutils.ML

object EstimatorModelWrapperFixtures {

  class SimpleSparkModel private[EstimatorModelWrapperFixtures]()
    extends ML.Model[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "modelId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    def setPredictionCol(value: String): this.type = set(predictionCol, value)

    override def copy(extra: ParamMap): this.type = defaultCopy(extra)

    override def transformDF(dataset: DataFrame): DataFrame = {
      dataset.selectExpr("*", "1 as " + $(predictionCol))
    }

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = ???
  }

  class SimpleSparkEstimator extends ML.Estimator[SimpleSparkModel] {

    def this(x: String) = this()

    override val uid: String = "estimatorId"

    val predictionCol = new SparkParam[String](uid, "name", "description")

    override def fitDF(dataset: DataFrame): SimpleSparkModel =
      new SimpleSparkModel().setPredictionCol($(predictionCol))

    override def copy(extra: ParamMap): ML.Estimator[SimpleSparkModel] = defaultCopy(extra)

    @DeveloperApi
    override def transformSchema(schema: StructType): StructType = {
      schema.add(StructField($(predictionCol), IntegerType, nullable = false))
    }
  }

  trait HasPredictionColumn extends Params {
    val predictionColumn = new SingleColumnCreatorParamWrapper[
      ml.param.Params { val predictionCol: SparkParam[String] }](
      "prediction column",
      None,
      _.predictionCol)
    setDefault(predictionColumn, "abcdefg")

    def getPredictionColumn(): String = $(predictionColumn)
    def setPredictionColumn(value: String): this.type = set(predictionColumn, value)
  }

  class SimpleSparkModelWrapper
    extends SparkModelWrapper[SimpleSparkModel, SimpleSparkEstimator]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report(extended: Boolean = true): Report = ???

    override protected def loadModel(
      ctx: ExecutionContext,
      path: String): SerializableSparkModel[SimpleSparkModel] = ???
  }

  class SimpleSparkEstimatorWrapper
    extends SparkEstimatorWrapper[SimpleSparkModel, SimpleSparkEstimator, SimpleSparkModelWrapper]
    with HasPredictionColumn {

    override val params: Array[Param[_]] = Array(predictionColumn)
    override def report(extended: Boolean = true): Report = ???
  }
}
Example 196
Source File: AttributeType.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.attribute

import org.apache.spark.annotation.DeveloperApi

  def fromName(name: String): AttributeType = {
    if (name == Numeric.name) {
      Numeric
    } else if (name == Nominal.name) {
      Nominal
    } else if (name == Binary.name) {
      Binary
    } else if (name == Unresolved.name) {
      Unresolved
    } else {
      throw new IllegalArgumentException(s"Cannot recognize type $name.")
    }
  }
}
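A minimal usage sketch: fromName round-trips the predefined singletons by their lowercase names and rejects anything else with an IllegalArgumentException.

import org.apache.spark.ml.attribute.AttributeType

assert(AttributeType.fromName("numeric") == AttributeType.Numeric)
assert(AttributeType.fromName("nominal") == AttributeType.Nominal)
// AttributeType.fromName("bogus") throws IllegalArgumentException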
Example 197
Source File: Transformer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import scala.annotation.varargs

import org.apache.spark.Logging
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

  protected def validateInputType(inputType: DataType): Unit = {}

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    validateInputType(inputType)
    if (schema.fieldNames.contains($(outputCol))) {
      throw new IllegalArgumentException(s"Output column ${$(outputCol)} already exists.")
    }
    val outputFields = schema.fields :+
      StructField($(outputCol), outputDataType, nullable = false)
    StructType(outputFields)
  }

  override def transform(dataset: DataFrame): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.withColumn($(outputCol),
      callUDF(this.createTransformFunc, outputDataType, dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): T = defaultCopy(extra)
}
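This snippet is the body of Spark's UnaryTransformer; the two members it relies on, createTransformFunc and outputDataType, are the ones a subclass supplies. A minimal sketch of a concrete subclass (the class name UpperCaser is illustrative, not part of Spark):

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{DataType, StringType}

// Upper-cases a string input column into a new output column.
class UpperCaser(override val uid: String)
  extends UnaryTransformer[String, String, UpperCaser] {

  def this() = this(Identifiable.randomUID("upperCaser"))

  override protected def createTransformFunc: String => String = _.toUpperCase

  override protected def outputDataType: DataType = StringType
}

// Usage: new UpperCaser().setInputCol("text").setOutputCol("textUpper").transform(df)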
Example 198
Source File: LogLoss.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.loss

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.tree.model.TreeEnsembleModel
import org.apache.spark.mllib.util.MLUtils

  @Since("1.2.0")
  override def gradient(prediction: Double, label: Double): Double = {
    - 4.0 * label / (1.0 + math.exp(2.0 * label * prediction))
  }

  override private[mllib] def computeError(prediction: Double, label: Double): Double = {
    val margin = 2.0 * label * prediction
    // The following is equivalent to 2.0 * log(1 + exp(-margin)) but more numerically stable.
    2.0 * MLUtils.log1pExp(-margin)
  }
}
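A minimal usage sketch: labels are encoded as -1/+1 by gradient-boosted trees, and for a positive example the gradient shrinks toward zero as the raw prediction F(x) grows, matching -4y / (1 + exp(2yF)) above.

import org.apache.spark.mllib.tree.loss.LogLoss

LogLoss.gradient(0.0, 1.0)  // -2.0, since -4 / (1 + exp(0)) = -2
LogLoss.gradient(3.0, 1.0)  // about -0.0099, since -4 / (1 + exp(6))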
Example 199
Source File: Predict.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree.model

import org.apache.spark.annotation.{DeveloperApi, Since}

@Since("1.2.0")
@DeveloperApi
class Predict @Since("1.2.0") (
    @Since("1.2.0") val predict: Double,
    @Since("1.2.0") val prob: Double = 0.0) extends Serializable {

  override def toString: String = s"$predict (prob = $prob)"

  override def equals(other: Any): Boolean = {
    other match {
      case p: Predict => predict == p.predict && prob == p.prob
      case _ => false
    }
  }

  override def hashCode: Int = {
    com.google.common.base.Objects.hashCode(predict: java.lang.Double, prob: java.lang.Double)
  }
}
Example 200
Source File: DataValidators.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import org.apache.spark.Logging
import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

  @Since("1.3.0")
  def multiLabelValidator(k: Int): RDD[LabeledPoint] => Boolean = { data =>
    val numInvalid = data.filter(x =>
      x.label - x.label.toInt != 0.0 || x.label < 0 || x.label > k - 1).count()
    if (numInvalid != 0) {
      logError("Classification labels should be in {0 to " + (k - 1) + "}. " +
        "Found " + numInvalid + " invalid labels")
    }
    numInvalid == 0
  }
}
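A minimal usage sketch, assuming a running SparkContext `sc`: the validator is just a function from RDD[LabeledPoint] to Boolean, so it can be invoked directly or passed wherever a training algorithm accepts a data validator.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.DataValidators

val data = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(1.0)),
  LabeledPoint(2.0, Vectors.dense(2.0))))

DataValidators.multiLabelValidator(3)(data)  // true: labels fall in {0, 1, 2}
DataValidators.multiLabelValidator(2)(data)  // false: label 2.0 is out of range, error is logged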