org.apache.spark.SparkException Scala Examples
The following examples show how to use org.apache.spark.SparkException.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: RWrappers.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 2
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).map { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 3
Source File: RUtils.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 4
Source File: MesosClusterManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 5
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.unsafe.hash.Murmur3_x86_32._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[spark] def murmur3Hash(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } }
Example 6
Source File: NumericParser.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 7
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
Example 8
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 9
Source File: NumericParserSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 10
Source File: CachedKafkaConsumer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer} import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging def getOrCreate( topic: String, partition: Int, kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized { val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] val topicPartition = new TopicPartition(topic, partition) val key = CacheKey(groupId, topicPartition) // If this is reattempt at running the task, then invalidate cache and start with // a new consumer if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) { cache.remove(key) new CachedKafkaConsumer(topicPartition, kafkaParams) } else { if (!cache.containsKey(key)) { cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams)) } cache.get(key) } } }
Example 11
Source File: CommitFailureTestRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 12
Source File: ThriftServerTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 13
Source File: UDTRegistration.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
Example 14
Source File: ScalaUDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
Example 15
Source File: UDTRegistrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 16
Source File: YarnClusterManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 17
Source File: YarnClientSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.yarn.api.records.YarnApplicationState import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil} import org.apache.spark.internal.Logging import org.apache.spark.launcher.SparkAppHandle import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class YarnClientSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext) extends YarnSchedulerBackend(scheduler, sc) with Logging { private var client: Client = null private var monitorThread: MonitorThread = null override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") if (monitorThread != null) { monitorThread.stopMonitor() } // Report a final state to the launcher if one is connected. This is needed since in client // mode this backend doesn't let the app monitor loop run to completion, so it does not report // the final state itself. // // Note: there's not enough information at this point to provide a better final state, // so assume the application was successful. client.reportLauncherState(SparkAppHandle.State.FINISHED) super.stop() YarnSparkHadoopUtil.get.stopCredentialUpdater() client.stop() logInfo("Stopped") } }
Example 18
Source File: HDFSCredentialProviderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 19
Source File: UnionDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 20
Source File: TransformedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 21
Source File: StreamingTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 22
Source File: RpcEndpointAddress.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
Example 23
Source File: RpcTimeout.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import java.util.concurrent.TimeoutException import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils def apply(conf: SparkConf, timeoutPropList: Seq[String], defaultValue: String): RpcTimeout = { require(timeoutPropList.nonEmpty) // Find the first set property or use the default value with the first property val itr = timeoutPropList.iterator var foundProp: Option[(String, String)] = None while (itr.hasNext && foundProp.isEmpty) { val propKey = itr.next() conf.getOption(propKey).foreach { prop => foundProp = Some(propKey, prop) } } val finalProp = foundProp.getOrElse(timeoutPropList.head, defaultValue) val timeout = { Utils.timeStringAsSeconds(finalProp._2).seconds } new RpcTimeout(timeout, finalProp._1) } }
Example 24
Source File: RpcEndpointRef.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import scala.concurrent.Future import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.util.RpcUtils def askWithRetry[T: ClassTag](message: Any, timeout: RpcTimeout): T = { // TODO: Consider removing multiple attempts var attempts = 0 var lastException: Exception = null while (attempts < maxRetries) { attempts += 1 try { val future = ask[T](message, timeout) val result = timeout.awaitResult(future) if (result == null) { throw new SparkException("RpcEndpoint returned null") } return result } catch { case ie: InterruptedException => throw ie case e: Exception => lastException = e logWarning(s"Error sending message [message = $message] in $attempts attempts", e) } if (attempts < maxRetries) { Thread.sleep(retryWaitMs) } } throw new SparkException( s"Error sending message [message = $message]", lastException) } }
Example 25
Source File: ApplicationHistoryProvider.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.util.zip.ZipOutputStream import scala.xml.Node import org.apache.spark.SparkException import org.apache.spark.ui.SparkUI private[spark] case class ApplicationAttemptInfo( attemptId: Option[String], startTime: Long, endTime: Long, lastUpdated: Long, sparkUser: String, completed: Boolean = false) private[spark] case class ApplicationHistoryInfo( id: String, name: String, attempts: List[ApplicationAttemptInfo]) { def getEmptyListingHtml(): Seq[Node] = Seq.empty }
Example 26
Source File: RpcAddressSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 27
Source File: KryoSerializerResizableOutputSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 28
Source File: ProactiveClosureSerializationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 29
Source File: CoarseGrainedSchedulerBackendSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { ignore("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 30
Source File: StreamHelper.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import kafka.KafkaHelper import kafka.common.TopicAndPartition import kafka.consumer.PartitionTopicInfo import kafka.message.MessageAndMetadata import kafka.serializer.Decoder import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.{Logging, SparkException} import scala.reflect.ClassTag case class StreamHelper(kafkaParams: Map[String, String]) extends Logging { // helper for kafka zookeeper lazy val kafkaHelper = KafkaHelper(kafkaParams) lazy val kc = new KafkaCluster(kafkaParams) // 1. get leader's earliest and latest offset // 2. get consumer offset // 3-1. if (2) is bounded in (1) use (2) for stream // 3-2. else use (1) by "auto.offset.reset" private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = { lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq) { for { topicPartitions <- kc.getPartitions(topics).right smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right } yield { { for { tp <- topicPartitions } yield { val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset) val so = smallOffsets.get(tp).map(_.offset).get val lo = largeOffsets.get(tp).map(_.offset).get logWarning(s"$tp: $co $so $lo") if (co >= so && co <= lo) { (tp, co) } else { (tp, reset match { case Some("smallest") => so case _ => lo }) } } }.toMap } }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok) } def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = { type R = (K, V) val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message()) kafkaHelper.registerConsumerInZK(topics) new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler) } def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = { val offsetsMap = { for { range <- offsets.offsetRanges if range.fromOffset < range.untilOffset } yield { logDebug(range.toString()) TopicAndPartition(range.topic, range.partition) -> range.untilOffset } }.toMap kafkaHelper.commitConsumerOffsets(offsetsMap) } def commitConsumerOffset(range: OffsetRange): Unit = { if (range.fromOffset < range.untilOffset) { try { val tp = TopicAndPartition(range.topic, range.partition) logDebug("Committed offset " + range.untilOffset + " for topic " + tp) kafkaHelper.commitConsumerOffset(tp, range.untilOffset) } catch { case t: Throwable => // log it and let it go logWarning("exception during commitOffsets", t) throw t } } } def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = { stream.foreachRDD { rdd => commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges]) } } }
Example 31
Source File: NotAvailableFeaturesTest.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.rdd import com.basho.riak.client.core.netty.RiakResponseException import com.basho.riak.spark._ import org.apache.spark.SparkException import org.apache.spark.sql.Row import org.hamcrest.CustomTypeSafeMatcher import org.junit.rules.ExpectedException import org.junit.{ Rule, Test } import org.junit.experimental.categories.Category class NotAvailableFeaturesTest extends AbstractRiakSparkTest { val _expectedException: ExpectedException = ExpectedException.none() @Rule def expectedException: ExpectedException = _expectedException val coverageMatcher = new CustomTypeSafeMatcher[IllegalStateException]("match") { override def matchesSafely(t: IllegalStateException): Boolean = { t.getMessage.contains("Full bucket read is not supported on your version of Riak") && t.getCause.isInstanceOf[RiakResponseException] && t.getCause.getMessage.contains("Unknown message code: 70") } } val timeSeriesMatcher = new CustomTypeSafeMatcher[SparkException]("match") { override def matchesSafely(t: SparkException): Boolean = { t.getMessage.contains("Range queries are not supported in your version of Riak") && t.getMessage.contains("Unknown message code: 90") } } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def timeSeriesOnKV(): Unit = { expectedException.expect(timeSeriesMatcher) val rdd = sc.riakTSTable[Row]("bucket") .sql("select * from bucket") .collect() } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def fullBucketReadOnKV(): Unit = { expectedException.expect(coverageMatcher) val rdd = sc.riakBucket[String](DEFAULT_NAMESPACE) .queryAll() .collect() } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def queryRangeLocalOnKV(): Unit = { expectedException.expect(coverageMatcher) val rdd = sc.riakBucket[String](DEFAULT_NAMESPACE) .query2iRangeLocal("creationNo", 1, 1000) .collect() } }
Example 32
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 33
Source File: LocalIndexToString.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.SparkException import org.apache.spark.ml.feature.IndexToString class LocalIndexToString(override val sparkTransformer: IndexToString) extends LocalTransformer[IndexToString] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val labels = sparkTransformer.getLabels val indexer = (index: Double) => { val idx = index.toInt if (0 <= idx && idx < labels.length) { labels(idx) } else { throw new SparkException(s"Unseen index: $index ??") } } val newColumn = LocalDataColumn( sparkTransformer.getOutputCol, column.data map { case i: Int => indexer(i.toDouble) case d: Double => indexer(d) case d => throw new IllegalArgumentException(s"Unknown data to index: $d") } ) localData.withColumn(newColumn) case None => localData } } } object LocalIndexToString extends SimpleModelLoader[IndexToString] with TypedTransformerConverter[IndexToString] { override def build(metadata: Metadata, data: LocalData): IndexToString = { val ctor = classOf[IndexToString].getDeclaredConstructor(classOf[String]) ctor.setAccessible(true) ctor .newInstance(metadata.uid) .setLabels(metadata.paramMap("labels").asInstanceOf[Seq[String]].toArray) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal(transformer: IndexToString) = new LocalIndexToString(transformer) }
Example 34
Source File: LocalStringIndexerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.SparkException import org.apache.spark.ml.feature.StringIndexerModel import scala.collection.mutable class LocalStringIndexerModel(override val sparkTransformer: StringIndexerModel) extends LocalTransformer[StringIndexerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val labelToIndex = { val n = sparkTransformer.labels.length val map = new mutable.HashMap[String, Double] var i = 0 while (i < n) { map.update(sparkTransformer.labels(i), i) i += 1 } map } val indexer = (label: String) => { if (labelToIndex.contains(label)) { labelToIndex(label) } else { throw new SparkException(s"Unseen label: $label.") } } val newColumn = LocalDataColumn(sparkTransformer.getOutputCol, column.data.map(_.toString) map { feature => indexer(feature) }) localData.withColumn(newColumn) case None => localData } } } object LocalStringIndexerModel extends SimpleModelLoader[StringIndexerModel] with TypedTransformerConverter[StringIndexerModel] { override def build(metadata: Metadata, data: LocalData): StringIndexerModel = { new StringIndexerModel( metadata.uid, data.column("labels").get.data.head.asInstanceOf[Seq[String]].toArray ).setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setHandleInvalid(metadata.paramMap("handleInvalid").asInstanceOf[String]) } override implicit def toLocal( transformer: StringIndexerModel ) = new LocalStringIndexerModel(transformer) }
Example 35
Source File: PipelineBuilder.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.SparkException import org.apache.spark.ml.PipelineStage import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class IncompatibleFiledExecption(msg: String) extends SparkException(msg) {} object PipelineBuilder { def build(transformers: Array[TransformerWrapper]): Array[PipelineStage] = { val stages: ArrayBuffer[PipelineStage] = new ArrayBuffer[PipelineStage]() //val allInputCols: ArrayBuffer[String] = new ArrayBuffer[String]() val allInputCols: mutable.HashSet[String] = new mutable.HashSet[String]() transformers(0).setInputCols(transformers(0).requiredInputCols) transformers(0).setOutputCols(transformers(0).requiredOutputCols) allInputCols ++= transformers(0).getInputCols transformers(0).setAncestorCols(allInputCols.toArray) stages += transformers(0).declareInAndOut().getTransformer (1 until transformers.length).foreach { i => println(s"add $i-th transformer = ${transformers(i).getTransformer.getClass.getSimpleName}") // set parent transformers(i).setParent(transformers(i - 1)) // add new cols allInputCols ++= transformers(i - 1).getOutputCols // set parent cols transformers(i).setAncestorCols(allInputCols.toArray) // generate input cols transformers(i).generateInputCols() // generate output cols transformers(i).generateOutputCols() // add fully configured transformer stages += transformers(i).declareInAndOut().getTransformer } stages.toArray } }
Example 36
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 37
Source File: CommitFailureTestRelationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 38
Source File: ThriftServerTab.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 39
Source File: DataSourceManagerFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils object DataSourceManagerFactory { def create( datasourceType: String, conf: SparkConf, hadoopConf: Configuration): DataSourceManager = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceManager], loader) var cls: Class[_] = null // As we use ServiceLoader to support creating any user provided DataSourceManager here, // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister must be packaged properly // in user's jar, and the implementation of DataSourceManager must have a public parameterless // constructor. For scala language, def this() = this(null...) just work. try { cls = serviceLoader.asScala .filter(_.shortName().equals(datasourceType)) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException(s"error when instantiate datasource ${datasourceType}") } } catch { case _: Exception => throw new SparkException( s"""Can't find corresponding DataSourceManager for ${datasourceType} type, |please check |1. META-INF/services/org.apache.spark.sql.sources.DataSourceRegister is packaged |2. your implementation of DataSourceManager's shortname is ${datasourceType} |3. your implementation of DataSourceManager must have a public parameterless | constructor. For scala language, def this() = this(null, null, ...) just work. """.stripMargin) } try { val constructor = cls.getConstructor(classOf[SparkConf], classOf[Configuration]) val newHadoopConf = new Configuration(hadoopConf) constructor.newInstance(conf, newHadoopConf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => try { cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => cls.getConstructor().newInstance().asInstanceOf[DataSourceManager] } } } }
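A hedged usage sketch for the factory above; "mysql" is a hypothetical shortName and must match a DataSourceManager implementation registered through the ServiceLoader file mentioned in the code comments.
// Sketch only: resolve a DataSourceManager by its shortName().
import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.sql.xsql.DataSourceManagerFactory

val manager = DataSourceManagerFactory.create("mysql", new SparkConf(), new Configuration())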
Example 40
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command import java.util.Locale import org.apache.spark.SparkException import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.StreamingRelationV2 import org.apache.spark.sql.sources.v2.StreamWriteSupport import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.StreamingSinkType case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand { private var outputMode: OutputMode = OutputMode.Append // dummy override def output: Seq[AttributeReference] = Seq.empty // dummy override def producedAttributes: AttributeSet = plan.producedAttributes override def run(sparkSession: SparkSession): Seq[Row] = { import StreamingSinkType._ val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan)) val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema)) plan.collectLeaves.head match { case StreamingRelationV2(_, _, extraOptions, _, _) => val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK) val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv => val key = kv._1.substring(STREAMING_SINK_PREFIX.length) (key, kv._2) } StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match { case CONSOLE => case TEXT | PARQUET | ORC | JSON | CSV => if (sinkOptions.get(STREAMING_SINK_PATH) == None) { throw new SparkException("Sink type is file, must config path") } case KAFKA => if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) { throw new SparkException("Sink type is kafka, must config bootstrap servers") } if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) { throw new SparkException("Sink type is kafka, must config kafka topic") } case _ => throw new SparkException( "Sink type is invalid, " + s"select from ${StreamingSinkType.values}") } val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") val sink = ds.newInstance() match { case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w case _ => val ds = DataSource( sparkSession, className = source, options = sinkOptions.toMap, partitionColumns = Nil) ds.createSink(InternalOutputModes.Append) } val outputMode = InternalOutputModes( extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE)) val duration = extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION) val trigger = extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match { case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration) case STREAMING_ONCE_TRIGGER => Trigger.Once() case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration) } val query = sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get(STREAMING_CHECKPOINT_LOCATION), df, sinkOptions.toMap, sink, 
outputMode, useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK, recoverFromCheckpointLocation = true, trigger = trigger) query.awaitTermination() } // dummy Seq.empty } } case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 41
Source File: UDTRegistration.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
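A short usage sketch, assuming the call site lives in the org.apache.spark namespace (the registry is private[spark]); the com.example names are hypothetical placeholders.
// Sketch only: a lookup for the default-registered ml Vector class, plus a custom registration.
// The custom UDT class must be on the classpath and extend UserDefinedType, or getUDTFor throws.
val vectorUdt: Option[Class[_]] = UDTRegistration.getUDTFor("org.apache.spark.ml.linalg.Vector")
UDTRegistration.register("com.example.MyPoint", "com.example.MyPointUDT")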
Example 42
Source File: ScalaUDFSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import java.util.Locale import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil, true :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase(Locale.ROOT), StringType, Literal.create(null, StringType) :: Nil, true :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvaluationWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil).genCode(ctx) assert(ctx.inlinedMutableStates.isEmpty) } }
Example 43
Source File: FailureSafeParser.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
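A minimal sketch of driving the parser directly, assuming a trivial raw parser over plain strings; the schema and column names are made up for illustration.
// Sketch only: wrap a raw parser so malformed records land in a corrupt-record column.
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.PermissiveMode
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = new StructType().add("value", StringType).add("_corrupt_record", StringType)
val parser = new FailureSafeParser[String](
  rawParser = s => Seq(new GenericInternalRow(Array[Any](UTF8String.fromString(s)))),
  mode = PermissiveMode,
  schema = schema,
  columnNameOfCorruptRecord = "_corrupt_record")
val rows = parser.parse("hello").toList // one row: "hello" plus a null corrupt-record column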
Example 44
Source File: InsertIntoDataSourceDirCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ case class InsertIntoDataSourceDirCommand( storage: CatalogStorageFormat, provider: String, query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { override protected def innerChildren: Seq[LogicalPlan] = query :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty, "Directory path is required") assert(provider.nonEmpty, "Data source is required") // Create the relation based on the input logical plan: `query`. val pathOption = storage.locationUri.map("path" -> CatalogUtils.URIToString(_)) val dataSource = DataSource( sparkSession, className = provider, options = storage.properties ++ pathOption, catalogTable = None) val isFileFormat = classOf[FileFormat].isAssignableFrom(dataSource.providingClass) if (!isFileFormat) { throw new SparkException( "Only Data Sources providing FileFormat are supported: " + dataSource.providingClass) } val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => logError(s"Failed to write to directory " + storage.locationUri.toString, ex) throw ex } Seq.empty[Row] } }
Example 45
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 46
Source File: UDTRegistrationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 47
Source File: ParquetFileFormatSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(spark.sessionState.newHadoopConf()) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( spark.sessionState.newHadoopConf(), fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[SparkException] { testReadFooters(false) }.getCause assert(exception.getMessage().contains("Could not read footer for file")) } }
Example 48
Source File: DruidRule.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.druid import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.expressions.{ Attribute, Expression => SExpression, Literal, NamedExpression, SortOrder } import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule object DruidRule extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case Aggregate(ges, aes, p @ Project(_, _)) => ProjectAndAggregate(ges, aes, p) case s @ Sort(orders, _, child) => if (child.isInstanceOf[ProjectAndAggregate]) { child.asInstanceOf[ProjectAndAggregate].copy(orders = orders) } else { s } case l @ LocalLimit(Literal(v, t), child) => val value: Any = convertToScala(v, t) val limit = value.asInstanceOf[Int] if (limit < 0) { throw new SparkException(s"Aggregate limit must great than zero!") } if (child.isInstanceOf[ProjectAndAggregate]) { child.asInstanceOf[ProjectAndAggregate].copy(limit = limit) } else { l } case g @ GlobalLimit(_, child) => if (child.isInstanceOf[ProjectAndAggregate]) { child } else { g } } } case class ProjectAndAggregate( groupingExpressions: Seq[SExpression], aggregateExpressions: Seq[NamedExpression], child: LogicalPlan, orders: Seq[SortOrder] = null, limit: Int = 20) extends UnaryNode { override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) }
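One way to try the rule from a plain SparkSession is the experimental optimizer hook sketched below; whether XSQL wires DruidRule in exactly this way is not shown here, so treat it as an assumption.
// Sketch only: append the rule to the extra optimizer rules of an existing SparkSession.
spark.experimental.extraOptimizations ++= Seq(DruidRule)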
Example 49
Source File: AlarmFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.alarm import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.util.Utils object AlarmFactory { def create(alarmName: String, options: Map[String, String]): Alarm = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[Alarm], loader) val AlarmClass = serviceLoader.asScala.filter(_.name.equalsIgnoreCase(alarmName)).toList match { case head :: Nil => head.getClass case _ => throw new SparkException("error when instantiate spark.xsql.alarm.items") } AlarmClass.newInstance().bind(options) } }
Example 50
Source File: MonitorFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.alarm.Alarm import org.apache.spark.util.Utils import org.apache.spark.util.kvstore.KVStore object MonitorFactory { def create( monitorName: String, alarms: Seq[Alarm], appStore: KVStore, conf: SparkConf): Monitor = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[Monitor], loader) val MonitorClass = serviceLoader.asScala .filter(_.item.equals(MonitorItem.withName(monitorName))) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException("error when instantiate spark.xsql.monitor.items") } MonitorClass.newInstance().bind(alarms).bind(appStore).bind(conf) } }
Example 51
Source File: MetricsSystems.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.metrics import com.codahale.metrics._ import org.apache.spark.SparkException import org.apache.spark.internal.Logging import scala.collection.JavaConverters.mapAsScalaMapConverter // These will only be created on executors private[metrics] class RpcMetricsSystem( private val metricsSource: MetricsSource ) extends UserMetricsSystem with Logging { require(metricsSource != null, "metricsSource cannot be null") private val namespace = metricsSource.sourceName private val metricProxies = metricsSource.metricRegistry.getMetrics.asScala def counter(metricName: String): Counter = { getMetric[CounterProxy](metricName) } def histogram(metricName: String): Histogram = { getMetric[HistogramProxy](metricName) } def meter(metricName: String): Meter = { getMetric[MeterProxy](metricName) } def timer(metricName: String): Timer = { getMetric[TimerProxy](metricName) } def gauge[T](metricName: String): SettableGauge[T] = { getMetric[SettableGaugeProxy[T]](metricName) } private def getMetric[T <: MetricProxy](metricName: String): T = { metricProxies.get(metricName) match { case Some(metric) => { metric.asInstanceOf[T] } case None => throw new SparkException(s"Metric '${metricName}' in namespace ${namespace} was not found") } } } // These can be created on the driver and the executors. class LocalMetricsSystem( metricsSource: MetricsSource ) extends UserMetricsSystem { require(metricsSource != null, "metricsSource cannot be null") private val namespace = metricsSource.sourceName private lazy val metrics = metricsSource.metricRegistry.getMetrics.asScala def counter(metricName: String): Counter = { getMetric[Counter](metricName) } def histogram(metricName: String): Histogram = { getMetric[Histogram](metricName) } def meter(metricName: String): Meter = { getMetric[Meter](metricName) } def timer(metricName: String): Timer = { getMetric[Timer](metricName) } def gauge[T](metricName: String): SettableGauge[T] = { val metric = getMetric[Gauge[T]](metricName) // If we have one, but it's not a settable gauge, it will run autonomously and provide metrics. // However, this is an exception here, as the developer wants to set it. if (!(metric.isInstanceOf[SettableGauge[T]])) { throw new SparkException(s"Gauge ${metricName} does not extend SettableGauge[T]") } metric.asInstanceOf[SettableGauge[T]] } private def getMetric[T <: Metric](metricName: String): T = { metrics.get(metricName) match { case Some(metric) => { metric.asInstanceOf[T] } case None => throw new SparkException(s"Metric '${metricName}' in namespace ${namespace} was not found") } } }
Example 52
Source File: UnifiedSparkListener.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.listeners import java.time.Instant import org.apache.spark.{SparkConf, SparkException, SparkInformation} import org.apache.spark.internal.Logging import org.apache.spark.listeners.sink.SparkListenerSink import org.apache.spark.scheduler._ import org.apache.spark.sql.streaming.StreamingQueryListener import org.apache.spark.util.JsonProtocol import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods.{compact, render} import scala.util.control.NonFatal class UnifiedSparkListener(override val conf: SparkConf) extends UnifiedSparkListenerHandler with Logging with SparkListenerHandlers with StreamingListenerHandlers with StreamingQueryListenerHandlers { private val listenerSink = this.createSink(this.conf) override def onOtherEvent(event: SparkListenerEvent): Unit = { // All events in Spark that are not specific to SparkListener go through // this method. The typed ListenerBus implementations intercept and forward to // their "local" listeners. // We will just handle everything here so we only have to have one listener. // The advantage is that this can be registered in extraListeners, so no // code change is required to add listener support. event match { // We will use the ClassTag for the private wrapper class to match case this.streamingListenerEventClassTag(e) => this.onStreamingListenerEvent(e) case streamingQueryListenerEvent: StreamingQueryListener.Event => this.onStreamingQueryListenerEvent(streamingQueryListenerEvent) case sparkListenerEvent: SparkListenerEvent => if (sparkListenerEvent.logEvent) { logSparkListenerEvent(sparkListenerEvent) } } } private def createSink(conf: SparkConf): SparkListenerSink = { val sink = conf.getOption("spark.unifiedListener.sink") match { case Some(listenerSinkClassName) => listenerSinkClassName case None => throw new SparkException("spark.unifiedListener.sink setting is required") } logInfo(s"Creating listener sink: ${sink}") org.apache.spark.util.Utils.loadExtensions( classOf[SparkListenerSink], Seq(sink), conf).head } protected def logSparkListenerEvent( event: SparkListenerEvent, getTimestamp: () => Instant = () => Instant.now()): Unit = { val json = try { // Add a well-known time field. Some( JsonProtocol.sparkEventToJson(event) .merge(render( SparkInformation.get() + ("SparkEventTime" -> getTimestamp().toString) )) ) } catch { case NonFatal(e) => logError(s"Error serializing SparkListenerEvent to JSON: $event", e) None } sendToSink(json) } private[spark] def sendToSink(json: Option[JValue]): Unit = { try { json match { case Some(j) => { logDebug(s"Sending event to listener sink: ${compact(j)}") this.listenerSink.logEvent(json) } case None => { logWarning("json value was None") } } } catch { case NonFatal(e) => logError(s"Error sending to listener sink: $e") } } }
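A hedged configuration sketch: spark.extraListeners is standard Spark, and spark.unifiedListener.sink is the key read above; the sink class name below is a placeholder for whatever SparkListenerSink implementation is on the classpath.
// Sketch only: register the listener and point it at a sink implementation.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.extraListeners", "org.apache.spark.listeners.UnifiedSparkListener")
  .set("spark.unifiedListener.sink", "com.example.MyListenerSink") // hypothetical sink class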
Example 53
Source File: LogAnalyticsMetricsSink.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.metrics.sink.loganalytics import java.util.Properties import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry import org.apache.spark.internal.Logging import org.apache.spark.metrics.sink.Sink import org.apache.spark.{SecurityManager, SparkException} private class LogAnalyticsMetricsSink( val property: Properties, val registry: MetricRegistry, securityMgr: SecurityManager) extends Sink with Logging { private val config = new LogAnalyticsSinkConfiguration(property) org.apache.spark.metrics.MetricsSystem.checkMinimalPollingPeriod(config.pollUnit, config.pollPeriod) var reporter = LogAnalyticsReporter.forRegistry(registry) .withWorkspaceId(config.workspaceId) .withWorkspaceKey(config.secret) .withLogType(config.logType) .build() override def start(): Unit = { reporter.start(config.pollPeriod, config.pollUnit) logInfo(s"LogAnalyticsMetricsSink started") } override def stop(): Unit = { reporter.stop() logInfo("LogAnalyticsMetricsSink stopped.") } override def report(): Unit = { reporter.report() } }
Example 54
Source File: MesosClusterManager.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.config._ import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { require(!sc.conf.get(IO_ENCRYPTION_ENABLED), "I/O encryption is currently not supported in Mesos.") val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
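A short sketch of the configuration this manager reacts to; the Mesos master host and port are hypothetical.
// Sketch only: any master URL starting with "mesos" is routed through MesosClusterManager.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("mesos://mesos-master.example.com:5050")
  .set("spark.mesos.coarse", "true") // default; "false" selects the fine-grained backend
  .setAppName("mesos-demo")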
Example 55
Source File: RWrappers.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 56
Source File: NumericParser.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
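A minimal usage sketch, assuming the caller sits inside the mllib namespace (the parser is package-private).
// Sketch only: parse a tuple holding a scalar and an array literal.
val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]
val values = parsed(1).asInstanceOf[Array[Double]]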
Example 57
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
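A short sketch of the public parse entry point shown above, covering both supported string formats.
// Sketch only: tuple format and the legacy pre-1.0 dense format.
import org.apache.spark.mllib.regression.LabeledPoint

val p1 = LabeledPoint.parse("(1.0,[1.5,0.0,3.0])") // label 1.0 with a dense vector
val p2 = LabeledPoint.parse("0.0, 2.0 4.0 6.0")    // "label, v1 v2 v3" legacy format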
Example 58
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
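A hedged reader sketch matching the options exercised by the suite above; the file path is hypothetical and spark is an existing SparkSession.
// Sketch only: load LIBSVM data into (label, features) columns.
val df = spark.read.format("libsvm")
  .option("numFeatures", "6") // optional; inferred from the data when omitted
  .load("data/sample_libsvm_data.txt")
df.select("label", "features").show(3)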
Example 59
Source File: NumericParserSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 60
Source File: CommitFailureTestRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 61
Source File: ThriftServerTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 62
Source File: UDTRegistration.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
Example 63
Source File: ScalaUDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
Example 64
Source File: UDTRegistrationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 65
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 66
Source File: YarnClusterManager.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
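A short sketch of the settings this manager dispatches on; the application name is arbitrary.
// Sketch only: "yarn" as the master plus the deploy mode selects the scheduler pair above.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("yarn")
  .set("spark.submit.deployMode", "client") // or "cluster"
  .setAppName("yarn-demo")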
Example 67
Source File: HDFSCredentialProviderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:8032@SPARKTEST.COM") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 68
Source File: UnionDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
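A minimal streaming sketch that ends up building a UnionDStream; the socket hosts are hypothetical.
// Sketch only: union two DStreams with the same slide duration.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(new SparkConf().setMaster("local[2]").setAppName("union-demo"), Seconds(10))
val a = ssc.socketTextStream("host1.example.com", 9999)
val b = ssc.socketTextStream("host2.example.com", 9999)
a.union(b).count().print() // ssc.union(Seq(a, b)) also works for many parents
ssc.start()
ssc.awaitTermination()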
Example 69
Source File: TransformedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 70
Source File: StreamingTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 71
Source File: RpcEndpointAddress.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
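RpcEndpointAddress is private[spark], so the sketch below only compiles from code placed in an org.apache.spark package; the host, port, and endpoint name are illustrative. It shows the round trip between the spark://name@host:port form and the parsed address:

package org.apache.spark.rpc

import org.apache.spark.SparkException

object RpcEndpointAddressSketch {
  def main(args: Array[String]): Unit = {
    val addr = RpcEndpointAddress("spark://driver@192.168.1.10:7077")
    assert(addr.name == "driver")
    assert(addr.rpcAddress == RpcAddress("192.168.1.10", 7077))
    assert(addr.toString == "spark://driver@192.168.1.10:7077")

    // Anything that is not exactly spark://name@host:port is rejected.
    try RpcEndpointAddress("http://192.168.1.10:7077")
    catch { case e: SparkException => println(e.getMessage) }
  }
}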
Example 72
Source File: RUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 73
Source File: RpcAddressSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 74
Source File: KryoSerializerResizableOutputSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
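Outside of tests, the usual remedy for the failure mode exercised above is to keep the initial buffer small and raise only the maximum, so Kryo can grow the output buffer on demand. A sketch with illustrative sizes:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "64k")      // per-core starting buffer
  .set("spark.kryoserializer.buffer.max", "128m") // ceiling before a SparkException is thrown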
Example 75
Source File: ProactiveClosureSerializationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 76
Source File: CoarseGrainedSchedulerBackendSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
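The error message asserted above points at the standard fix: ship large read-only data through a broadcast variable instead of capturing it in the task closure. A brief sketch, assuming an existing SparkContext named sc and a large local Map[Int, Int] named lookup (both assumptions, not part of the suite):

// `sc` and `lookup` are assumed to exist; only the pattern matters here.
val lookupBc = sc.broadcast(lookup)                 // sent once per executor, not once per task
val hits = sc.parallelize(1 to 1000)
  .map(i => lookupBc.value.getOrElse(i, 0))
  .reduce(_ + _)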
Example 77
Source File: FuncTestSparkNotebookContext.scala From uberdata with Apache License 2.0 | 5 votes |
package eleflow.uberdata.core import eleflow.uberdata.core.data.{DataTransformer, Dataset} import eleflow.uberdata.core.enums.DataSetType import org.apache.spark.rpc.netty.{BeforeAndAfterWithContext, TestSparkConf} import org.apache.spark.SparkException import org.scalatest._ class FuncTestSparkNotebookContext extends FlatSpec with BeforeAndAfterWithContext { this: Suite => val uberContext = context "Functional SparkNotebookContext" should "correctly load rdd" in { import Dataset._ val dataset = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") val testDataSet = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile2.csv") val (train, test, _) = DataTransformer.createLabeledPointFromRDD(dataset, testDataSet, "int", "id") val all = train.take(3) val (_, first) = all.head val (_, second) = all.tail.head assert(first.label == 5.0) assert(first.features.toArray.deep == Array[Double](0.0, 1.0, 10.5).deep) assert(second.label == 1.0) assert(second.features.toArray.deep == Array[Double](1.0, 0.0, 0.1).deep) val allTest = test.take(3) val (_, firstTest) = allTest.head val (_, secondTest) = allTest.tail.head assert(firstTest.label == 1.0) assert(firstTest.features.toArray.deep == Array[Double](0.0, 1.0, 10.5).deep) assert(secondTest.label == 2.0) assert(secondTest.features.toArray.deep == Array[Double](1.0, 0.0, 0.1).deep) } it should "Throw an exception when process an empty numeric column" in { @transient lazy val context = uberContext context.sparkContext try { import Dataset._ val dataset = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") dataset.take(3) } catch { case e: SparkException => assert(e.getMessage.contains("UnexpectedFileFormatException")) } } it should "Correct handle empty string values" in { @transient lazy val context = uberContext context.sparkContext val schemaRdd = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextEmpty.csv").toDataFrame val result = DataTransformer .createLabeledPointFromRDD(schemaRdd, Seq("int"), Seq("id"), DataSetType.Train) assert(result.count() == 3) } it should "Throw an exception when input have different number of columns" in { uberContext.sparkContext try { context .load(s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv", TestSparkConf.separator) } catch { case e: SparkException => assert(e.getMessage.contains("UnexpectedFileFormatException")) } } }
Example 78
Source File: MasterSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import akka.actor.Address
import org.scalatest.FunSuite

import org.apache.spark.{SSLOptions, SparkConf, SparkException}

class MasterSuite extends FunSuite {

  test("toAkkaUrl") {
    val conf = new SparkConf(loadDefaults = false)
    val akkaUrl = Master.toAkkaUrl("spark://1.2.3.4:1234", "akka.tcp")
    assert("akka.tcp://sparkMaster@1.2.3.4:1234/user/Master" === akkaUrl)
  }

  test("toAkkaUrl with SSL") {
    val conf = new SparkConf(loadDefaults = false)
    val akkaUrl = Master.toAkkaUrl("spark://1.2.3.4:1234", "akka.ssl.tcp")
    assert("akka.ssl.tcp://sparkMaster@1.2.3.4:1234/user/Master" === akkaUrl)
  }

  test("toAkkaUrl: a typo url") {
    val conf = new SparkConf(loadDefaults = false)
    val e = intercept[SparkException] {
      Master.toAkkaUrl("spark://1.2. 3.4:1234", "akka.tcp")
    }
    assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage)
  }

  test("toAkkaAddress") {
    val conf = new SparkConf(loadDefaults = false)
    val address = Master.toAkkaAddress("spark://1.2.3.4:1234", "akka.tcp")
    assert(Address("akka.tcp", "sparkMaster", "1.2.3.4", 1234) === address)
  }

  test("toAkkaAddress with SSL") {
    val conf = new SparkConf(loadDefaults = false)
    val address = Master.toAkkaAddress("spark://1.2.3.4:1234", "akka.ssl.tcp")
    assert(Address("akka.ssl.tcp", "sparkMaster", "1.2.3.4", 1234) === address)
  }

  test("toAkkaAddress: a typo url") {
    val conf = new SparkConf(loadDefaults = false)
    val e = intercept[SparkException] {
      Master.toAkkaAddress("spark://1.2. 3.4:1234", "akka.tcp")
    }
    assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage)
  }
}
Example 79
Source File: KryoSerializerResizableOutputSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.scalatest.FunSuite import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends FunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer.mb", "1") conf.set("spark.kryoserializer.buffer.max.mb", "1") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer.mb", "1") conf.set("spark.kryoserializer.buffer.max.mb", "2") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 80
Source File: ProactiveClosureSerializationSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.scalatest.FunSuite import org.apache.spark.{SharedSparkContext, SparkException} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T) = x.toString def pred[T](x: T) = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContext { def fixture = (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y=>uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y=>Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y=>uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y=>uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y=>uc.op(y))) }
Example 81
Source File: CoarseGrainedSchedulerBackendSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkContext} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} import org.scalatest.FunSuite class CoarseGrainedSchedulerBackendSuite extends FunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize","1") conf.set("spark.default.parallelism","1") sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 82
Source File: MutableURLClassLoaderSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.net.URLClassLoader import org.scalatest.FunSuite import org.apache.spark.{LocalSparkContext, SparkContext, SparkException, TestUtils} import org.apache.spark.util.Utils class MutableURLClassLoaderSuite extends FunSuite { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), toStringValue = "2")).toArray val urls = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1"), classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent toStringValue = "1", classpathUrls = urls2)).toArray test("child first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("parent first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new MutableURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("child first can fall back") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass3").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") } test("child first can fail") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) intercept[java.lang.ClassNotFoundException] { classLoader.loadClass("FakeClassDoesNotExist").newInstance() } } test("driver sets context class loader in local mode") { // Test the case where the driver program sets a context classloader and then runs a job // in local mode. This is what happens when ./spark-submit is called with "local" as the // master. val original = Thread.currentThread().getContextClassLoader val className = "ClassForDriverTest" val jar = TestUtils.createJarWithClasses(Seq(className)) val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader) Thread.currentThread().setContextClassLoader(contextLoader) val sc = new SparkContext("local", "driverLoaderTest") try { sc.makeRDD(1 to 5, 2).mapPartitions { x => val loader = Thread.currentThread().getContextClassLoader Class.forName(className, true, loader).newInstance() Seq().iterator }.count() } catch { case e: SparkException if e.getMessage.contains("ClassNotFoundException") => fail("Local executor could not find class", e) case t: Throwable => fail("Unexpected exception ", t) } sc.stop() Thread.currentThread().setContextClassLoader(original) } }
Example 83
Source File: Application.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkioProfileExampleMoreComplex import com.leobenkel.zparkio.Services._ import com.leobenkel.zparkio.ZparkioApp import com.leobenkel.zparkioProfileExampleMoreComplex.Application.APP_ENV import com.leobenkel.zparkioProfileExampleMoreComplex.Services.Database.Database import com.leobenkel.zparkioProfileExampleMoreComplex.Services.FileIO.FileIO import com.leobenkel.zparkioProfileExampleMoreComplex.Services._ import com.leobenkel.zparkioProfileExampleMoreComplex.Transformations.UserTransformations import izumi.reflect.Tag import org.apache.spark.SparkException import zio.{ZIO, ZLayer} trait Application extends ZparkioApp[Arguments, APP_ENV, Unit] { implicit lazy final override val tagC: Tag[Arguments] = Tag.tagFromTagMacro implicit lazy final override val tagEnv: Tag[APP_ENV] = Tag.tagFromTagMacro override protected def env: ZLayer[ZPARKIO_ENV, Throwable, APP_ENV] = FileIO.Live ++ Database.Live override protected def sparkFactory: FACTORY_SPARK = SparkBuilder override protected def loggerFactory: FACTORY_LOG = Logger.Factory(Log) override protected def makeCli(args: List[String]): Arguments = Arguments(args) override def runApp(): ZIO[COMPLETE_ENV, Throwable, Unit] = { for { _ <- Logger.info(s"--Start--") authors <- UserTransformations.getAuthors _ <- Logger.info(s"There are ${authors.count()} authors") } yield () } override def processErrors(f: Throwable): Option[Int] = { println(f) f.printStackTrace(System.out) f match { case _: SparkException => Some(10) case _: InterruptedException => Some(0) case _ => Some(1) } } } object Application { type APP_ENV = FileIO with Database }
Example 84
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.util.collection.OpenHashMap class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel] with StringIndexerBase { def this() = this(Identifiable.randomUID("strShortIdx")) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def fit(dataset: DataFrame): StringToShortIndexerModel = { val counts = dataset.select(col($(inputCol)).cast(StringType)) .map(_.getString(0)) .countByValue() val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") copyValues(new StringToShortIndexerModel(uid, labels).setParent(this)) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra) } class StringToShortIndexerModel ( override val uid: String, val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase { def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") private val labelToIndex: OpenHashMap[String, Short] = { val n = labels.length.toShort val map = new OpenHashMap[String, Short](n) var i: Short = 0 while (i < n) { map.update(labels(i), i) i = (i + 1).toShort } map } def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { if (!dataset.schema.fieldNames.contains($(inputCol))) { logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " + "Skip StringToShortIndexerModel.") return dataset } val indexer = udf { label: String => if (labelToIndex.contains(label)) { labelToIndex(label) } else { // TODO: handle unseen labels throw new SparkException(s"Unseen label: $label.") } } val outputColName = $(outputCol) val metadata = NominalAttribute.defaultAttr .withName(outputColName).withValues(labels).toMetadata() dataset.select(col("*"), indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { if (schema.fieldNames.contains($(inputCol))) { validateAndTransformSchema(schema) } else { // If the input column does not exist during transformation, we skip StringToShortIndexerModel. schema } } override def copy(extra: ParamMap): StringToShortIndexerModel = { val copied = new StringToShortIndexerModel(uid, labels) copyValues(copied, extra).setParent(parent) } }
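A hedged usage sketch for the estimator above, assuming a Spark 1.x SQLContext named sqlContext and illustrative column names: fit on a string column, then append the Short-valued index column described by the attached NominalAttribute metadata.

import org.apache.spark.ml.feature.StringToShortIndexer

val df = sqlContext.createDataFrame(Seq(
  (0, "a"), (1, "b"), (2, "a"), (3, "c"))).toDF("id", "category")

val indexer = new StringToShortIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val model = indexer.fit(df)   // labels are ordered by descending frequency
model.transform(df).show()    // labels unseen at fit time throw SparkException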
Example 85
Source File: DruidQueriesTab.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.sparklinedata.ui import org.apache.spark.sql.hive.thriftserver.sparklinedata.ui.DruidQueriesTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.sql.SPLLogging private[thriftserver] class DruidQueriesTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "druid") with SPLLogging { override val name = "Druid Query Details" val parent = getSparkUI(sparkContext) attachPage(new DruidQueriesPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[spark] object DruidQueriesTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 86
Source File: PowerBiSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import java.io.File import com.microsoft.ml.spark.Secrets import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.io.powerbi.PowerBIWriter import org.apache.spark.SparkException import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.{current_timestamp, lit} import scala.collection.JavaConverters._ class PowerBiSuite extends TestBase with FileReaderUtils { lazy val url: String = sys.env.getOrElse("MML_POWERBI_URL", Secrets.PowerbiURL) lazy val df: DataFrame = session .createDataFrame(Seq( (Some(0), "a"), (Some(1), "b"), (Some(2), "c"), (Some(3), ""), (None, "bad_row"))) .toDF("bar", "foo") .withColumn("baz", current_timestamp()) lazy val bigdf: DataFrame = (1 to 5).foldRight(df) { case (_, ldf) => ldf.union(df) }.repartition(2) lazy val delayDF: DataFrame = { val rows = Array.fill(100){df.collect()}.flatten.toList.asJava val df2 = session .createDataFrame(rows, df.schema) .coalesce(1).cache() df2.count() df2.map({x => Thread.sleep(10); x})(RowEncoder(df2.schema)) } test("write to powerBi", TestBase.BuildServer) { PowerBIWriter.write(df, url) } test("write to powerBi with delays"){ PowerBIWriter.write(delayDF, url) } test("using dynamic minibatching"){ PowerBIWriter.write(delayDF, url, Map("minibatcher"->"dynamic", "maxBatchSize"->"50")) } test("using timed minibatching"){ PowerBIWriter.write(delayDF, url, Map("minibatcher"->"timed")) } test("using consolidated timed minibatching"){ PowerBIWriter.write(delayDF, url, Map( "minibatcher"->"timed", "consolidate"->"true")) } test("using buffered batching"){ PowerBIWriter.write(delayDF, url, Map("buffered"->"true")) } ignore("throw useful error message when given an improper dataset") { //TODO figure out why this does not throw errors on the build machine assertThrows[SparkException] { PowerBIWriter.write(df.withColumn("bad", lit("foo")), url) } } test("stream to powerBi", TestBase.BuildServer) { bigdf.write.parquet(tmpDir + File.separator + "powerBI.parquet") val sdf = session.readStream.schema(df.schema).parquet(tmpDir + File.separator + "powerBI.parquet") val q1 = PowerBIWriter.stream(sdf, url).start() q1.processAllAvailable() } }
Example 87
Source File: MesosClusterManager.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.config._ import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { require(!sc.conf.get(IO_ENCRYPTION_ENABLED), "I/O encryption is currently not supported in Mesos.") val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 88
Source File: RWrappers.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 89
Source File: NumericParser.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import java.util.StringTokenizer

import scala.collection.mutable.{ArrayBuilder, ListBuffer}

import org.apache.spark.SparkException

private[mllib] object NumericParser {

  def parse(s: String): Any = {
    val tokenizer = new StringTokenizer(s, "()[],", true)
    if (tokenizer.hasMoreTokens()) {
      val token = tokenizer.nextToken()
      if (token == "(") {
        parseTuple(tokenizer)
      } else if (token == "[") {
        parseArray(tokenizer)
      } else {
        // expecting a number
        parseDouble(token)
      }
    } else {
      throw new SparkException(s"Cannot find any token from the input string.")
    }
  }

  private def parseArray(tokenizer: StringTokenizer): Array[Double] = {
    val values = ArrayBuilder.make[Double]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "]") {
        parsing = false
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else {
        // expecting a number
        values += parseDouble(token)
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"An array must end with ']'.")
    }
    values.result()
  }

  private def parseTuple(tokenizer: StringTokenizer): Seq[_] = {
    val items = ListBuffer.empty[Any]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "(") {
        items.append(parseTuple(tokenizer))
        allowComma = true
      } else if (token == "[") {
        items.append(parseArray(tokenizer))
        allowComma = true
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else if (token == ")") {
        parsing = false
      } else if (token.trim.isEmpty) {
        // ignore whitespaces between delim chars, e.g. ", ["
      } else {
        // expecting a number
        items.append(parseDouble(token))
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"A tuple must end with ')'.")
    }
    items
  }

  private def parseDouble(s: String): Double = {
    try {
      java.lang.Double.parseDouble(s)
    } catch {
      case e: NumberFormatException =>
        throw new SparkException(s"Cannot parse a double from: $s", e)
    }
  }
}
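NumericParser is private[mllib], so the sketch below assumes it runs from code compiled inside an org.apache.spark.mllib package; the input string is the same textual form used for MLlib vectors and labeled points:

val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]          // 1.0
val values = parsed(1).asInstanceOf[Array[Double]]  // Array(2.0, 3.0)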
Example 90
Source File: LabeledPoint.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

@Since("0.8.0")
@BeanInfo
case class LabeledPoint @Since("1.0.0") (
    @Since("0.8.0") label: Double,
    @Since("1.0.0") features: Vector) {

  override def toString: String = {
    s"($label,$features)"
  }

  private[spark] def asML: NewLabeledPoint = {
    NewLabeledPoint(label, features)
  }
}

@Since("1.1.0")
object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
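Both textual formats handled by parse in one short sketch (the values are illustrative):

import org.apache.spark.mllib.regression.LabeledPoint

val v1 = LabeledPoint.parse("(1.0,[0.5,0.0,2.0])")  // tuple form, routed through NumericParser
val v0 = LabeledPoint.parse("0.0, 1.1 2.2 3.3")     // dense format used before v1.0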
Example 91
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 92
Source File: NumericParserSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 93
Source File: CommitFailureTestRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 94
Source File: ThriftServerTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.monitor.ThriftServerMonitor import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(userName: String, sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) // ThriftServerTab renders by different listener's content, identified by user. val listener = ThriftServerMonitor.getListener(userName) attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 95
Source File: ThriftServerMonitor.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.monitor import scala.collection.mutable.HashMap import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab object ThriftServerMonitor extends Logging { private[this] val uiTabs = new HashMap[String, ThriftServerTab]() private[this] val listeners = new HashMap[String, ThriftServerListener]() def setListener(user: String, sparkListener: ThriftServerListener): Unit = { listeners.put(user, sparkListener) } def getListener(user: String): ThriftServerListener = { listeners.getOrElse(user, throw new SparkException(s"Listener does not init for user[$user]")) } def addUITab(user: String, ui: ThriftServerTab): Unit = { uiTabs.put(user, ui) } def detachUITab(user: String): Unit = { listeners.remove(user) uiTabs.get(user).foreach(_.detach()) } def detachAllUITabs(): Unit = { uiTabs.values.foreach(_.detach()) } }
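The registry is keyed by user name, so a session setup path would roughly do the following (the user name is illustrative, and `listener` is assumed to be a ThriftServerListener created elsewhere):

ThriftServerMonitor.setListener("alice", listener)      // register when alice's session starts
val perUser = ThriftServerMonitor.getListener("alice")  // later lookups; throws SparkException if never set
ThriftServerMonitor.detachUITab("alice")                // drop the listener and detach the UI tab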
Example 96
Source File: UDTRegistration.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types

import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

private[spark] object UDTRegistration extends Serializable with Logging {

  // Default mappings from user classes to their UDTs (the ml.linalg types
  // exercised in UDTRegistrationSuite below).
  private lazy val udtMap: mutable.Map[String, String] = mutable.Map(
    ("org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.MatrixUDT"),
    ("org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.MatrixUDT"),
    ("org.apache.spark.ml.linalg.SparseMatrix", "org.apache.spark.ml.linalg.MatrixUDT"))

  def exists(userClassName: String): Boolean = udtMap.contains(userClassName)

  def register(userClass: String, udtClass: String): Unit = {
    if (udtMap.contains(userClass)) {
      logWarning(s"Cannot register UDT for ${userClass}, which is already registered.")
    } else {
      // Registration works on class names only, so whether udtClass really is a
      // UserDefinedType is checked later, in getUDTFor.
      udtMap += ((userClass, udtClass))
    }
  }

  def getUDTFor(userClass: String): Option[Class[_]] = {
    udtMap.get(userClass).map { udtClassName =>
      if (Utils.classIsLoadable(udtClassName)) {
        val udtClass = Utils.classForName(udtClassName)
        if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) {
          udtClass
        } else {
          throw new SparkException(
            s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " +
              s"an UserDefinedType for ${userClass}")
        }
      } else {
        throw new SparkException(
          s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.")
      }
    }
  }
}
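Because the registry maps plain class names, registration is a one-liner; a sketch with hypothetical class names (the UDT class is only validated later, inside getUDTFor):

UDTRegistration.register("com.example.GeoPoint", "com.example.GeoPointUDT")
assert(UDTRegistration.exists("com.example.GeoPoint"))
// getUDTFor throws SparkException if com.example.GeoPointUDT is not loadable
// or does not extend UserDefinedType.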
Example 97
Source File: ScalaUDFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
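The NPE wrapped by the "Failed to execute user defined function" message comes from calling toLowerCase on a null input; a small sketch of the null-safe version of the same UDF:

import org.apache.spark.sql.functions.udf

// Returns null for null input instead of throwing, so the wrapped
// SparkException from the test above never occurs.
val safeLower = udf((s: String) => Option(s).map(_.toLowerCase).orNull)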
Example 98
Source File: UDTRegistrationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 99
Source File: YarnClusterManager.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 100
Source File: HDFSCredentialProviderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.scalatest.{Matchers, PrivateMethodTester}

import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}

class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers {
  private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer)

  private def getTokenRenewer(
      hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = {
    hdfsCredentialProvider invokePrivate _getTokenRenewer(conf)
  }

  private var hdfsCredentialProvider: HDFSCredentialProvider = null

  override def beforeAll() {
    super.beforeAll()
    if (hdfsCredentialProvider == null) {
      hdfsCredentialProvider = new HDFSCredentialProvider()
    }
  }

  override def afterAll() {
    if (hdfsCredentialProvider != null) {
      hdfsCredentialProvider = null
    }
    super.afterAll()
  }

  test("check token renewer") {
    val hadoopConf = new Configuration()
    hadoopConf.set("yarn.resourcemanager.address", "myrm:8033")
    hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM")
    val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf)
    renewer should be ("yarn/myrm:8032@SPARKTEST.COM")
  }

  test("check token renewer default") {
    val hadoopConf = new Configuration()
    val caught = intercept[SparkException] {
      getTokenRenewer(hdfsCredentialProvider, hadoopConf)
    }
    assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer")
  }
}
Example 101
Source File: UnionDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 102
Source File: TransformedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 103
Source File: StreamingTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 104
Source File: RpcEndpointAddress.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
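RpcEndpointAddress is private[spark], so the sketch below reproduces only its URL validation with plain java.net.URI; it is an approximation for illustration, not the Spark implementation.

import java.net.{URI, URISyntaxException}

// A Spark RPC URL must look like spark://name@host:port with no path, query or fragment.
def isValidSparkUrl(sparkUrl: String): Boolean =
  try {
    val uri = new URI(sparkUrl)
    uri.getScheme == "spark" && uri.getHost != null && uri.getPort >= 0 &&
      uri.getUserInfo != null && Option(uri.getPath).forall(_.isEmpty) &&
      uri.getFragment == null && uri.getQuery == null
  } catch {
    case _: URISyntaxException => false
  }
// isValidSparkUrl("spark://driver@192.168.0.1:7077")  // true
// isValidSparkUrl("spark://192.168.0.1:7077")         // false: endpoint name missing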
Example 105
Source File: RUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 106
Source File: RpcAddressSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 107
Source File: KryoSerializerResizableOutputSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
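The failing case above is reproduced by capping spark.kryoserializer.buffer.max at the initial buffer size. A minimal configuration sketch of the fix; the sizes are illustrative, not recommendations:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "64k")     // initial per-task buffer
  .set("spark.kryoserializer.buffer.max", "64m") // must exceed the largest serialized object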
Example 108
Source File: ProactiveClosureSerializationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
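The suite relies on the closure cleaner proactively raising a "Task not serializable" SparkException. A sketch of the two usual remedies, assuming the same UnserializableClass shape as above:

import org.apache.spark.rdd.RDD

// Remedy 1: make the helper Serializable so the captured reference can ship to executors.
class SerializableOp extends Serializable {
  def op[T](x: T): String = x.toString
}

// Remedy 2: capture only what the closure needs in a local val, not the enclosing instance.
def fixedMap(data: RDD[String]): RDD[String] = {
  val op = new SerializableOp
  data.map(x => op.op(x))
}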
Example 109
Source File: CoarseGrainedSchedulerBackendSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
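The exception message checked above points at the standard remedy: ship large read-only values as broadcast variables instead of capturing them in the task closure. A short sketch, with names chosen for illustration:

import org.apache.spark.SparkContext

def withBroadcast(sc: SparkContext, big: Array[Byte]): Long = {
  val bc = sc.broadcast(big)                        // sent once per executor, not per task
  sc.parallelize(1 to 4).map(_ => bc.value.length).count()
}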
Example 110
Source File: NumericParser.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
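NumericParser is private[mllib], so the usage sketch below only compiles from inside the org.apache.spark.mllib package; malformed input surfaces as the SparkException shown above.

// "(label,[values])" parses into a Seq whose elements are Doubles and Array[Double]s.
val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]          // 1.0
val values = parsed(1).asInstanceOf[Array[Double]]  // Array(2.0, 3.0)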
Example 111
Source File: LabeledPoint.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @BeanInfo case class LabeledPoint(label: Double, features: Vector) object LabeledPoint { def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
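A short usage sketch of the public LabeledPoint.parse API: both the tuple form and the pre-1.0 dense form are accepted; anything else ends in the SparkException above.

import org.apache.spark.mllib.regression.LabeledPoint

val p1 = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")  // label 1.0, dense features [1.0, 0.0, 3.0]
val p2 = LabeledPoint.parse("1.0, 2.5 3.0 4.5")     // legacy dense format used before v1.0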
Example 112
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
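A minimal VectorAssembler sketch mirroring the suite, assuming an existing SQLContext named sqlContext; non-numeric or null inputs fail at assembly time with a SparkException, which is what the intercepts above check.

import org.apache.spark.ml.feature.VectorAssembler

val df = sqlContext.createDataFrame(Seq((0, 1.0, 3.0))).toDF("id", "x", "y")
val assembler = new VectorAssembler()
  .setInputCols(Array("x", "y"))
  .setOutputCol("features")
assembler.transform(df).select("features").show()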
Example 113
Source File: NumericParserSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) println(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 114
Source File: ThriftServerTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sql") with Logging { override val name = "SQL" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 115
Source File: StreamingTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{JettyUtils, SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) var staticHandler: ServletContextHandler = null def attach() { getSparkUI(ssc).attachTab(this) staticHandler = JettyUtils.createStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") getSparkUI(ssc).attachHandler(staticHandler) } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).detachHandler(staticHandler) staticHandler = null } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 116
Source File: KryoSerializerResizableOutputSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 117
Source File: ProactiveClosureSerializationSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 118
Source File: CoarseGrainedSchedulerBackendSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 119
Source File: MutableURLClassLoaderSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.net.URLClassLoader import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils} class MutableURLClassLoaderSuite extends SparkFunSuite { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), toStringValue = "2")).toArray val urls = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1"), classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent toStringValue = "1", classpathUrls = urls2)).toArray test("child first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("parent first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new MutableURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("child first can fall back") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass3").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") } test("child first can fail") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) intercept[java.lang.ClassNotFoundException] { classLoader.loadClass("FakeClassDoesNotExist").newInstance() } } test("driver sets context class loader in local mode") { // Test the case where the driver program sets a context classloader and then runs a job // in local mode. This is what happens when ./spark-submit is called with "local" as the // master. val original = Thread.currentThread().getContextClassLoader val className = "ClassForDriverTest" val jar = TestUtils.createJarWithClasses(Seq(className)) val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader) Thread.currentThread().setContextClassLoader(contextLoader) val sc = new SparkContext("local", "driverLoaderTest") try { sc.makeRDD(1 to 5, 2).mapPartitions { x => val loader = Thread.currentThread().getContextClassLoader Class.forName(className, true, loader).newInstance() Seq().iterator }.count() } catch { case e: SparkException if e.getMessage.contains("ClassNotFoundException") => fail("Local executor could not find class", e) case t: Throwable => fail("Unexpected exception ", t) } sc.stop() Thread.currentThread().setContextClassLoader(original) } }
Example 120
Source File: DataFrameToFileWriter.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import ai.deepsense.commons.utils.LoggerForCallerClass import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.WriteFileException import ai.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import ai.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import ai.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import ai.deepsense.deeplang.exceptions.DeepLangException import ai.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
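The writer above translates executor-side failures (surfaced on the driver as SparkException) into a domain exception. A library-agnostic sketch of the same pattern with plain Spark SQL APIs; the path handling and wrapper exception are illustrative:

import org.apache.spark.SparkException
import org.apache.spark.sql.{DataFrame, SaveMode}

def writeCsvOrFail(df: DataFrame, path: String): Unit =
  try {
    df.write.mode(SaveMode.Overwrite).option("header", "true").csv(path)
  } catch {
    case e: SparkException => throw new RuntimeException(s"Unable to write file to $path", e)
  }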
Example 121
Source File: CSVToAvroTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.csv import com.salesforce.op.test.{Passenger, TestSparkContext} import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CSVToAvroTest extends FlatSpec with TestSparkContext { val avroSchema: String = loadFile(s"$resourceDir/PassengerSchemaModifiedDataTypes.avsc") val csvReader: CSVInOut = new CSVInOut(CSVOptions(header = true)) lazy val csvRDD: RDD[Seq[String]] = csvReader.readRDD(s"$resourceDir/PassengerDataModifiedDataTypes.csv") lazy val csvFileRecordCount: Long = csvRDD.count Spec(CSVToAvro.getClass) should "convert RDD[Seq[String]] to RDD[GenericRecord]" in { val res = CSVToAvro.toAvro(csvRDD, avroSchema) res shouldBe a[RDD[_]] res.count shouldBe csvFileRecordCount } it should "convert RDD[Seq[String]] to RDD[T]" in { val res = CSVToAvro.toAvroTyped[Passenger](csvRDD, avroSchema) res shouldBe a[RDD[_]] res.count shouldBe csvFileRecordCount } it should "throw an error for nested schema" in { val invalidAvroSchema = loadFile(s"$resourceDir/PassengerSchemaNestedTypeCSV.avsc") val exceptionMsg = "CSV should be a flat file and not have nested records (unsupported column(Sex schemaType=ENUM)" val error = intercept[SparkException](CSVToAvro.toAvro(csvRDD, invalidAvroSchema).count()) error.getCause.getMessage shouldBe exceptionMsg } it should "throw an error for mis-matching schema fields" in { val invalidAvroSchema = loadFile(s"$resourceDir/PassengerSchemaInvalidField.avsc") val error = intercept[SparkException](CSVToAvro.toAvro(csvRDD, invalidAvroSchema).count()) error.getCause.getMessage shouldBe "Mismatch number of fields in csv record and avro schema" } it should "throw an error for bad data" in { val invalidDataRDD = csvReader.readRDD(s"$resourceDir/PassengerDataContentTypeMisMatch.csv") val error = intercept[SparkException](CSVToAvro.toAvro(invalidDataRDD, avroSchema).count()) error.getCause.getMessage shouldBe "Boolean column not actually a boolean. Invalid value: 'fail'" } }
Example 122
Source File: PredictionDescalerTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.SparkException import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) class PredictionDescalerTransformerTest extends OpTransformerSpec[Real, PredictionDescaler[Real, Real]] { val predictionData = Seq(-1.0, 0.0, 1.0, 2.0).map(Prediction(_)) val featureData = Seq(0.0, 1.0, 2.0, 3.0).map(_.toReal) val (testData, p, f1) = TestFeatureBuilder[Prediction, Real](predictionData zip featureData) val scalerMetadata = ScalerMetadata(ScalingType.Linear, LinearScalerArgs(slope = 4.0, intercept = 1.0)).toMetadata() val colWithMetadata = testData.col(f1.name).as(f1.name, scalerMetadata) val inputData = testData.withColumn(f1.name, colWithMetadata) val transformer = new PredictionDescaler[Real, Real]().setInput(p, f1) val expectedResult: Seq[Real] = Seq(-0.5, -0.25, 0.0, 0.25).map(_.toReal) it should "error on missing scaler metadata" in { val (df, p, f1) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(Prediction(_)) zip Seq(0.0, 0.0, 0.0).map(Real(_))) val error = intercept[SparkException]( new PredictionDescaler[Real, Real]().setInput(p, f1).transform(df).collect() ) error.getCause should not be null error.getCause shouldBe a[RuntimeException] error.getCause.getMessage shouldBe s"Failed to extract scaler metadata for input feature '${f1.name}'" } it should "descale and serialize log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Prediction](v => Prediction(v.value.getOrElse(Double.NaN) + 1), operationName = "shift") val descaledPrediction = new PredictionDescaler[Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledPrediction) val wfModel = workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(0.0, 1.0, 2.0, 3.0).map(_ * math.E) all(actual.zip(expected).map(x => math.abs(x._2 - x._1))) should be < 0.0001 } it should "descale and serialize linear-scaling workflow" in { val scalingArgs = LinearScalerArgs(slope = 2.0, intercept = 0.0) val linearScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Linear, scalingArgs = scalingArgs ).setInput(f1) val scaledResponse = linearScaler.getOutput() val metadata = linearScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Linear, scalingArgs) } val shifted = scaledResponse.map[Prediction](v => Prediction(v.value.getOrElse(Double.NaN) + 1), operationName = "shift") val descaledPrediction = new PredictionDescaler[Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledPrediction) val wfModel = 
workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(0.5, 1.5, 2.5, 3.5) actual shouldBe expected } it should "work with its shortcut" in { val descaled = p.descale[Real, Real](f1) val transformed = descaled.originStage.asInstanceOf[PredictionDescaler[Real, Real]].transform(inputData) val actual = transformed.collect(descaled) actual shouldEqual expectedResult.toArray } }
Example 123
Source File: DescalerTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.SparkException import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) class DescalerTransformerTest extends OpTransformerSpec[Real, DescalerTransformer[Real, Real, Real]] { val (testData, f1) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(_.toReal)) val scalerMetadata = ScalerMetadata(ScalingType.Linear, LinearScalerArgs(slope = 2.0, intercept = 3.0)).toMetadata() val colWithMetadata = testData.col(f1.name).as(f1.name, scalerMetadata) val inputData = testData.withColumn(f1.name, colWithMetadata) val transformer = new DescalerTransformer[Real, Real, Real]().setInput(f1, f1) val expectedResult: Seq[Real] = Seq(0.5, -1.0, -1.5).map(_.toReal) it should "error on missing scaler metadata" in { val (df, f) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(_.toReal)) val error = intercept[SparkException]( new DescalerTransformer[Real, Real, Real]().setInput(f, f).transform(df).collect() ) error.getCause should not be null error.getCause shouldBe a[RuntimeException] error.getCause.getMessage shouldBe s"Failed to extract scaler metadata for input feature '${f1.name}'" } it should "descale and work in log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Real](v => v.value.map(_ + 1).toReal, operationName = "shift") val descaledResponse = new DescalerTransformer[Real, Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledResponse) val wfModel = workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(4.0, 1.0, 0.0).map(_ * math.E) all(actual.zip(expected).map(x => math.abs(x._2 - x._1))) should be < 0.0001 } it should "work with its shortcut" in { val descaled = f1.descale[Real, Real](f1) val transformed = descaled.originStage.asInstanceOf[DescalerTransformer[Real, Real, Real]].transform(inputData) val actual = transformed.collect(descaled) actual shouldEqual expectedResult.toArray } }
Example 124
Source File: KyuubiSessionTab.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.ui.KyuubiSessionTab._ import yaooqinn.kyuubi.ui.{KyuubiServerListener, KyuubiServerMonitor} class KyuubiSessionTab(userName: String, sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), userName) { override val name = s"Kyuubi Tab 4 $userName" val parent = getSparkUI(sparkContext) // KyuubiSessionTab renders by different listener's content, identified by user. val listener = KyuubiServerMonitor.getListener(userName).getOrElse { val lr = new KyuubiServerListener(sparkContext.conf) KyuubiServerMonitor.setListener(userName, lr) lr } attachPage(new KyuubiSessionPage(this)) attachPage(new KyuubiSessionSubPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } object KyuubiSessionTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 125
Source File: SparkContextReflectionSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite} import yaooqinn.kyuubi.utils.ReflectUtils class SparkContextReflectionSuite extends SparkFunSuite { test("SparkContext initialization with default constructor") { val conf = new SparkConf(loadDefaults = true).setMaster("local").setAppName("sc_init") val sc = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf)) .asInstanceOf[SparkContext] assert(sc.isInstanceOf[SparkContext]) sc.stop() } test("SparkContext initialization with this()") { intercept[SparkException](ReflectUtils .instantiateClassByName(classOf[SparkContext].getName) .asInstanceOf[SparkContext]) } test("SparkContext initialization with app name & master & conf") { val conf = new SparkConf(loadDefaults = true) val sc = ReflectUtils .newInstance( classOf[SparkContext].getName, Seq(classOf[String], classOf[String], classOf[SparkConf]), Seq("local", "sc_init", conf)) .asInstanceOf[SparkContext] assert(sc.isInstanceOf[SparkContext]) sc.stop() } test("Initializing 2 SparkContext with Reflection") { val conf1 = new SparkConf(loadDefaults = true) .setMaster("local").setAppName("sc1").set("spark.driver.allowMultipleContexts", "true") val sc1 = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf1)) .asInstanceOf[SparkContext] val conf2 = new SparkConf(loadDefaults = true) .setMaster("local").setAppName("sc2").set("spark.driver.allowMultipleContexts", "true") val sc2 = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf2)) .asInstanceOf[SparkContext] assert(sc1 !== sc2) sc1.stop() sc2.stop() } }
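ReflectUtils is Kyuubi-specific; the calls above reduce to plain Java reflection, roughly as sketched below. Invoking the no-arg constructor fails because no master or app name is configured, which is the wrapped SparkException the second test expects.

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("local").setAppName("reflect-demo")
val sc = classOf[SparkContext]
  .getConstructor(classOf[SparkConf])
  .newInstance(conf)
sc.stop()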
Example 126
Source File: NumericParser.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 127
Source File: LabeledPoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @BeanInfo case class LabeledPoint(label: Double, features: Vector) object LabeledPoint { @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 128
Source File: NumericParserSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { // parse val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { // parsing with whitespace val s = "(0.0, [1.0, 2.0])" // numeric parsing val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 129
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils { override def _sqlContext: SQLContext = TestHive private val sqlContext = _sqlContext // When committing a task, `CommitFailureTestSource` throws an exception for testing purposes. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName // commitTask() failure should fall back to abortTask() test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce the partition number to 1 to ensure that only a single task is issued. This // prevents a race condition that happens when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 130
Source File: ThriftServerTab.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 131
Source File: StreamingTab.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 132
Source File: RUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import scala.collection.JavaConversions._ import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Seq("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 133
Source File: LocalRDDCheckpointData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext} import org.apache.spark.storage.{RDDBlockId, StorageLevel} import org.apache.spark.util.Utils private[spark] object LocalRDDCheckpointData { def transformStorageLevel(level: StorageLevel): StorageLevel = { // If this RDD is to be cached off-heap, fail fast since we cannot provide any // correctness guarantees about subsequent computations after the first one if (level.useOffHeap) { throw new SparkException("Local checkpointing is not compatible with off-heap caching.") } StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication) } }
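Local checkpointing is driven from the public RDD API; the helper above rewrites the storage level and rejects off-heap caching with a SparkException. A minimal sketch, assuming an existing SparkContext:

import org.apache.spark.SparkContext

def localCheckpointExample(sc: SparkContext): Long = {
  val rdd = sc.parallelize(1 to 1000)
  rdd.localCheckpoint() // marked now, materialized on the first action; off-heap levels are rejected
  rdd.count()
}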
Example 134
Source File: ProactiveClosureSerializationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 135
Source File: StarryClosureCleaner.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.util import org.apache.spark.internal.Logging import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable object StarryClosureCleaner extends Logging { val serializableMap: LRUCache[String, Boolean] = new LRUCache[String, Boolean](100000) // Check whether a class represents a Scala closure private def isClosure(cls: Class[_]): Boolean = { cls.getName.contains("$anonfun$") } def clean( closure: AnyRef, checkSerializable: Boolean = true, cleanTransitively: Boolean = true): Unit = { clean(closure, checkSerializable, cleanTransitively, mutable.Map.empty) } private def clean( func: AnyRef, checkSerializable: Boolean, cleanTransitively: Boolean, accessedFields: mutable.Map[Class[_], mutable.Set[String]]): Unit = { if (!isClosure(func.getClass)) { logWarning("Expected a closure; got " + func.getClass.getName) return } if (func == null) { return } if (checkSerializable) { ensureSerializable(func) } } private def ensureSerializable(func: AnyRef) { if (!serializableMap.containsKey(func.getClass.getCanonicalName)) { try { if (SparkEnv.get != null) { SparkEnv.get.closureSerializer.newInstance().serialize(func) serializableMap.put(func.getClass.getCanonicalName, true) } } catch { case ex: Exception => throw new SparkException("Task not serializable", ex) } } } case class LRUCache[K, V](cacheSize: Int) extends util.LinkedHashMap[K, V] { override def removeEldestEntry(eldest: util.Map.Entry[K, V]): Boolean = size > cacheSize } }
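The ensureSerializable check above goes through SparkEnv's closure serializer, which is internal API. A library-agnostic approximation using plain Java serialization, for illustration only:

import java.io.{ByteArrayOutputStream, NotSerializableException, ObjectOutputStream}
import org.apache.spark.SparkException

def ensureJavaSerializable(func: AnyRef): Unit =
  try {
    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(func)
  } catch {
    case e: NotSerializableException => throw new SparkException("Task not serializable", e)
  }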
Example 136
Source File: QuerySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import org.scalatest.FunSuite import scala.collection.mutable.ArrayBuffer abstract class QuerySuite extends FunSuite with Logging { case class TestCase(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String], answersSize: Int) { def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) = this(program, query, data, null, answersSize) def this(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String]) = this(program, query, data, answers, answers.size) } def runTest(testCase: TestCase): Unit = runTests(Seq(testCase)) def runTests(testCases: Seq[TestCase]): Unit = { val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf() .set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "../logs") .set("spark.ui.enabled", "false") .set("spark.sql.shuffle.partitions", "5") .setAll(Map.empty[String, String]) ) val bigDatalogCtx = new BigDatalogContext(sparkCtx) var count: Int = 1 for (testCase <- testCases) { bigDatalogCtx.loadProgram(testCase.program) for ((relationName, data) <- testCase.data) { val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName) if (relationInfo == null) throw new SparkException("You are attempting to load an unknown relation.") bigDatalogCtx.registerAndLoadTable(relationName, data, bigDatalogCtx.conf.numShufflePartitions) } val query = testCase.query val answers = testCase.answers logInfo("========== START BigDatalog Query " + count + " START ==========") val program = bigDatalogCtx.query(query) val results = program.execute().collect() // for some test cases we will only know the size of the answer set, not the actual answers if (answers == null) { assert(results.size == testCase.answersSize) } else { if (results.size != answers.size) { displayDifferences(results.map(_.toString), answers) // yes this will fail assert(results.size == answers.size) } else { for (result <- results) assert(answers.contains(result.toString())) } val resultStrings = results.map(_.toString).toSet for (answer <- answers) assert(resultStrings.contains(answer.toString())) } logInfo("========== END BigDatalog Query " + count + " END ==========\n") count += 1 bigDatalogCtx.reset() } sparkCtx.stop() } private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = { val missingAnswers = new ArrayBuffer[String] val missingResults = new ArrayBuffer[String] for (result <- results) if (!answers.contains(result)) missingAnswers += result for (answer <- answers) if (!results.contains(answer)) missingResults += answer if (missingAnswers.nonEmpty) logInfo("Results not in Answers: " + missingAnswers.mkString(", ")) if (missingResults.nonEmpty) logInfo("Answers not in Results: " + missingResults.mkString(", ")) } }
Example 137
Source File: NumericParser.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 138
Source File: LabeledPoint.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

// Represents the label and feature vector of a single data point.
@Since("0.8.0")
@BeanInfo
case class LabeledPoint(label: Double, features: Vector) {
  override def toString: String = {
    s"($label,$features)"
  }
}

object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
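Both input formats handled by parse can be exercised directly; a short sketch:

// Tuple form delegating to NumericParser: "(label,[features])".
val p1 = LabeledPoint.parse("(1.0,[0.5,0.25])")   // LabeledPoint(1.0, [0.5,0.25])

// Legacy dense form used before v1.0: "label, f1 f2 f3".
val p2 = LabeledPoint.parse("0.0, 1.0 2.0 3.0")   // LabeledPoint(0.0, [1.0,2.0,3.0])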
Example 139
Source File: NumericParserSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 140
Source File: CommitFailureTestRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 141
Source File: ThriftServerTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 142
Source File: UnionDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) } else { None } } }
Example 143
Source File: TransformedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.length > 0, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 144
Source File: StreamingTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 145
Source File: RpcEndpointAddress.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import org.apache.spark.SparkException import org.apache.spark.rpc.RpcAddress private[netty] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[netty] object RpcEndpointAddress { def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
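A brief sketch of the companion's apply, called from within the org.apache.spark.rpc.netty package since the class is package-private; only URLs of the exact form spark://name@host:port are accepted:

// Parses into RpcEndpointAddress(RpcAddress("example.com", 7077), "driver").
val ok = RpcEndpointAddress("spark://driver@example.com:7077")
assert(ok.toString == "spark://driver@example.com:7077")

// No user-info part means no endpoint name, so validation fails.
try {
  RpcEndpointAddress("spark://example.com:7077")
} catch {
  case e: org.apache.spark.SparkException => println(e.getMessage) // Invalid Spark URL: ...
}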
Example 146
Source File: RUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 147
Source File: MemoryCheckpointRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.storage.RDDBlockId
import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext}

import scala.reflect.ClassTag

// We use a different class than LocalCheckpointRDD, but the same functionality,
// so that we can easily identify (e.g., pattern match) this class in DAGScheduler.
class MemoryCheckpointRDD[T: ClassTag](sc: SparkContext, rddId: Int, numPartitions: Int)
  extends LocalCheckpointRDD[T](sc, rddId, numPartitions) {

  def this(rdd: RDD[T]) {
    this(rdd.context, rdd.id, rdd.partitions.size)
  }

  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
    throw new SparkException(
      s"Checkpoint block ${RDDBlockId(rddId, partition.index)} not found! Either the executor " +
      s"that originally checkpointed this partition is no longer alive, or the original RDD is " +
      s"unpersisted. If this problem persists, you may consider using `rdd.checkpoint()` " +
      s"or `rdd.localCheckpoint()` instead, which are slower than memory checkpointing but " +
      s"more fault-tolerant.")
  }
}
Example 148
Source File: RpcAddressSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 149
Source File: KryoSerializerResizableOutputSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
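The two tests differ only in spark.kryoserializer.buffer.max; in application code the same failure is avoided by raising that ceiling. A minimal configuration sketch (the 64m value is an arbitrary example):

import org.apache.spark.{SparkConf, SparkContext}

// Let the Kryo output buffer start small but grow well past the serialized payload size.
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "1m")
  .set("spark.kryoserializer.buffer.max", "64m")
val sc = new SparkContext("local", "kryo-buffer-example", conf)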
Example 150
Source File: CoarseGrainedSchedulerBackendSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
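The error message asserted on above points to the usual remedy: ship a large read-only value to executors once as a broadcast variable instead of capturing it in every task closure. A minimal sketch, assuming an existing SparkContext sc:

// Build the large lookup once on the driver and broadcast it to executors.
val lookup: Map[Int, String] = (1 to 100000).map(i => i -> i.toString).toMap
val bcLookup = sc.broadcast(lookup)

// Tasks reference bcLookup.value instead of serializing the map into every closure.
val resolved = sc.parallelize(1 to 4).map(i => bcLookup.value.getOrElse(i, "missing")).collect()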
Example 151
Source File: CoarseGrainedSchedulerBackendSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 152
Source File: ThriftServerTabSeq.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2Seq.HiveThriftServer2ListenerSeq import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SequilaThriftServer} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTabSeq(sparkContext: SparkContext, list: HiveThriftServer2ListenerSeq) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "SeQuiLa JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = list attachPage(new ThriftServerPageSeq(this)) attachPage(new ThriftServerSessionPageSeq(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 153
Source File: SimhashIndexing.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import java.net.URL import com.datastax.spark.connector._ import io.gzet.story.model.Article import io.gzet.story.util.SimhashUtils._ import io.gzet.story.util.{HtmlFetcher, Tokenizer} import io.gzet.utils.spark.gdelt.GKGParser import org.apache.lucene.analysis.en.EnglishAnalyzer import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import scala.util.Try object SimhashIndexing extends SimpleConfig with Logging { def main(args: Array[String]) = { val sc = new SparkContext(new SparkConf().setAppName("GDELT Indexing")) if (args.isEmpty) throw new SparkException("usage: <gdeltInputDir>") val gdeltInputDir = args.head val gkgRDD = sc.textFile(gdeltInputDir) .map(GKGParser.toJsonGKGV2) .map(GKGParser.toCaseClass2) val urlRDD = gkgRDD.map(g => g.documentId.getOrElse("NA")) .filter(url => Try(new URL(url)).isSuccess) .distinct() .repartition(partitions) val contentRDD = urlRDD.mapPartitions({ it => val html = new HtmlFetcher(gooseConnectionTimeout, gooseSocketTimeout) it map html.fetch }) val corpusRDD = contentRDD.mapPartitions({ it => val analyzer = new EnglishAnalyzer() it.map(content => (content, Tokenizer.lucene(content.body, analyzer))) }).filter({ case (content, corpus) => corpus.length > minWords }) //CREATE TABLE gzet.articles ( hash int PRIMARY KEY, url text, title text, body text ); corpusRDD.mapValues(_.mkString(" ").simhash).map({ case (content, simhash) => Article(simhash, content.body, content.title, content.url) }).saveToCassandra(cassandraKeyspace, cassandraTable) } }
Example 154
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuilder def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val schema = dataset.schema val assembleFunc = udf { r: Row => SimpleVectorAssembler.assemble(r.toSeq: _*) } val args = $(inputCols).map { c => schema(c).dataType match { case DoubleType => dataset(c) case _: VectorUDT => dataset(c) case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") } } dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol))) } override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) val inputDataTypes = inputColNames.map(name => schema(name).dataType) inputDataTypes.foreach { case _: NumericType | BooleanType => case t if t.isInstanceOf[VectorUDT] => case other => throw new IllegalArgumentException(s"Data type $other is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) } override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra) } object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] { override def load(path: String): SimpleVectorAssembler = super.load(path) def assemble(vv: Any*): Vector = { val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case null => // TODO: output Double.NaN? throw new SparkException("Values to assemble cannot be null.") case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } Vectors.sparse(cur, indices.result(), values.result()).compressed } }
Example 155
Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.n1ql import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.error.QueryExecutionException import com.couchbase.client.java.query.N1qlQuery import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.scalatest._ import com.couchbase.spark._ import com.couchbase.spark.connection.CouchbaseConnection import com.couchbase.spark.sql.N1QLRelation import org.apache.spark.sql.types.{StringType, StructField, StructType} import scala.util.control.NonFatal class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = _ override def beforeAll(): Unit = { spark = SparkSession .builder() .master(master) .appName(appName) .config("spark.couchbase.username", "Administrator") .config("spark.couchbase.password", "password") // Open 2 buckets as tests below rely on it .config("com.couchbase.bucket.default", "") .config("com.couchbase.bucket.travel-sample", "") .getOrCreate() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") { assertThrows[IllegalStateException] { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } } test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .option("bucket", "travel-sample") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } test("N1QL failures should fail the Observable") { try { spark.sparkContext .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default") .collect() .foreach(println) fail() } catch { case e: SparkException => assert (e.getCause.isInstanceOf[QueryExecutionException]) val err = e.getCause.asInstanceOf[QueryExecutionException] assert (err.getMessage == "syntax error - at QUERY") case NonFatal(e) => println(e) fail() } } }
Example 156
Source File: StructTypeToMleap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter import com.truecar.mleap.runtime.types import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.VectorUDT import org.apache.spark.sql.types._ case class StructTypeToMleap(schema: StructType) { def toMleap: types.StructType = { val leapFields = schema.fields.map { field => val sparkType = field.dataType val sparkTypeName = sparkType.typeName val dataType = sparkType match { case _: NumericType | BooleanType => types.DoubleType case _: StringType => types.StringType case _: VectorUDT => types.VectorType case dataType: ArrayType if dataType.elementType == StringType => types.StringArrayType case _ => throw new SparkException(s"unsupported MLeap datatype: $sparkTypeName") } types.StructField(field.name, dataType) } types.StructType(leapFields) } }
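Since the conversion is a pure schema mapping, it can be sketched on any Spark StructType; the field names here are placeholders:

import org.apache.spark.sql.types._

val sparkSchema = StructType(Seq(
  StructField("price", DoubleType),
  StructField("model", StringType)))

// DoubleType maps to types.DoubleType and StringType to types.StringType;
// an unsupported field type would raise the SparkException above.
val mleapSchema = StructTypeToMleap(sparkSchema).toMleap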
Example 157
Source File: StringIndexerModelSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase} import com.opendatagroup.hadrian.errors.PFAUserException import org.apache.spark.SparkException import org.apache.spark.ml.feature.StringIndexer class StringIndexerModelSuite extends SparkFeaturePFASuiteBase[StringIndexerResult] { import spark.implicits._ val df = Seq( (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c") ).toDF("id", "category") val testHandleInvalidDF = Seq( (0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "c") ).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") override val sparkTransformer = indexer.fit(df) val result = sparkTransformer.transform(df) val sparkOutput = result.select(indexer.getOutputCol).toDF() override val input = result.select(indexer.getInputCol).toJSON.collect() override val expectedOutput = sparkOutput.toJSON.collect() // Additional test for handleInvalid test("StringIndexer with handleInvalid=keep") { val sparkTransformer = indexer.setHandleInvalid("keep").fit(df) val result = sparkTransformer.transform(testHandleInvalidDF) val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect() val expectedOutput = result.select(indexer.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StringIndexer with handleInvalid=error") { val sparkTransformer = indexer.setHandleInvalid("error").fit(df) intercept[SparkException] { val result = sparkTransformer.transform(testHandleInvalidDF) result.foreach(_ => Unit) } intercept[PFAUserException] { val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect() // we transform on df here to avoid Spark throwing the error and to ensure we match // the sizes of expected input / output. The error should be thrown before the comparison // would fail val expectedOutput = sparkTransformer.transform(df).select(indexer.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } } } case class StringIndexerResult(categoryIndex: Double) extends Result
Example 158
Source File: LabeledPoint.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import org.apache.spark.SparkException import org.apache.spark.linalg.{NumericParser, Vector, Vectors} import scala.beans.BeanInfo /** * * Class that represents the features and label of a data point. * * @param label Label for this data point. * @param features List of features for this data point. */ @BeanInfo case class LabeledPoint(label: Double, features: Vector) extends Serializable { override def toString: String = { s"($label,$features)" } } object LabeledPoint { /** * Parses a string resulted from `LabeledPoint#toString` into * an [[LabeledPoint]]. * */ def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 159
Source File: PSVectorPool.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.context import com.tencent.angel.ml.math2.utils.RowType import com.tencent.angel.sona.models.PSVector import com.tencent.angel.sona.models.impl.PSVectorImpl import org.apache.spark.SparkException import sun.misc.Cleaner /** * PSVectorPool delegate a memory space on PS servers, * which hold `capacity` number vectors with `numDimensions` dimension. * The dimension of PSVectors in one PSVectorPool is the same. * * A PSVectorPool is like a Angel Matrix. * * @param id PSVectorPool unique id * @param dimension Dimension of vectors * @param capacity Capacity of pool */ class PSVectorPool( val id: Int, val dimension: Long, val capacity: Int, val rowType: RowType) { val cleaners = new java.util.WeakHashMap[PSVector, Cleaner] val bitSet = new java.util.BitSet(capacity) var destroyed = false var size = 0 def allocate(): PSVector = { if (destroyed) { throw new SparkException("This vector pool has been destroyed!") } if (size > math.max(capacity * 0.9, 4)) { System.gc() } tryOnce match { case Some(toReturn) => return toReturn case None => } System.gc() Thread.sleep(100L) // Try again tryOnce match { case Some(toReturn) => toReturn case None => throw new SparkException("This vector pool is full!") } } private def tryOnce: Option[PSVector] = { bitSet.synchronized { if (size < capacity) { val index = bitSet.nextClearBit(0) bitSet.set(index) size += 1 return Some(doCreateOne(index)) } } None } private def doCreateOne(index: Int): PSVector = { val vector = new PSVectorImpl(id, index, dimension, rowType) val task = new CleanTask(id, index) cleaners.put(vector, Cleaner.create(vector, task)) vector } private class CleanTask(poolId: Int, index: Int) extends Runnable { def run(): Unit = { bitSet.synchronized { bitSet.clear(index) size -= 1 } } } def delete(key: PSVector): Unit = { cleaners.remove(key).clean() } def destroy(): Unit = { destroyed = true } } object PSVectorPool { val DEFAULT_POOL_CAPACITY = 10 }
Example 160
Source File: PSVector.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.models

import java.util.concurrent.Future

import scala.collection.Map

import org.apache.spark.SparkException

import com.tencent.angel.ml.math2.vector.Vector
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.psf.get.base.{GetFunc, GetResult}
import com.tencent.angel.ml.matrix.psf.update.base.{UpdateFunc, VoidResult}
import com.tencent.angel.sona.context.PSContext

// Factory methods for allocating sparse PSVectors through the PSContext.
object PSVector {

  def longKeySparse(dim: Long,
                    maxRange: Long,
                    capacity: Int = 20,
                    rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
                    additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dim, capacity, maxRange, rowType, additionalConfiguration)
  }

  def sparse(dimension: Long,
             capacity: Int,
             range: Long,
             rowType: RowType,
             additionalConfiguration: Map[String, String]): PSVector = {
    PSContext.instance().createVector(dimension, rowType, capacity, range, additionalConfiguration)
  }

  def sparse(dimension: Long,
             capacity: Int = 20,
             rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
             additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dimension, capacity, dimension, rowType, additionalConfiguration)
  }
}
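Assuming a PSContext has already been started on the driver, the factory methods above allocate sparse PS vectors; the dimensions and capacity below are illustrative:

import com.tencent.angel.ml.math2.utils.RowType

// Sparse long-key vector over a 2^30 key space with the default capacity of 20 rows.
val vecA = PSVector.sparse(1L << 30)

// Same allocation spelled out through longKeySparse.
val vecB = PSVector.longKeySparse(
  dim = 1L << 30,
  maxRange = 1L << 30,
  capacity = 10,
  rowType = RowType.T_DOUBLE_SPARSE_LONGKEY)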
Example 161
Source File: TextPiperSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{StringType, StructField, StructType} import io.projectglow.Glow import io.projectglow.sql.GlowBaseTest class TextPiperSuite extends GlowBaseTest { override def afterEach(): Unit = { Glow.transform("pipe_cleanup", spark.emptyDataFrame) super.afterEach() } def pipeText(df: DataFrame): DataFrame = { val options = Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""") new PipeTransformer().transform(df, options) } test("text input and output") { val sess = spark import sess.implicits._ val output = pipeText(Seq("hello", "world").toDF()) assert(output.count() == 2) assert(output.schema == StructType(Seq(StructField("text", StringType)))) assert(output.orderBy("text").as[String].collect.toSeq == Seq("hello", "world")) } test("text input requires one column") { val sess = spark import sess.implicits._ val df = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("text input requires string column") { val sess = spark import sess.implicits._ val df = Seq(Seq(5), Seq(6)).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("does not break on null row") { val sess = spark import sess.implicits._ val df = Seq("hello", null, "hello").toDF() val output = pipeText(df) assert(output.count() == 2) assert(output.filter("text = 'hello'").count == 2) } test("command fails") { val sess = spark import sess.implicits._ val df = Seq("hello", "world").toDF() val options = Map( "inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["bash", "-c", "exit 1"]""") val ex = intercept[SparkException] { new PipeTransformer().transform(df, options) } assert(ex.getMessage.contains("Subprocess exited with status 1")) // threads should still be cleaned up eventually { assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDIN_WRITER_THREAD_PREFIX))) assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDERR_READER_THREAD_PREFIX))) } } }
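Outside the test, the same pipe can be driven with the options map shown above; a sketch assuming an existing SparkSession named spark and a cat binary on the PATH:

import spark.implicits._

val df = Seq("hello", "world").toDF()
val options = Map(
  "inputFormatter" -> "text",
  "outputFormatter" -> "text",
  "cmd" -> """["cat", "-"]""")

// Each row is written to the subprocess and its stdout is read back as the result.
val piped = new PipeTransformer().transform(df, options)
piped.show()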
Example 162
Source File: CrailBroadcast.scala From crail-spark-io with Apache License 2.0 | 5 votes |
package org.apache.spark.broadcast import java.io._ import org.apache.spark.storage._ import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable import scala.reflect.ClassTag import scala.util.control.NonFatal private[spark] class CrailBroadcast[T: ClassTag](obj: T, id: Long) extends Broadcast[T](id) with Serializable { @transient private lazy val _value: T = readBroadcastBlock() private val broadcastId = BroadcastBlockId(id) writeBlocks(obj) override protected def getValue() = { _value } override protected def doUnpersist(blocking: Boolean): Unit = { logWarning(" called doUnpersist on broadcastId: " + id + " (NYI)") } override protected def doDestroy(blocking: Boolean): Unit = { val obj = x.asInstanceOf[T] if(CrailBroadcast.useLocalCache) { CrailBroadcast.broadcastCache(id) = Some(x) } else { SparkEnv.get.blockManager.putSingle(broadcastId, obj, StorageLevel.MEMORY_ONLY, tellMaster = false) } obj case None => throw new SparkException(s"Failed to get broadcast " + broadcastId) } } } } } private object CrailBroadcast { //FIXME: (atr) I am not completely sure about if this gives us the best performance. val broadcastCache:mutable.HashMap[Long, Option[Any]] = new mutable.HashMap[Long, Option[Any]] private val useLocalCache = false def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = { this.synchronized { if(useLocalCache) { broadcastCache.remove(id) } else { SparkEnv.get.blockManager.master.removeBroadcast(id, removeFromDriver, blocking) SparkEnv.get.blockManager.removeBroadcast(id, false) } } } def cleanCache(): Unit = { this.synchronized { broadcastCache.clear() } } } object Utils { def tryOrIOException[T](block: => T): T = { try { block } catch { case e: IOException => throw e case NonFatal(e) => throw new IOException(e) } } }
Example 163
Source File: IOCommon.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.common import java.io.{File, FileInputStream, IOException, InputStreamReader} import java.util.Properties import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.rdd.RDD import org.apache.spark.{SparkContext, SparkException} import scala.collection.JavaConversions._ import scala.collection.mutable.HashMap import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag class IOCommon(val sc:SparkContext) { def load[T:ClassTag:TypeTag](filename:String, force_format:Option[String]=None) = { val input_format = force_format.getOrElse( IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text")) input_format match { case "Text" => sc.textFile(filename) case "Sequence" => sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString) case _ => throw new UnsupportedOperationException(s"Unknown inpout format: $input_format") } } def save(filename:String, data:RDD[_], prefix:String) = { val output_format = IOCommon.getProperty(prefix).getOrElse("Text") val output_format_codec = loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec")) output_format match { case "Text" => if (output_format_codec.isEmpty) data.saveAsTextFile(filename) else data.saveAsTextFile(filename, output_format_codec.get) case "Sequence" => val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString))) if (output_format_codec.isEmpty) { sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename) } else { sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename, output_format_codec.get) } case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format") } } def save(filename:String, data:RDD[_]):Unit = save(filename, data, "sparkbench.outputformat") private def loadClassByName[T](name:Option[String]) = { if (!name.isEmpty) Some(Class.forName(name.get) .newInstance.asInstanceOf[T].getClass) else None } private def callMethod[T, R](obj:T, method_name:String) = obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R] } object IOCommon { private val sparkbench_conf: HashMap[String, String] = getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES")) def getPropertiesFromFile(filenames: String): HashMap[String, String] = { val result = new HashMap[String, String] filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename => val file = new File(filename) require(file.exists, s"Properties file $file does not exist") require(file.isFile, s"Properties file $file is not a normal file") val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8") try { val properties = new Properties() properties.load(inReader) result ++= properties.stringPropertyNames() .map(k => (k, properties(k).trim)).toMap } catch { case e: IOException => val message = s"Failed when loading Sparkbench properties file $file" throw new SparkException(message, e) } finally { inReader.close() } } result.filter{case (key, value) => value.toLowerCase != "none"} } def getProperty(key:String):Option[String] = sparkbench_conf.get(key) def dumpProperties(): Unit = sparkbench_conf .foreach{case (key, value)=> println(s"$key\t\t$value")} }
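A typical round trip with the helper above loads a text or sequence file and writes it back out; the HDFS paths are placeholders and the output format is resolved from the sparkbench properties:

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("io-common-example"))
val io = new IOCommon(sc)

// Force the Text input format instead of reading sparkbench.inputformat.
val lines = io.load[String]("hdfs:///tmp/io-common/input", force_format = Some("Text"))

// Output format and codec come from the "sparkbench.outputformat" properties.
io.save("hdfs:///tmp/io-common/output", lines)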
Example 164
Source File: SQLServerTab.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.server.SQLServerListener import org.apache.spark.sql.server.ui.SQLServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} case class SQLServerTab( sparkContext: SparkContext, listener: SQLServerListener) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" private val parent = getSparkUI(sparkContext) attachPage(new SQLServerPage(this)) attachPage(new SQLServerSessionPage(this)) parent.attachTab(this) def detach() { parent.detachTab(this) } } object SQLServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 165
Source File: DataFrameToFileWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import io.deepsense.commons.utils.LoggerForCallerClass import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.WriteFileException import io.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import io.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import io.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import io.deepsense.deeplang.exceptions.DeepLangException import io.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
Example 166
Source File: ThriftServerTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 167
Source File: KryoSerializerResizableOutputSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.SparkContext
import org.apache.spark.LocalSparkContext
import org.apache.spark.SparkException

class KryoSerializerResizableOutputSuite extends SparkFunSuite {

  // trial and error showed this will not serialize with 1mb buffer
  val x = (1 to 400000).toArray

  // Kryo without a resizable output buffer should fail on a large array
  test("kryo without resizable output buffer should fail on large array") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer", "1m")
    conf.set("spark.kryoserializer.buffer.max", "1m")
    val sc = new SparkContext("local", "test", conf)
    intercept[SparkException](sc.parallelize(x).collect())
    LocalSparkContext.stop(sc)
  }

  // Kryo with a resizable output buffer should succeed on a large array
  test("kryo with resizable output buffer should succeed on large array") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer", "1m")
    conf.set("spark.kryoserializer.buffer.max", "2m")
    val sc = new SparkContext("local", "test", conf)
    assert(sc.parallelize(x).collect() === x)
    LocalSparkContext.stop(sc)
  }
}
Example 168
Source File: ProactiveClosureSerializationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString
  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  // Actions should proactively throw the expected serialization exceptions
  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation
    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 169
Source File: CoarseGrainedSchedulerBackendSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite}
import org.apache.spark.util.{SerializableBuffer, AkkaUtils}

class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext {

  // A task serialized to something larger than the Akka frame size should be rejected
  ignore("serialized task larger than akka frame size") {
    val conf = new SparkConf
    // Maximum size (in MB) of messages exchanged between the driver and executors;
    // the larger the value, the larger the results the driver can accept
    conf.set("spark.akka.frameSize", "1")
    // Default parallelism
    conf.set("spark.default.parallelism", "1")
    //sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf)
    sc = new SparkContext("local[*]", "test", conf)
    // Maximum Akka frame size in bytes (1 MB here; 10 MB by default)
    val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf)
    // Allocate a serializable buffer twice the frame size so the task is too large;
    // ByteBuffer.allocate reserves the backing buffer before any reads or writes
    val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize))
    val larger = sc.parallelize(Seq(buffer))
    val thrown = intercept[SparkException] {
      larger.collect()
    }
    // The exception should recommend using broadcast variables for large values
    assert(thrown.getMessage.contains("using broadcast variables for large values"))
    val smaller = sc.parallelize(1 to 4).collect()
    assert(smaller.size === 4)
  }
}
Example 170
Source File: MutableURLClassLoaderSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.net.URLClassLoader

import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils}

class MutableURLClassLoaderSuite extends SparkFunSuite {

  val urls2 = List(TestUtils.createJarWithClasses(
      classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"),
      toStringValue = "2")).toArray
  val urls = List(TestUtils.createJarWithClasses(
      classNames = Seq("FakeClass1"),
      classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent
      toStringValue = "1",
      classpathUrls = urls2)).toArray

  // Child-first loading resolves the class from the child loader
  test("child first") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass2").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "1")
    val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance()
    assert(fakeClass.getClass === fakeClass2.getClass)
  }

  // Parent-first loading resolves the class from the parent loader
  test("parent first") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new MutableURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass1").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "2")
    val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance()
    assert(fakeClass.getClass === fakeClass2.getClass)
  }

  // Child-first loading can fall back to the parent
  test("child first can fall back") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass3").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "2")
  }

  // Child-first loading can still fail when the class exists nowhere
  test("child first can fail") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    intercept[java.lang.ClassNotFoundException] {
      classLoader.loadClass("FakeClassDoesNotExist").newInstance()
    }
  }

  // The driver sets the context class loader when running in local mode
  test("driver sets context class loader in local mode") {
    // Test the case where the driver program sets a context classloader and then runs a job
    // in local mode. This is what happens when ./spark-submit is called with "local" as the
    // master.
    val original = Thread.currentThread().getContextClassLoader

    val className = "ClassForDriverTest"
    val jar = TestUtils.createJarWithClasses(Seq(className))
    val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader)
    Thread.currentThread().setContextClassLoader(contextLoader)

    val sc = new SparkContext("local", "driverLoaderTest")

    try {
      sc.makeRDD(1 to 5, 2).mapPartitions { x =>
        val loader = Thread.currentThread().getContextClassLoader
        // scalastyle:off classforname
        Class.forName(className, true, loader).newInstance()
        // scalastyle:on classforname
        Seq().iterator
      }.count()
    } catch {
      case e: SparkException if e.getMessage.contains("ClassNotFoundException") =>
        fail("Local executor could not find class", e)
      case t: Throwable => fail("Unexpected exception ", t)
    }
    sc.stop()
    Thread.currentThread().setContextClassLoader(original)
  }
}
Example 171
Source File: RWrappers.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeRegressorWrapper" => DecisionTreeRegressorWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeClassifierWrapper" => DecisionTreeClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case "org.apache.spark.ml.r.BisectingKMeansWrapper" => BisectingKMeansWrapper.load(path) case "org.apache.spark.ml.r.LinearSVCWrapper" => LinearSVCWrapper.load(path) case "org.apache.spark.ml.r.FPGrowthWrapper" => FPGrowthWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 172
Source File: NumericParser.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 173
Source File: LabeledPoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser

// Represents the label and feature vector of a single data point.
@Since("0.8.0")
@BeanInfo
case class LabeledPoint(label: Double, features: Vector) {
  override def toString: String = {
    s"($label,$features)"
  }
}

object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 174
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import java.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.stat.test.ChiSqTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSquareTestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("test DataFrame of labeled points") { // labels: 1.0 (2 / 6), 0.0 (4 / 6) // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) val data = Seq( LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) assert(degreesOfFreedom === Array(2, 3)) assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) } } test("large number of features (SPARK-3087)") { // Test that the right number of results is returned val numCols = 1001 val sparseData = Array( LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) val df = spark.createDataFrame(sparseData) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues.size === numCols) assert(degreesOfFreedom.length === numCols) assert(statistics.size === numCols) assert(pValues(1000) !== null) // SPARK-3087 } test("fail on continuous features or labels") { val tooManyCategories: Int = 100000 assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") val random = new Random(11L) val continuousLabel = Seq.fill(tooManyCategories)( LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) withClue("ChiSquare should throw an exception when given a continuous-valued label") { intercept[SparkException] { val df = spark.createDataFrame(continuousLabel) ChiSquareTest.test(df, "features", "label") } } val continuousFeature = Seq.fill(tooManyCategories)( LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) withClue("ChiSquare should throw an exception when given continuous-valued features") { intercept[SparkException] { val df = spark.createDataFrame(continuousFeature) ChiSquareTest.test(df, "features", "label") } } } }
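A standalone sketch of the API exercised by the suite, reusing the same toy data; it only assumes a local SparkSession and prints the three result columns.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

object ChiSquareDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("chi-square-demo").getOrCreate()
    import spark.implicits._

    val data = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (0.0, Vectors.dense(3.5, 40.0)),
      (1.0, Vectors.dense(3.5, 40.0))).toDF("label", "features")

    // One p-value, degrees-of-freedom entry and statistic per feature column.
    val result = ChiSquareTest.test(data, "features", "label").head()
    println(s"pValues = ${result.getAs[Vector](0)}")
    println(s"degreesOfFreedom = ${result.getSeq[Int](1)}")
    println(s"statistics = ${result.getAs[Vector](2)}")

    spark.stop()
  }
}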
Example 175
Source File: NumericParserSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 176
Source File: InitContainerConfigOrchestrator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s.submit.steps.initcontainer import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ private[spark] class InitContainerConfigOrchestrator( sparkJars: Seq[String], sparkFiles: Seq[String], jarsDownloadPath: String, filesDownloadPath: String, imagePullPolicy: String, configMapName: String, configMapKey: String, sparkConf: SparkConf) { private val initContainerImage = sparkConf .get(INIT_CONTAINER_IMAGE) .getOrElse(throw new SparkException( "Must specify the init-container image when there are remote dependencies")) def getAllConfigurationSteps: Seq[InitContainerConfigurationStep] = { val initContainerBootstrap = new InitContainerBootstrap( initContainerImage, imagePullPolicy, jarsDownloadPath, filesDownloadPath, configMapName, configMapKey, SPARK_POD_DRIVER_ROLE, sparkConf) val baseStep = new BasicInitContainerConfigurationStep( sparkJars, sparkFiles, jarsDownloadPath, filesDownloadPath, initContainerBootstrap) val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( sparkConf, KUBERNETES_DRIVER_SECRETS_PREFIX) // Mount user-specified driver secrets also into the driver's init-container. The // init-container may need credentials in the secrets to be able to download remote // dependencies. The driver's main container and its init-container share the secrets // because the init-container is sort of an implementation details and this sharing // avoids introducing a dedicated configuration property just for the init-container. val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { Seq(new InitContainerMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) } else { Nil } Seq(baseStep) ++ mountSecretsStep } }
Example 177
Source File: InitContainerBootstrap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ import io.fabric8.kubernetes.api.model.{ContainerBuilder, EmptyDirVolumeSource, EnvVarBuilder, PodBuilder, VolumeMount, VolumeMountBuilder} import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ def bootstrapInitContainer( original: PodWithDetachedInitContainer): PodWithDetachedInitContainer = { val sharedVolumeMounts = Seq[VolumeMount]( new VolumeMountBuilder() .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) .withMountPath(jarsDownloadPath) .build(), new VolumeMountBuilder() .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) .withMountPath(filesDownloadPath) .build()) val customEnvVarKeyPrefix = sparkRole match { case SPARK_POD_DRIVER_ROLE => KUBERNETES_DRIVER_ENV_KEY case SPARK_POD_EXECUTOR_ROLE => "spark.executorEnv." case _ => throw new SparkException(s"$sparkRole is not a valid Spark pod role") } val customEnvVars = sparkConf.getAllWithPrefix(customEnvVarKeyPrefix).toSeq.map { case (key, value) => new EnvVarBuilder() .withName(key) .withValue(value) .build() } val initContainer = new ContainerBuilder(original.initContainer) .withName("spark-init") .withImage(initContainerImage) .withImagePullPolicy(imagePullPolicy) .addAllToEnv(customEnvVars.asJava) .addNewVolumeMount() .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) .endVolumeMount() .addToVolumeMounts(sharedVolumeMounts: _*) .addToArgs("init") .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) .build() val podWithBasicVolumes = new PodBuilder(original.pod) .editSpec() .addNewVolume() .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) .withNewConfigMap() .withName(configMapName) .addNewItem() .withKey(configMapKey) .withPath(INIT_CONTAINER_PROPERTIES_FILE_NAME) .endItem() .endConfigMap() .endVolume() .addNewVolume() .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) .withEmptyDir(new EmptyDirVolumeSource()) .endVolume() .addNewVolume() .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) .withEmptyDir(new EmptyDirVolumeSource()) .endVolume() .endSpec() .build() val mainContainer = new ContainerBuilder(original.mainContainer) .addToVolumeMounts(sharedVolumeMounts: _*) .addNewEnv() .withName(ENV_MOUNTED_FILES_DIR) .withValue(filesDownloadPath) .endEnv() .build() PodWithDetachedInitContainer( podWithBasicVolumes, initContainer, mainContainer) } }
Example 178
Source File: MesosProtoUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import scala.collection.JavaConverters._ import org.apache.mesos.Protos import org.apache.spark.SparkException import org.apache.spark.internal.Logging object MesosProtoUtils extends Logging { def mesosLabels(labelsStr: String): Protos.Labels.Builder = { val labels: Seq[Protos.Label] = if (labelsStr == "") { Seq() } else { labelsStr.split("""(?<!\\),""").toSeq.map { labelStr => val parts = labelStr.split("""(?<!\\):""") if (parts.length != 2) { throw new SparkException(s"Malformed label: ${labelStr}") } val cleanedParts = parts .map(part => part.replaceAll("""\\,""", ",")) .map(part => part.replaceAll("""\\:""", ":")) Protos.Label.newBuilder() .setKey(cleanedParts(0)) .setValue(cleanedParts(1)) .build() } } Protos.Labels.newBuilder().addAllLabels(labels.asJava) } }
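The label string format ("k1:v1,k2:v2", with "\," and "\:" escaping literal delimiters) is easy to exercise without a Mesos dependency. Below is a standalone sketch of the same escape-aware parsing, with illustrative label values; it returns plain key/value pairs instead of Protos.Labels.

object LabelParseDemo {
  def parseLabels(labelsStr: String): Seq[(String, String)] = {
    if (labelsStr.isEmpty) Seq.empty
    else labelsStr.split("""(?<!\\),""").toSeq.map { labelStr =>
      val parts = labelStr.split("""(?<!\\):""")
      require(parts.length == 2, s"Malformed label: $labelStr")
      // Unescape the delimiters after splitting.
      val cleaned = parts.map(_.replaceAll("""\\,""", ",")).map(_.replaceAll("""\\:""", ":"))
      (cleaned(0), cleaned(1))
    }
  }

  def main(args: Array[String]): Unit = {
    // A literal comma inside a value is escaped with a backslash.
    println(parseLabels("""env:prod,owner:data\,platform"""))
    // -> List((env,prod), (owner,data,platform))
  }
}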
Example 179
Source File: YarnClusterManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 180
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
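The command above backs Hive-style directory inserts issued through SQL. A minimal sketch, assuming a Hive-enabled SparkSession and a hypothetical /tmp output path; a failure during the final move surfaces as the "Failed inserting overwrite directory" SparkException.

import org.apache.spark.sql.SparkSession

object InsertHiveDirDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("insert-hive-dir-demo")
      .enableHiveSupport()
      .getOrCreate()

    spark.range(10).createOrReplaceTempView("nums")

    // Writes the query result into the directory through a Hive SerDe.
    spark.sql(
      """INSERT OVERWRITE LOCAL DIRECTORY '/tmp/hive_dir_demo'
        |ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        |SELECT id, id * 2 FROM nums""".stripMargin)

    spark.stop()
  }
}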
Example 181
Source File: CommitFailureTestRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 182
Source File: RpcAddressSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") {// host and port val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") {// parse from a Spark URL val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") {// a malformed Spark URL val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234")// contains a space in the middle } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") {// a Spark URL with an invalid scheme val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") {// convert back to the Spark URL format val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 183
Source File: UDTRegistration.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
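A minimal registration sketch. Two caveats, both assumptions of the sketch rather than facts from the example: UDTRegistration and UserDefinedType are package-private to org.apache.spark in this Spark version, so the code is placed under a spark subpackage, and PointUDT below is a stub whose serialize/deserialize are not wired to a real catalyst representation.

package org.apache.spark.demo

import org.apache.spark.sql.types._

class Point(val x: Double, val y: Double)

// Placeholder UDT, in the spirit of the test suite further down this page.
class PointUDT extends UserDefinedType[Point] {
  override def sqlType: DataType = ArrayType(DoubleType, containsNull = false)
  override def serialize(p: Point): Any = Array(p.x, p.y)
  override def deserialize(datum: Any): Point = new Point(0.0, 0.0)  // stub only
  override def userClass: Class[Point] = classOf[Point]
}

object UDTRegistrationDemo {
  def main(args: Array[String]): Unit = {
    UDTRegistration.register(classOf[Point].getName, classOf[PointUDT].getName)
    println(UDTRegistration.exists(classOf[Point].getName))     // true
    println(UDTRegistration.getUDTFor(classOf[Point].getName))  // Some(class ...PointUDT)
    // Registering a class that is not a UserDefinedType makes getUDTFor throw SparkException.
  }
}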
Example 184
Source File: ScalaUDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import java.util.Locale import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase(Locale.ROOT), StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) assert(ctx.inlinedMutableStates.isEmpty) } }
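A user-facing sketch of the behaviour the suite checks: a Scala UDF that NPEs on a null input. When the query runs, the failure typically reaches the driver wrapped in a SparkException whose message chain mentions "Failed to execute user defined function"; the column values are illustrative.

import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object UdfNpeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udf-npe-demo").getOrCreate()
    import spark.implicits._

    // String arguments are passed through as-is, so a null input NPEs inside the UDF.
    val lower = udf((s: String) => s.toLowerCase)
    val df = Seq("A", null, "b").toDF("s")

    try {
      df.select(lower(col("s"))).collect()
    } catch {
      case e: SparkException => println(e.getMessage)
    }
    spark.stop()
  }
}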
Example 185
Source File: FailureSafeParser.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
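A user-facing sketch of the fail-fast branch above: reading malformed JSON with mode=FAILFAST. The records and schema are illustrative; with the default PERMISSIVE mode the bad record would simply be nulled out (or routed to the corrupt-record column).

import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructType}

object FailFastDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("failfast-demo").getOrCreate()
    import spark.implicits._

    val records = Seq("""{"a": 1}""", """{"a":""").toDS()   // second record is malformed
    val schema = new StructType().add("a", IntegerType)

    try {
      spark.read.schema(schema).option("mode", "FAILFAST").json(records).collect()
    } catch {
      case e: SparkException => println(e.getMessage)
    }
    spark.stop()
  }
}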
Example 186
Source File: InsertIntoDataSourceDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ case class InsertIntoDataSourceDirCommand( storage: CatalogStorageFormat, provider: String, query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { override protected def innerChildren: Seq[LogicalPlan] = query :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty, "Directory path is required") assert(provider.nonEmpty, "Data source is required") // Create the relation based on the input logical plan: `query`. val pathOption = storage.locationUri.map("path" -> CatalogUtils.URIToString(_)) val dataSource = DataSource( sparkSession, className = provider, options = storage.properties ++ pathOption, catalogTable = None) val isFileFormat = classOf[FileFormat].isAssignableFrom(dataSource.providingClass) if (!isFileFormat) { throw new SparkException( "Only Data Sources providing FileFormat are supported: " + dataSource.providingClass) } val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => logError(s"Failed to write to directory " + storage.locationUri.toString, ex) throw ex } Seq.empty[Row] } }
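The data-source flavour of the directory insert is driven by the USING clause. A minimal sketch with a hypothetical output directory; a provider that is not a FileFormat (for example jdbc) is rejected with the SparkException shown above.

import org.apache.spark.sql.SparkSession

object InsertDataSourceDirDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("insert-datasource-dir-demo")
      .getOrCreate()

    spark.range(5).createOrReplaceTempView("nums")

    // Writes the query result as parquet files into the directory.
    spark.sql("INSERT OVERWRITE DIRECTORY '/tmp/nums_parquet' USING parquet SELECT id FROM nums")

    spark.stop()
  }
}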
Example 187
Source File: UDTRegistrationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 188
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(sparkContext.hadoopConfiguration) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( sparkContext.hadoopConfiguration, fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[java.io.IOException] { testReadFooters(false) } assert(exception.getMessage().contains("Could not read footer for file")) } }
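A user-facing sketch of the same flag via spark.sql.files.ignoreCorruptFiles: write one parquet directory and one non-parquet directory under a hypothetical /tmp base path, then read both as parquet. The exact exception type can differ by code path, so this is a sketch rather than a test.

import org.apache.spark.sql.SparkSession

object IgnoreCorruptFilesDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("corrupt-files-demo").getOrCreate()
    val base = "/tmp/corrupt_files_demo"   // hypothetical scratch directory

    spark.range(3).write.mode("overwrite").parquet(s"$base/good")
    spark.range(3).write.mode("overwrite").json(s"$base/bad")    // not parquet

    // With the flag off, reading both directories as parquet fails while footers are read.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "false")
    try {
      spark.read.option("mergeSchema", "true").parquet(s"$base/good", s"$base/bad").count()
    } catch {
      case e: Exception => println(s"read failed: ${e.getMessage}")
    }

    // With the flag on, unreadable files are skipped with a warning.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
    println(spark.read.option("mergeSchema", "true").parquet(s"$base/good", s"$base/bad").count())

    spark.stop()
  }
}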
Example 189
Source File: UnionDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
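The user-facing entry point for this class is StreamingContext.union (or DStream.union). A minimal sketch driven by queue streams, which keeps it runnable without an external source; all parents must share the same context and slide duration, as the require calls above enforce.

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object UnionDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("union-demo")
    val ssc = new StreamingContext(conf, Seconds(1))

    val q1 = Queue(ssc.sparkContext.makeRDD(1 to 3))
    val q2 = Queue(ssc.sparkContext.makeRDD(4 to 6))
    val s1 = ssc.queueStream(q1)
    val s2 = ssc.queueStream(q2)

    // Builds a UnionDStream under the hood and prints 1..6 in the first batch.
    ssc.union(Seq(s1, s2)).print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(3000)
    ssc.stop()
  }
}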
Example 190
Source File: TransformedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { // For each parent stream, get its RDD for the current batch time. val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
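The user-facing entry point is DStream.transform, which builds this class under the hood. A minimal sketch using a queue stream; note the empty-RDD fallback, since returning null from the transform function triggers the SparkException above.

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object TransformDemo {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("transform-demo"), Seconds(1))

    val queue = Queue(ssc.sparkContext.makeRDD(Seq("hello streaming", "hello spark")))
    val lines = ssc.queueStream(queue)

    // Never return null from the function; use an empty RDD to represent "no output".
    val words = lines.transform { (rdd: RDD[String], time: Time) =>
      if (rdd.isEmpty()) rdd.sparkContext.emptyRDD[String]
      else rdd.flatMap(_.split(" "))
    }
    words.print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(3000)
    ssc.stop()
  }
}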
Example 191
Source File: StreamingTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) parent.setStreamingJobProgressListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 192
Source File: StagesResource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.util.{List => JList} import javax.ws.rs._ import javax.ws.rs.core.MediaType import org.apache.spark.SparkException import org.apache.spark.scheduler.StageInfo import org.apache.spark.status.api.v1.StageStatus._ import org.apache.spark.status.api.v1.TaskSorting._ import org.apache.spark.ui.SparkUI @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class StagesResource extends BaseAppResource { @GET def stageList(@QueryParam("status") statuses: JList[StageStatus]): Seq[StageData] = { withUI(_.store.stageList(statuses)) } @GET @Path("{stageId: \\d+}") def stageData( @PathParam("stageId") stageId: Int, @QueryParam("details") @DefaultValue("true") details: Boolean): Seq[StageData] = { withUI { ui => val ret = ui.store.stageData(stageId, details = details) if (ret.nonEmpty) { ret } else { throw new NotFoundException(s"unknown stage: $stageId") } } } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}") def oneAttemptData( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @QueryParam("details") @DefaultValue("true") details: Boolean): StageData = withUI { ui => try { ui.store.stageAttempt(stageId, stageAttemptId, details = details) } catch { case _: NoSuchElementException => // Change the message depending on whether there are any attempts for the requested stage. val all = ui.store.stageData(stageId) val msg = if (all.nonEmpty) { val ids = all.map(_.attemptId) s"unknown attempt for stage $stageId. Found attempts: [${ids.mkString(",")}]" } else { s"unknown stage: $stageId" } throw new NotFoundException(msg) } } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}/taskSummary") def taskSummary( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @DefaultValue("0.05,0.25,0.5,0.75,0.95") @QueryParam("quantiles") quantileString: String) : TaskMetricDistributions = withUI { ui => val quantiles = quantileString.split(",").map { s => try { s.toDouble } catch { case nfe: NumberFormatException => throw new BadParameterException("quantiles", "double", s) } } ui.store.taskSummary(stageId, stageAttemptId, quantiles).getOrElse( throw new NotFoundException(s"No tasks reported metrics for $stageId / $stageAttemptId yet.")) } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}/taskList") def taskList( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @DefaultValue("0") @QueryParam("offset") offset: Int, @DefaultValue("20") @QueryParam("length") length: Int, @DefaultValue("ID") @QueryParam("sortBy") sortBy: TaskSorting): Seq[TaskData] = { withUI(_.store.taskList(stageId, stageAttemptId, offset, length, sortBy)) } }
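The resource above is served under the application's REST API root. A minimal client-side sketch; the UI address and application id are hypothetical placeholders (the id can be read from sc.applicationId on a running application).

import scala.io.Source

object StagesRestDemo {
  def main(args: Array[String]): Unit = {
    val uiBase = "http://localhost:4040/api/v1"   // hypothetical driver UI address
    val appId = "local-1530000000000"             // hypothetical application id

    // Maps to stageList(); append /{stageId} or /{stageId}/{attemptId}/taskSummary
    // to reach the other endpoints defined above.
    val stagesJson = Source.fromURL(s"$uiBase/applications/$appId/stages").mkString
    println(stagesJson.take(500))
  }
}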
Example 193
Source File: RpcEndpointAddress.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(rpcAddress: RpcAddress, name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
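RpcEndpointAddress is private to Spark, so the standalone sketch below mirrors the core of the same java.net.URI-based validation of the spark://name@host:port form; the endpoint name and address are illustrative.

import java.net.URI

object SparkUrlParseDemo {
  def parse(sparkUrl: String): (String, Int, String) = {
    val uri = new URI(sparkUrl)
    val (host, port, name) = (uri.getHost, uri.getPort, uri.getUserInfo)
    require(uri.getScheme == "spark" && host != null && port >= 0 && name != null,
      s"Invalid Spark URL: $sparkUrl")
    (host, port, name)
  }

  def main(args: Array[String]): Unit = {
    // -> (192.168.1.10,7077,CoarseGrainedScheduler)
    println(parse("spark://CoarseGrainedScheduler@192.168.1.10:7077"))
  }
}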
Example 194
Source File: RUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 195
Source File: RpcAddressSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 196
Source File: KryoSerializerResizableOutputSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext._ import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") withSpark(new SparkContext("local", "test", conf)) { sc => intercept[SparkException](sc.parallelize(x).collect()) } } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") withSpark(new SparkContext("local", "test", conf)) { sc => assert(sc.parallelize(x).collect() === x) } } }
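The same configuration knobs from application code: a minimal sketch that collects a record larger than the initial 1m Kryo buffer after raising spark.kryoserializer.buffer.max (the 64m ceiling is just an illustrative value).

import org.apache.spark.{SparkConf, SparkContext}

object KryoBufferDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("kryo-buffer-demo")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryoserializer.buffer", "1m")
      .set("spark.kryoserializer.buffer.max", "64m")  // leaving this at 1m reproduces the failure

    val sc = new SparkContext(conf)
    try {
      val big = (1 to 400000).toArray
      // With buffer.max pinned to 1m this collect fails with a SparkException (buffer overflow);
      // with a larger ceiling the buffer resizes and the job succeeds.
      println(sc.parallelize(Seq(big)).collect().head.length)
    } finally {
      sc.stop()
    }
  }
}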
Example 197
Source File: ProactiveClosureSerializationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
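A minimal, self-contained sketch of the behaviour under test: a transformation whose closure drags in a non-serializable object is rejected eagerly, before any task runs, with a "Task not serializable" SparkException.

import org.apache.spark.{SparkConf, SparkContext, SparkException}

class Unserializable {                 // deliberately does not extend Serializable
  def op(x: Int): Int = x + 1
}

object ClosureDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("closure-demo"))
    val helper = new Unserializable

    try {
      // The closure captures `helper`, so the ClosureCleaner rejects it at map() time.
      sc.parallelize(1 to 10).map(helper.op).count()
    } catch {
      case e: SparkException => println(e.getMessage)   // mentions "Task not serializable"
    } finally {
      sc.stop()
    }
  }
}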