org.apache.spark.SparkException Scala Examples
The following examples show how to use org.apache.spark.SparkException.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: RWrappers.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 2
Source File: HDFSCredentialProvider.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).map { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val t = creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .head val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 3
Source File: RUtils.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 4
Source File: MesosClusterManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 5
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.unsafe.hash.Murmur3_x86_32._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[spark] def murmur3Hash(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } }
Example 6
Source File: NumericParser.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 7
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
Example 8
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 9
Source File: NumericParserSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 10
Source File: CachedKafkaConsumer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import java.{util => ju} import org.apache.kafka.clients.consumer.{ConsumerConfig, ConsumerRecord, KafkaConsumer} import org.apache.kafka.common.TopicPartition import org.apache.spark.{SparkEnv, SparkException, TaskContext} import org.apache.spark.internal.Logging def getOrCreate( topic: String, partition: Int, kafkaParams: ju.Map[String, Object]): CachedKafkaConsumer = synchronized { val groupId = kafkaParams.get(ConsumerConfig.GROUP_ID_CONFIG).asInstanceOf[String] val topicPartition = new TopicPartition(topic, partition) val key = CacheKey(groupId, topicPartition) // If this is reattempt at running the task, then invalidate cache and start with // a new consumer if (TaskContext.get != null && TaskContext.get.attemptNumber > 1) { cache.remove(key) new CachedKafkaConsumer(topicPartition, kafkaParams) } else { if (!cache.containsKey(key)) { cache.put(key, new CachedKafkaConsumer(topicPartition, kafkaParams)) } cache.get(key) } } }
Example 11
Source File: CommitFailureTestRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 12
Source File: ThriftServerTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 13
Source File: UDTRegistration.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
Example 14
Source File: ScalaUDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
Example 15
Source File: UDTRegistrationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 16
Source File: YarnClusterManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 17
Source File: YarnClientSchedulerBackend.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import scala.collection.mutable.ArrayBuffer import org.apache.hadoop.yarn.api.records.YarnApplicationState import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.deploy.yarn.{Client, ClientArguments, YarnSparkHadoopUtil} import org.apache.spark.internal.Logging import org.apache.spark.launcher.SparkAppHandle import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class YarnClientSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext) extends YarnSchedulerBackend(scheduler, sc) with Logging { private var client: Client = null private var monitorThread: MonitorThread = null override def stop() { assert(client != null, "Attempted to stop this scheduler before starting it!") if (monitorThread != null) { monitorThread.stopMonitor() } // Report a final state to the launcher if one is connected. This is needed since in client // mode this backend doesn't let the app monitor loop run to completion, so it does not report // the final state itself. // // Note: there's not enough information at this point to provide a better final state, // so assume the application was successful. client.reportLauncherState(SparkAppHandle.State.FINISHED) super.stop() YarnSparkHadoopUtil.get.stopCredentialUpdater() client.stop() logInfo("Stopped") } }
Example 18
Source File: HDFSCredentialProviderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:[email protected]") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:[email protected]") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 19
Source File: UnionDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 20
Source File: TransformedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 21
Source File: StreamingTab.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 22
Source File: RpcEndpointAddress.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
Example 23
Source File: RpcTimeout.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import java.util.concurrent.TimeoutException import scala.concurrent.{Await, Future} import scala.concurrent.duration._ import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils def apply(conf: SparkConf, timeoutPropList: Seq[String], defaultValue: String): RpcTimeout = { require(timeoutPropList.nonEmpty) // Find the first set property or use the default value with the first property val itr = timeoutPropList.iterator var foundProp: Option[(String, String)] = None while (itr.hasNext && foundProp.isEmpty) { val propKey = itr.next() conf.getOption(propKey).foreach { prop => foundProp = Some(propKey, prop) } } val finalProp = foundProp.getOrElse(timeoutPropList.head, defaultValue) val timeout = { Utils.timeStringAsSeconds(finalProp._2).seconds } new RpcTimeout(timeout, finalProp._1) } }
Example 24
Source File: RpcEndpointRef.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import scala.concurrent.Future import scala.reflect.ClassTag import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.util.RpcUtils def askWithRetry[T: ClassTag](message: Any, timeout: RpcTimeout): T = { // TODO: Consider removing multiple attempts var attempts = 0 var lastException: Exception = null while (attempts < maxRetries) { attempts += 1 try { val future = ask[T](message, timeout) val result = timeout.awaitResult(future) if (result == null) { throw new SparkException("RpcEndpoint returned null") } return result } catch { case ie: InterruptedException => throw ie case e: Exception => lastException = e logWarning(s"Error sending message [message = $message] in $attempts attempts", e) } if (attempts < maxRetries) { Thread.sleep(retryWaitMs) } } throw new SparkException( s"Error sending message [message = $message]", lastException) } }
Example 25
Source File: ApplicationHistoryProvider.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.history import java.util.zip.ZipOutputStream import scala.xml.Node import org.apache.spark.SparkException import org.apache.spark.ui.SparkUI private[spark] case class ApplicationAttemptInfo( attemptId: Option[String], startTime: Long, endTime: Long, lastUpdated: Long, sparkUser: String, completed: Boolean = false) private[spark] case class ApplicationHistoryInfo( id: String, name: String, attempts: List[ApplicationAttemptInfo]) { def getEmptyListingHtml(): Seq[Node] = Seq.empty }
Example 26
Source File: RpcAddressSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 27
Source File: KryoSerializerResizableOutputSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 28
Source File: ProactiveClosureSerializationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 29
Source File: CoarseGrainedSchedulerBackendSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { ignore("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 30
Source File: StreamHelper.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka import kafka.KafkaHelper import kafka.common.TopicAndPartition import kafka.consumer.PartitionTopicInfo import kafka.message.MessageAndMetadata import kafka.serializer.Decoder import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.InputDStream import org.apache.spark.{Logging, SparkException} import scala.reflect.ClassTag case class StreamHelper(kafkaParams: Map[String, String]) extends Logging { // helper for kafka zookeeper lazy val kafkaHelper = KafkaHelper(kafkaParams) lazy val kc = new KafkaCluster(kafkaParams) // 1. get leader's earliest and latest offset // 2. get consumer offset // 3-1. if (2) is bounded in (1) use (2) for stream // 3-2. else use (1) by "auto.offset.reset" private def getStartOffsets(topics: Set[String]): Map[TopicAndPartition, Long] = { lazy val reset = kafkaParams.get("auto.offset.reset").map(_.toLowerCase) lazy val consumerOffsets = kafkaHelper.getConsumerOffsets(topics.toSeq) { for { topicPartitions <- kc.getPartitions(topics).right smallOffsets <- kc.getEarliestLeaderOffsets(topicPartitions).right largeOffsets <- kc.getLatestLeaderOffsets(topicPartitions).right } yield { { for { tp <- topicPartitions } yield { val co = consumerOffsets.getOrElse(tp, PartitionTopicInfo.InvalidOffset) val so = smallOffsets.get(tp).map(_.offset).get val lo = largeOffsets.get(tp).map(_.offset).get logWarning(s"$tp: $co $so $lo") if (co >= so && co <= lo) { (tp, co) } else { (tp, reset match { case Some("smallest") => so case _ => lo }) } } }.toMap } }.fold(errs => throw new SparkException(errs.mkString("\n")), ok => ok) } def createStream[K: ClassTag, V: ClassTag, KD <: Decoder[K]: ClassTag, VD <: Decoder[V]: ClassTag](ssc: StreamingContext, topics: Set[String]): InputDStream[(K, V)] = { type R = (K, V) val messageHandler = (mmd: MessageAndMetadata[K, V]) => (mmd.key(), mmd.message()) kafkaHelper.registerConsumerInZK(topics) new DirectKafkaInputDStream[K, V, KD, VD, R](ssc, kafkaParams, getStartOffsets(topics), messageHandler) } def commitConsumerOffsets(offsets: HasOffsetRanges): Unit = { val offsetsMap = { for { range <- offsets.offsetRanges if range.fromOffset < range.untilOffset } yield { logDebug(range.toString()) TopicAndPartition(range.topic, range.partition) -> range.untilOffset } }.toMap kafkaHelper.commitConsumerOffsets(offsetsMap) } def commitConsumerOffset(range: OffsetRange): Unit = { if (range.fromOffset < range.untilOffset) { try { val tp = TopicAndPartition(range.topic, range.partition) logDebug("Committed offset " + range.untilOffset + " for topic " + tp) kafkaHelper.commitConsumerOffset(tp, range.untilOffset) } catch { case t: Throwable => // log it and let it go logWarning("exception during commitOffsets", t) throw t } } } def commitConsumerOffsets[R](stream: InputDStream[R]): Unit = { stream.foreachRDD { rdd => commitConsumerOffsets(rdd.asInstanceOf[HasOffsetRanges]) } } }
Example 31
Source File: NotAvailableFeaturesTest.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.rdd import com.basho.riak.client.core.netty.RiakResponseException import com.basho.riak.spark._ import org.apache.spark.SparkException import org.apache.spark.sql.Row import org.hamcrest.CustomTypeSafeMatcher import org.junit.rules.ExpectedException import org.junit.{ Rule, Test } import org.junit.experimental.categories.Category class NotAvailableFeaturesTest extends AbstractRiakSparkTest { val _expectedException: ExpectedException = ExpectedException.none() @Rule def expectedException: ExpectedException = _expectedException val coverageMatcher = new CustomTypeSafeMatcher[IllegalStateException]("match") { override def matchesSafely(t: IllegalStateException): Boolean = { t.getMessage.contains("Full bucket read is not supported on your version of Riak") && t.getCause.isInstanceOf[RiakResponseException] && t.getCause.getMessage.contains("Unknown message code: 70") } } val timeSeriesMatcher = new CustomTypeSafeMatcher[SparkException]("match") { override def matchesSafely(t: SparkException): Boolean = { t.getMessage.contains("Range queries are not supported in your version of Riak") && t.getMessage.contains("Unknown message code: 90") } } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def timeSeriesOnKV(): Unit = { expectedException.expect(timeSeriesMatcher) val rdd = sc.riakTSTable[Row]("bucket") .sql("select * from bucket") .collect() } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def fullBucketReadOnKV(): Unit = { expectedException.expect(coverageMatcher) val rdd = sc.riakBucket[String](DEFAULT_NAMESPACE) .queryAll() .collect() } @Category(Array(classOf[RiakKVTests],classOf[RiakKVNotAvailableFeaturesTest])) @Test def queryRangeLocalOnKV(): Unit = { expectedException.expect(coverageMatcher) val rdd = sc.riakBucket[String](DEFAULT_NAMESPACE) .query2iRangeLocal("creationNo", 1, 1000) .collect() } }
Example 32
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 33
Source File: LocalIndexToString.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.SparkException import org.apache.spark.ml.feature.IndexToString class LocalIndexToString(override val sparkTransformer: IndexToString) extends LocalTransformer[IndexToString] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val labels = sparkTransformer.getLabels val indexer = (index: Double) => { val idx = index.toInt if (0 <= idx && idx < labels.length) { labels(idx) } else { throw new SparkException(s"Unseen index: $index ??") } } val newColumn = LocalDataColumn( sparkTransformer.getOutputCol, column.data map { case i: Int => indexer(i.toDouble) case d: Double => indexer(d) case d => throw new IllegalArgumentException(s"Unknown data to index: $d") } ) localData.withColumn(newColumn) case None => localData } } } object LocalIndexToString extends SimpleModelLoader[IndexToString] with TypedTransformerConverter[IndexToString] { override def build(metadata: Metadata, data: LocalData): IndexToString = { val ctor = classOf[IndexToString].getDeclaredConstructor(classOf[String]) ctor.setAccessible(true) ctor .newInstance(metadata.uid) .setLabels(metadata.paramMap("labels").asInstanceOf[Seq[String]].toArray) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal(transformer: IndexToString) = new LocalIndexToString(transformer) }
Example 34
Source File: LocalStringIndexerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.SparkException import org.apache.spark.ml.feature.StringIndexerModel import scala.collection.mutable class LocalStringIndexerModel(override val sparkTransformer: StringIndexerModel) extends LocalTransformer[StringIndexerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val labelToIndex = { val n = sparkTransformer.labels.length val map = new mutable.HashMap[String, Double] var i = 0 while (i < n) { map.update(sparkTransformer.labels(i), i) i += 1 } map } val indexer = (label: String) => { if (labelToIndex.contains(label)) { labelToIndex(label) } else { throw new SparkException(s"Unseen label: $label.") } } val newColumn = LocalDataColumn(sparkTransformer.getOutputCol, column.data.map(_.toString) map { feature => indexer(feature) }) localData.withColumn(newColumn) case None => localData } } } object LocalStringIndexerModel extends SimpleModelLoader[StringIndexerModel] with TypedTransformerConverter[StringIndexerModel] { override def build(metadata: Metadata, data: LocalData): StringIndexerModel = { new StringIndexerModel( metadata.uid, data.column("labels").get.data.head.asInstanceOf[Seq[String]].toArray ).setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setHandleInvalid(metadata.paramMap("handleInvalid").asInstanceOf[String]) } override implicit def toLocal( transformer: StringIndexerModel ) = new LocalStringIndexerModel(transformer) }
Example 35
Source File: PipelineBuilder.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.SparkException import org.apache.spark.ml.PipelineStage import scala.collection.mutable import scala.collection.mutable.ArrayBuffer class IncompatibleFiledExecption(msg: String) extends SparkException(msg) {} object PipelineBuilder { def build(transformers: Array[TransformerWrapper]): Array[PipelineStage] = { val stages: ArrayBuffer[PipelineStage] = new ArrayBuffer[PipelineStage]() //val allInputCols: ArrayBuffer[String] = new ArrayBuffer[String]() val allInputCols: mutable.HashSet[String] = new mutable.HashSet[String]() transformers(0).setInputCols(transformers(0).requiredInputCols) transformers(0).setOutputCols(transformers(0).requiredOutputCols) allInputCols ++= transformers(0).getInputCols transformers(0).setAncestorCols(allInputCols.toArray) stages += transformers(0).declareInAndOut().getTransformer (1 until transformers.length).foreach { i => println(s"add $i-th transformer = ${transformers(i).getTransformer.getClass.getSimpleName}") // set parent transformers(i).setParent(transformers(i - 1)) // add new cols allInputCols ++= transformers(i - 1).getOutputCols // set parent cols transformers(i).setAncestorCols(allInputCols.toArray) // generate input cols transformers(i).generateInputCols() // generate output cols transformers(i).generateOutputCols() // add fully configured transformer stages += transformers(i).declareInAndOut().getTransformer } stages.toArray } }
Example 36
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 37
Source File: CommitFailureTestRelationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 38
Source File: ThriftServerTab.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 39
Source File: DataSourceManagerFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.util.Utils object DataSourceManagerFactory { def create( datasourceType: String, conf: SparkConf, hadoopConf: Configuration): DataSourceManager = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[DataSourceManager], loader) var cls: Class[_] = null // As we use ServiceLoader to support creating any user provided DataSourceManager here, // META-INF/services/org.apache.spark.sql.sources.DataSourceRegister must be packaged properly // in user's jar, and the implementation of DataSourceManager must have a public parameterless // constructor. For scala language, def this() = this(null...) just work. try { cls = serviceLoader.asScala .filter(_.shortName().equals(datasourceType)) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException(s"error when instantiate datasource ${datasourceType}") } } catch { case _: Exception => throw new SparkException( s"""Can't find corresponding DataSourceManager for ${datasourceType} type, |please check |1. META-INF/services/org.apache.spark.sql.sources.DataSourceRegister is packaged |2. your implementation of DataSourceManager's shortname is ${datasourceType} |3. your implementation of DataSourceManager must have a public parameterless | constructor. For scala language, def this() = this(null, null, ...) just work. """.stripMargin) } try { val constructor = cls.getConstructor(classOf[SparkConf], classOf[Configuration]) val newHadoopConf = new Configuration(hadoopConf) constructor.newInstance(conf, newHadoopConf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => try { cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[DataSourceManager] } catch { case _: NoSuchMethodException => cls.getConstructor().newInstance().asInstanceOf[DataSourceManager] } } } }
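A hedged usage sketch for the factory above; "mysql" is a hypothetical shortName and must match a DataSourceManager implementation registered through the ServiceLoader file mentioned in the code comments.
// Sketch only: resolve a DataSourceManager by its shortName().
import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.sql.xsql.DataSourceManagerFactory

val manager = DataSourceManagerFactory.create("mysql", new SparkConf(), new Configuration())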
Example 40
Source File: StreamingIncrementCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command import java.util.Locale import org.apache.spark.SparkException import org.apache.spark.sql.{Dataset, Row, SparkSession} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, AttributeSet} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, UnaryNode} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.execution.streaming.StreamingRelationV2 import org.apache.spark.sql.sources.v2.StreamWriteSupport import org.apache.spark.sql.streaming.{OutputMode, Trigger} import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.StreamingSinkType case class StreamingIncrementCommand(plan: LogicalPlan) extends RunnableCommand { private var outputMode: OutputMode = OutputMode.Append // dummy override def output: Seq[AttributeReference] = Seq.empty // dummy override def producedAttributes: AttributeSet = plan.producedAttributes override def run(sparkSession: SparkSession): Seq[Row] = { import StreamingSinkType._ val qe = new QueryExecution(sparkSession, new ConstructedStreaming(plan)) val df = new Dataset(sparkSession, qe, RowEncoder(qe.analyzed.schema)) plan.collectLeaves.head match { case StreamingRelationV2(_, _, extraOptions, _, _) => val source = extraOptions.getOrElse(STREAMING_SINK_TYPE, DEFAULT_STREAMING_SINK) val sinkOptions = extraOptions.filter(_._1.startsWith(STREAMING_SINK_PREFIX)).map { kv => val key = kv._1.substring(STREAMING_SINK_PREFIX.length) (key, kv._2) } StreamingSinkType.withName(source.toUpperCase(Locale.ROOT)) match { case CONSOLE => case TEXT | PARQUET | ORC | JSON | CSV => if (sinkOptions.get(STREAMING_SINK_PATH) == None) { throw new SparkException("Sink type is file, must config path") } case KAFKA => if (sinkOptions.get(STREAMING_SINK_BOOTSTRAP_SERVERS) == None) { throw new SparkException("Sink type is kafka, must config bootstrap servers") } if (sinkOptions.get(STREAMING_SINK_TOPIC) == None) { throw new SparkException("Sink type is kafka, must config kafka topic") } case _ => throw new SparkException( "Sink type is invalid, " + s"select from ${StreamingSinkType.values}") } val ds = DataSource.lookupDataSource(source, sparkSession.sessionState.conf) val disabledSources = sparkSession.sqlContext.conf.disabledV2StreamingWriters.split(",") val sink = ds.newInstance() match { case w: StreamWriteSupport if !disabledSources.contains(w.getClass.getCanonicalName) => w case _ => val ds = DataSource( sparkSession, className = source, options = sinkOptions.toMap, partitionColumns = Nil) ds.createSink(InternalOutputModes.Append) } val outputMode = InternalOutputModes( extraOptions.getOrElse(STREAMING_OUTPUT_MODE, DEFAULT_STREAMING_OUTPUT_MODE)) val duration = extraOptions.getOrElse(STREAMING_TRIGGER_DURATION, DEFAULT_STREAMING_TRIGGER_DURATION) val trigger = extraOptions.getOrElse(STREAMING_TRIGGER_TYPE, DEFAULT_STREAMING_TRIGGER_TYPE) match { case STREAMING_MICRO_BATCH_TRIGGER => Trigger.ProcessingTime(duration) case STREAMING_ONCE_TRIGGER => Trigger.Once() case STREAMING_CONTINUOUS_TRIGGER => Trigger.Continuous(duration) } val query = sparkSession.sessionState.streamingQueryManager.startQuery( extraOptions.get("queryName"), extraOptions.get(STREAMING_CHECKPOINT_LOCATION), df, sinkOptions.toMap, sink, 
outputMode, useTempCheckpointLocation = source == DEFAULT_STREAMING_SINK, recoverFromCheckpointLocation = true, trigger = trigger) query.awaitTermination() } // dummy Seq.empty } } case class ConstructedStreaming(child: LogicalPlan) extends UnaryNode { override def output: Seq[Attribute] = child.output }
Example 41
Source File: UDTRegistration.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
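A short usage sketch, assuming the call site lives in the org.apache.spark namespace (the registry is private[spark]); the com.example names are hypothetical placeholders.
// Sketch only: a lookup for the default-registered ml Vector class, plus a custom registration.
// The custom UDT class must be on the classpath and extend UserDefinedType, or getUDTFor throws.
val vectorUdt: Option[Class[_]] = UDTRegistration.getUDTFor("org.apache.spark.ml.linalg.Vector")
UDTRegistration.register("com.example.MyPoint", "com.example.MyPointUDT")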
Example 42
Source File: ScalaUDFSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import java.util.Locale import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil, true :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase(Locale.ROOT), StringType, Literal.create(null, StringType) :: Nil, true :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvaluationWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil, true :: Nil).genCode(ctx) assert(ctx.inlinedMutableStates.isEmpty) } }
Example 43
Source File: FailureSafeParser.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
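A minimal sketch of driving the parser directly, assuming a trivial raw parser over plain strings; the schema and column names are made up for illustration.
// Sketch only: wrap a raw parser so malformed records land in a corrupt-record column.
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.PermissiveMode
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = new StructType().add("value", StringType).add("_corrupt_record", StringType)
val parser = new FailureSafeParser[String](
  rawParser = s => Seq(new GenericInternalRow(Array[Any](UTF8String.fromString(s)))),
  mode = PermissiveMode,
  schema = schema,
  columnNameOfCorruptRecord = "_corrupt_record")
val rows = parser.parse("hello").toList // one row: "hello" plus a null corrupt-record column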
Example 44
Source File: InsertIntoDataSourceDirCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ case class InsertIntoDataSourceDirCommand( storage: CatalogStorageFormat, provider: String, query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { override protected def innerChildren: Seq[LogicalPlan] = query :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty, "Directory path is required") assert(provider.nonEmpty, "Data source is required") // Create the relation based on the input logical plan: `query`. val pathOption = storage.locationUri.map("path" -> CatalogUtils.URIToString(_)) val dataSource = DataSource( sparkSession, className = provider, options = storage.properties ++ pathOption, catalogTable = None) val isFileFormat = classOf[FileFormat].isAssignableFrom(dataSource.providingClass) if (!isFileFormat) { throw new SparkException( "Only Data Sources providing FileFormat are supported: " + dataSource.providingClass) } val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => logError(s"Failed to write to directory " + storage.locationUri.toString, ex) throw ex } Seq.empty[Row] } }
Example 45
Source File: WriteToContinuousDataSourceExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.continuous import scala.util.control.NonFatal import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.streaming.StreamExecution import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter case class WriteToContinuousDataSourceExec(writer: StreamWriter, query: SparkPlan) extends SparkPlan with Logging { override def children: Seq[SparkPlan] = Seq(query) override def output: Seq[Attribute] = Nil override protected def doExecute(): RDD[InternalRow] = { val writerFactory = writer.createWriterFactory() val rdd = new ContinuousWriteRDD(query.execute(), writerFactory) logInfo(s"Start processing data source writer: $writer. " + s"The input RDD has ${rdd.partitions.length} partitions.") EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetWriterPartitions(rdd.getNumPartitions)) try { // Force the RDD to run so continuous processing starts; no data is actually being collected // to the driver, as ContinuousWriteRDD outputs nothing. rdd.collect() } catch { case _: InterruptedException => // Interruption is how continuous queries are ended, so accept and ignore the exception. case cause: Throwable => cause match { // Do not wrap interruption exceptions that will be handled by streaming specially. case _ if StreamExecution.isInterruptionException(cause) => throw cause // Only wrap non fatal exceptions. case NonFatal(e) => throw new SparkException("Writing job aborted.", e) case _ => throw cause } } sparkContext.emptyRDD } }
Example 46
Source File: UDTRegistrationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 47
Source File: ParquetFileFormatSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(spark.sessionState.newHadoopConf()) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( spark.sessionState.newHadoopConf(), fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[SparkException] { testReadFooters(false) }.getCause assert(exception.getMessage().contains("Could not read footer for file")) } }
Example 48
Source File: DruidRule.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.druid import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala import org.apache.spark.sql.catalyst.expressions.{ Attribute, Expression => SExpression, Literal, NamedExpression, SortOrder } import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.Rule object DruidRule extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan transformUp { case Aggregate(ges, aes, p @ Project(_, _)) => ProjectAndAggregate(ges, aes, p) case s @ Sort(orders, _, child) => if (child.isInstanceOf[ProjectAndAggregate]) { child.asInstanceOf[ProjectAndAggregate].copy(orders = orders) } else { s } case l @ LocalLimit(Literal(v, t), child) => val value: Any = convertToScala(v, t) val limit = value.asInstanceOf[Int] if (limit < 0) { throw new SparkException(s"Aggregate limit must great than zero!") } if (child.isInstanceOf[ProjectAndAggregate]) { child.asInstanceOf[ProjectAndAggregate].copy(limit = limit) } else { l } case g @ GlobalLimit(_, child) => if (child.isInstanceOf[ProjectAndAggregate]) { child } else { g } } } case class ProjectAndAggregate( groupingExpressions: Seq[SExpression], aggregateExpressions: Seq[NamedExpression], child: LogicalPlan, orders: Seq[SortOrder] = null, limit: Int = 20) extends UnaryNode { override def output: Seq[Attribute] = aggregateExpressions.map(_.toAttribute) }
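One way to try the rule from a plain SparkSession is the experimental optimizer hook sketched below; whether XSQL wires DruidRule in exactly this way is not shown here, so treat it as an assumption.
// Sketch only: append the rule to the extra optimizer rules of an existing SparkSession.
spark.experimental.extraOptimizations ++= Seq(DruidRule)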
Example 49
Source File: AlarmFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.alarm import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.util.Utils object AlarmFactory { def create(alarmName: String, options: Map[String, String]): Alarm = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[Alarm], loader) val AlarmClass = serviceLoader.asScala.filter(_.name.equalsIgnoreCase(alarmName)).toList match { case head :: Nil => head.getClass case _ => throw new SparkException("error when instantiate spark.xsql.alarm.items") } AlarmClass.newInstance().bind(options) } }
Example 50
Source File: MonitorFactory.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.monitor import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.alarm.Alarm import org.apache.spark.util.Utils import org.apache.spark.util.kvstore.KVStore object MonitorFactory { def create( monitorName: String, alarms: Seq[Alarm], appStore: KVStore, conf: SparkConf): Monitor = { val loader = Utils.getContextOrSparkClassLoader val serviceLoader = ServiceLoader.load(classOf[Monitor], loader) val MonitorClass = serviceLoader.asScala .filter(_.item.equals(MonitorItem.withName(monitorName))) .toList match { case head :: Nil => head.getClass case _ => throw new SparkException("error when instantiate spark.xsql.monitor.items") } MonitorClass.newInstance().bind(alarms).bind(appStore).bind(conf) } }
Example 51
Source File: MetricsSystems.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.metrics import com.codahale.metrics._ import org.apache.spark.SparkException import org.apache.spark.internal.Logging import scala.collection.JavaConverters.mapAsScalaMapConverter // These will only be created on executors private[metrics] class RpcMetricsSystem( private val metricsSource: MetricsSource ) extends UserMetricsSystem with Logging { require(metricsSource != null, "metricsSource cannot be null") private val namespace = metricsSource.sourceName private val metricProxies = metricsSource.metricRegistry.getMetrics.asScala def counter(metricName: String): Counter = { getMetric[CounterProxy](metricName) } def histogram(metricName: String): Histogram = { getMetric[HistogramProxy](metricName) } def meter(metricName: String): Meter = { getMetric[MeterProxy](metricName) } def timer(metricName: String): Timer = { getMetric[TimerProxy](metricName) } def gauge[T](metricName: String): SettableGauge[T] = { getMetric[SettableGaugeProxy[T]](metricName) } private def getMetric[T <: MetricProxy](metricName: String): T = { metricProxies.get(metricName) match { case Some(metric) => { metric.asInstanceOf[T] } case None => throw new SparkException(s"Metric '${metricName}' in namespace ${namespace} was not found") } } } // These can be created on the driver and the executors. class LocalMetricsSystem( metricsSource: MetricsSource ) extends UserMetricsSystem { require(metricsSource != null, "metricsSource cannot be null") private val namespace = metricsSource.sourceName private lazy val metrics = metricsSource.metricRegistry.getMetrics.asScala def counter(metricName: String): Counter = { getMetric[Counter](metricName) } def histogram(metricName: String): Histogram = { getMetric[Histogram](metricName) } def meter(metricName: String): Meter = { getMetric[Meter](metricName) } def timer(metricName: String): Timer = { getMetric[Timer](metricName) } def gauge[T](metricName: String): SettableGauge[T] = { val metric = getMetric[Gauge[T]](metricName) // If we have one, but it's not a settable gauge, it will run autonomously and provide metrics. // However, this is an exception here, as the developer wants to set it. if (!(metric.isInstanceOf[SettableGauge[T]])) { throw new SparkException(s"Gauge ${metricName} does not extend SettableGauge[T]") } metric.asInstanceOf[SettableGauge[T]] } private def getMetric[T <: Metric](metricName: String): T = { metrics.get(metricName) match { case Some(metric) => { metric.asInstanceOf[T] } case None => throw new SparkException(s"Metric '${metricName}' in namespace ${namespace} was not found") } } }
Example 52
Source File: UnifiedSparkListener.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.listeners import java.time.Instant import org.apache.spark.{SparkConf, SparkException, SparkInformation} import org.apache.spark.internal.Logging import org.apache.spark.listeners.sink.SparkListenerSink import org.apache.spark.scheduler._ import org.apache.spark.sql.streaming.StreamingQueryListener import org.apache.spark.util.JsonProtocol import org.json4s.JsonAST.JValue import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods.{compact, render} import scala.util.control.NonFatal class UnifiedSparkListener(override val conf: SparkConf) extends UnifiedSparkListenerHandler with Logging with SparkListenerHandlers with StreamingListenerHandlers with StreamingQueryListenerHandlers { private val listenerSink = this.createSink(this.conf) override def onOtherEvent(event: SparkListenerEvent): Unit = { // All events in Spark that are not specific to SparkListener go through // this method. The typed ListenerBus implementations intercept and forward to // their "local" listeners. // We will just handle everything here so we only have to have one listener. // The advantage is that this can be registered in extraListeners, so no // code change is required to add listener support. event match { // We will use the ClassTag for the private wrapper class to match case this.streamingListenerEventClassTag(e) => this.onStreamingListenerEvent(e) case streamingQueryListenerEvent: StreamingQueryListener.Event => this.onStreamingQueryListenerEvent(streamingQueryListenerEvent) case sparkListenerEvent: SparkListenerEvent => if (sparkListenerEvent.logEvent) { logSparkListenerEvent(sparkListenerEvent) } } } private def createSink(conf: SparkConf): SparkListenerSink = { val sink = conf.getOption("spark.unifiedListener.sink") match { case Some(listenerSinkClassName) => listenerSinkClassName case None => throw new SparkException("spark.unifiedListener.sink setting is required") } logInfo(s"Creating listener sink: ${sink}") org.apache.spark.util.Utils.loadExtensions( classOf[SparkListenerSink], Seq(sink), conf).head } protected def logSparkListenerEvent( event: SparkListenerEvent, getTimestamp: () => Instant = () => Instant.now()): Unit = { val json = try { // Add a well-known time field. Some( JsonProtocol.sparkEventToJson(event) .merge(render( SparkInformation.get() + ("SparkEventTime" -> getTimestamp().toString) )) ) } catch { case NonFatal(e) => logError(s"Error serializing SparkListenerEvent to JSON: $event", e) None } sendToSink(json) } private[spark] def sendToSink(json: Option[JValue]): Unit = { try { json match { case Some(j) => { logDebug(s"Sending event to listener sink: ${compact(j)}") this.listenerSink.logEvent(json) } case None => { logWarning("json value was None") } } } catch { case NonFatal(e) => logError(s"Error sending to listener sink: $e") } } }
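A hedged configuration sketch: spark.extraListeners is standard Spark, and spark.unifiedListener.sink is the key read above; the sink class name below is a placeholder for whatever SparkListenerSink implementation is on the classpath.
// Sketch only: register the listener and point it at a sink implementation.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.extraListeners", "org.apache.spark.listeners.UnifiedSparkListener")
  .set("spark.unifiedListener.sink", "com.example.MyListenerSink") // hypothetical sink class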
Example 53
Source File: LogAnalyticsMetricsSink.scala From spark-monitoring with MIT License | 5 votes |
package org.apache.spark.metrics.sink.loganalytics import java.util.Properties import java.util.concurrent.TimeUnit import com.codahale.metrics.MetricRegistry import org.apache.spark.internal.Logging import org.apache.spark.metrics.sink.Sink import org.apache.spark.{SecurityManager, SparkException} private class LogAnalyticsMetricsSink( val property: Properties, val registry: MetricRegistry, securityMgr: SecurityManager) extends Sink with Logging { private val config = new LogAnalyticsSinkConfiguration(property) org.apache.spark.metrics.MetricsSystem.checkMinimalPollingPeriod(config.pollUnit, config.pollPeriod) var reporter = LogAnalyticsReporter.forRegistry(registry) .withWorkspaceId(config.workspaceId) .withWorkspaceKey(config.secret) .withLogType(config.logType) .build() override def start(): Unit = { reporter.start(config.pollPeriod, config.pollUnit) logInfo(s"LogAnalyticsMetricsSink started") } override def stop(): Unit = { reporter.stop() logInfo("LogAnalyticsMetricsSink stopped.") } override def report(): Unit = { reporter.report() } }
Example 54
Source File: MesosClusterManager.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.config._ import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { require(!sc.conf.get(IO_ENCRYPTION_ENABLED), "I/O encryption is currently not supported in Mesos.") val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
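A short sketch of the configuration this manager reacts to; the Mesos master host and port are hypothetical.
// Sketch only: any master URL starting with "mesos" is routed through MesosClusterManager.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("mesos://mesos-master.example.com:5050")
  .set("spark.mesos.coarse", "true") // default; "false" selects the fine-grained backend
  .setAppName("mesos-demo")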
Example 55
Source File: RWrappers.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 56
Source File: NumericParser.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
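A minimal usage sketch, assuming the caller sits inside the mllib namespace (the parser is package-private).
// Sketch only: parse a tuple holding a scalar and an array literal.
val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]
val values = parsed(1).asInstanceOf[Array[Double]]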
Example 57
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
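A short sketch of the public parse entry point shown above, covering both supported string formats.
// Sketch only: tuple format and the legacy pre-1.0 dense format.
import org.apache.spark.mllib.regression.LabeledPoint

val p1 = LabeledPoint.parse("(1.0,[1.5,0.0,3.0])") // label 1.0 with a dense vector
val p2 = LabeledPoint.parse("0.0, 2.0 4.0 6.0")    // "label, v1 v2 v3" legacy format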
Example 58
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
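A hedged reader sketch matching the options exercised by the suite above; the file path is hypothetical and spark is an existing SparkSession.
// Sketch only: load LIBSVM data into (label, features) columns.
val df = spark.read.format("libsvm")
  .option("numFeatures", "6") // optional; inferred from the data when omitted
  .load("data/sample_libsvm_data.txt")
df.select("label", "features").show(3)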
Example 59
Source File: NumericParserSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 60
Source File: CommitFailureTestRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 61
Source File: ThriftServerTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 62
Source File: UDTRegistration.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
Example 63
Source File: ScalaUDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
Example 64
Source File: UDTRegistrationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 65
Source File: HDFSCredentialProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import java.io.{ByteArrayInputStream, DataInputStream} import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hdfs.security.token.delegation.DelegationTokenIdentifier import org.apache.hadoop.mapred.Master import org.apache.hadoop.security.Credentials import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.yarn.config._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config._ private[security] class HDFSCredentialProvider extends ServiceCredentialProvider with Logging { // Token renewal interval, this value will be set in the first call, // if None means no token renewer specified, so cannot get token renewal interval. private var tokenRenewalInterval: Option[Long] = null override val serviceName: String = "hdfs" override def obtainCredentials( hadoopConf: Configuration, sparkConf: SparkConf, creds: Credentials): Option[Long] = { // NameNode to access, used to get tokens from different FileSystems nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) logInfo("getting token for namenode: " + dst) dstFs.addDelegationTokens(getTokenRenewer(hadoopConf), creds) } // Get the token renewal interval if it is not set. It will only be called once. if (tokenRenewalInterval == null) { tokenRenewalInterval = getTokenRenewalInterval(hadoopConf, sparkConf) } // Get the time of next renewal. tokenRenewalInterval.map { interval => creds.getAllTokens.asScala .filter(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) .map { t => val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) identifier.getIssueDate + interval }.foldLeft(0L)(math.max) } } private def getTokenRenewalInterval( hadoopConf: Configuration, sparkConf: SparkConf): Option[Long] = { // We cannot use the tokens generated with renewer yarn. Trying to renew // those will fail with an access control issue. So create new tokens with the logged in // user as renewer. sparkConf.get(PRINCIPAL).flatMap { renewer => val creds = new Credentials() nnsToAccess(hadoopConf, sparkConf).foreach { dst => val dstFs = dst.getFileSystem(hadoopConf) dstFs.addDelegationTokens(renewer, creds) } val hdfsToken = creds.getAllTokens.asScala .find(_.getKind == DelegationTokenIdentifier.HDFS_DELEGATION_KIND) hdfsToken.map { t => val newExpiration = t.renew(hadoopConf) val identifier = new DelegationTokenIdentifier() identifier.readFields(new DataInputStream(new ByteArrayInputStream(t.getIdentifier))) val interval = newExpiration - identifier.getIssueDate logInfo(s"Renewal Interval is $interval") interval } } } private def getTokenRenewer(conf: Configuration): String = { val delegTokenRenewer = Master.getMasterPrincipal(conf) logDebug("delegation token renewer is: " + delegTokenRenewer) if (delegTokenRenewer == null || delegTokenRenewer.length() == 0) { val errorMessage = "Can't get Master Kerberos principal for use as renewer" logError(errorMessage) throw new SparkException(errorMessage) } delegTokenRenewer } private def nnsToAccess(hadoopConf: Configuration, sparkConf: SparkConf): Set[Path] = { sparkConf.get(NAMENODES_TO_ACCESS).map(new Path(_)).toSet + sparkConf.get(STAGING_DIR).map(new Path(_)) .getOrElse(FileSystem.get(hadoopConf).getHomeDirectory) } }
Example 66
Source File: YarnClusterManager.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
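A short sketch of the settings this manager dispatches on; the application name is arbitrary.
// Sketch only: "yarn" as the master plus the deploy mode selects the scheduler pair above.
import org.apache.spark.SparkConf

val conf = new SparkConf()
  .setMaster("yarn")
  .set("spark.submit.deployMode", "client") // or "cluster"
  .setAppName("yarn-demo")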
Example 67
Source File: HDFSCredentialProviderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.scalatest.{Matchers, PrivateMethodTester} import org.apache.spark.{SparkConf, SparkException, SparkFunSuite} class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers { private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer) private def getTokenRenewer( hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = { hdfsCredentialProvider invokePrivate _getTokenRenewer(conf) } private var hdfsCredentialProvider: HDFSCredentialProvider = null override def beforeAll() { super.beforeAll() if (hdfsCredentialProvider == null) { hdfsCredentialProvider = new HDFSCredentialProvider() } } override def afterAll() { if (hdfsCredentialProvider != null) { hdfsCredentialProvider = null } super.afterAll() } test("check token renewer") { val hadoopConf = new Configuration() hadoopConf.set("yarn.resourcemanager.address", "myrm:8033") hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM") val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf) renewer should be ("yarn/myrm:8032@SPARKTEST.COM") } test("check token renewer default") { val hadoopConf = new Configuration() val caught = intercept[SparkException] { getTokenRenewer(hdfsCredentialProvider, hadoopConf) } assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer") } }
Example 68
Source File: UnionDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
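A minimal streaming sketch that ends up building a UnionDStream; the socket hosts are hypothetical.
// Sketch only: union two DStreams with the same slide duration.
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

val ssc = new StreamingContext(new SparkConf().setMaster("local[2]").setAppName("union-demo"), Seconds(10))
val a = ssc.socketTextStream("host1.example.com", 9999)
val b = ssc.socketTextStream("host2.example.com", 9999)
a.union(b).count().print() // ssc.union(Seq(a, b)) also works for many parents
ssc.start()
ssc.awaitTermination()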
Example 69
Source File: TransformedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 70
Source File: StreamingTab.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 71
Source File: RpcEndpointAddress.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
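RpcEndpointAddress is private[spark], so the sketch below only compiles from code placed in an org.apache.spark package; the host, port, and endpoint name are illustrative. It shows the round trip between the spark://name@host:port form and the parsed address:

package org.apache.spark.rpc

import org.apache.spark.SparkException

object RpcEndpointAddressSketch {
  def main(args: Array[String]): Unit = {
    val addr = RpcEndpointAddress("spark://driver@192.168.1.10:7077")
    assert(addr.name == "driver")
    assert(addr.rpcAddress == RpcAddress("192.168.1.10", 7077))
    assert(addr.toString == "spark://driver@192.168.1.10:7077")

    // Anything that is not exactly spark://name@host:port is rejected.
    try RpcEndpointAddress("http://192.168.1.10:7077")
    catch { case e: SparkException => println(e.getMessage) }
  }
}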
Example 72
Source File: RUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 73
Source File: RpcAddressSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 74
Source File: KryoSerializerResizableOutputSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
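Outside of tests, the usual remedy for the failure mode exercised above is to keep the initial buffer small and raise only the maximum, so Kryo can grow the output buffer on demand. A sketch with illustrative sizes:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "64k")      // per-core starting buffer
  .set("spark.kryoserializer.buffer.max", "128m") // ceiling before a SparkException is thrown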
Example 75
Source File: ProactiveClosureSerializationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 76
Source File: CoarseGrainedSchedulerBackendSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
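The error message asserted above points at the standard fix: ship large read-only data through a broadcast variable instead of capturing it in the task closure. A brief sketch, assuming an existing SparkContext named sc and a large local Map[Int, Int] named lookup (both assumptions, not part of the suite):

// `sc` and `lookup` are assumed to exist; only the pattern matters here.
val lookupBc = sc.broadcast(lookup)                 // sent once per executor, not once per task
val hits = sc.parallelize(1 to 1000)
  .map(i => lookupBc.value.getOrElse(i, 0))
  .reduce(_ + _)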
Example 77
Source File: FuncTestSparkNotebookContext.scala From uberdata with Apache License 2.0 | 5 votes |
package eleflow.uberdata.core import eleflow.uberdata.core.data.{DataTransformer, Dataset} import eleflow.uberdata.core.enums.DataSetType import org.apache.spark.rpc.netty.{BeforeAndAfterWithContext, TestSparkConf} import org.apache.spark.SparkException import org.scalatest._ class FuncTestSparkNotebookContext extends FlatSpec with BeforeAndAfterWithContext { this: Suite => val uberContext = context "Functional SparkNotebookContext" should "correctly load rdd" in { import Dataset._ val dataset = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") val testDataSet = Dataset(uberContext, s"${defaultFilePath}FuncTestSparkNotebookContextFile2.csv") val (train, test, _) = DataTransformer.createLabeledPointFromRDD(dataset, testDataSet, "int", "id") val all = train.take(3) val (_, first) = all.head val (_, second) = all.tail.head assert(first.label == 5.0) assert(first.features.toArray.deep == Array[Double](0.0, 1.0, 10.5).deep) assert(second.label == 1.0) assert(second.features.toArray.deep == Array[Double](1.0, 0.0, 0.1).deep) val allTest = test.take(3) val (_, firstTest) = allTest.head val (_, secondTest) = allTest.tail.head assert(firstTest.label == 1.0) assert(firstTest.features.toArray.deep == Array[Double](0.0, 1.0, 10.5).deep) assert(secondTest.label == 2.0) assert(secondTest.features.toArray.deep == Array[Double](1.0, 0.0, 0.1).deep) } it should "Throw an exception when process an empty numeric column" in { @transient lazy val context = uberContext context.sparkContext try { import Dataset._ val dataset = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv") dataset.take(3) } catch { case e: SparkException => assert(e.getMessage.contains("UnexpectedFileFormatException")) } } it should "Correct handle empty string values" in { @transient lazy val context = uberContext context.sparkContext val schemaRdd = Dataset(context, s"${defaultFilePath}FuncTestSparkNotebookContextEmpty.csv").toDataFrame val result = DataTransformer .createLabeledPointFromRDD(schemaRdd, Seq("int"), Seq("id"), DataSetType.Train) assert(result.count() == 3) } it should "Throw an exception when input have different number of columns" in { uberContext.sparkContext try { context .load(s"${defaultFilePath}FuncTestSparkNotebookContextFile1.csv", TestSparkConf.separator) } catch { case e: SparkException => assert(e.getMessage.contains("UnexpectedFileFormatException")) } } }
Example 78
Source File: MasterSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import akka.actor.Address
import org.scalatest.FunSuite

import org.apache.spark.{SSLOptions, SparkConf, SparkException}

class MasterSuite extends FunSuite {

  test("toAkkaUrl") {
    val conf = new SparkConf(loadDefaults = false)
    val akkaUrl = Master.toAkkaUrl("spark://1.2.3.4:1234", "akka.tcp")
    assert("akka.tcp://sparkMaster@1.2.3.4:1234/user/Master" === akkaUrl)
  }

  test("toAkkaUrl with SSL") {
    val conf = new SparkConf(loadDefaults = false)
    val akkaUrl = Master.toAkkaUrl("spark://1.2.3.4:1234", "akka.ssl.tcp")
    assert("akka.ssl.tcp://sparkMaster@1.2.3.4:1234/user/Master" === akkaUrl)
  }

  test("toAkkaUrl: a typo url") {
    val conf = new SparkConf(loadDefaults = false)
    val e = intercept[SparkException] {
      Master.toAkkaUrl("spark://1.2. 3.4:1234", "akka.tcp")
    }
    assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage)
  }

  test("toAkkaAddress") {
    val conf = new SparkConf(loadDefaults = false)
    val address = Master.toAkkaAddress("spark://1.2.3.4:1234", "akka.tcp")
    assert(Address("akka.tcp", "sparkMaster", "1.2.3.4", 1234) === address)
  }

  test("toAkkaAddress with SSL") {
    val conf = new SparkConf(loadDefaults = false)
    val address = Master.toAkkaAddress("spark://1.2.3.4:1234", "akka.ssl.tcp")
    assert(Address("akka.ssl.tcp", "sparkMaster", "1.2.3.4", 1234) === address)
  }

  test("toAkkaAddress: a typo url") {
    val conf = new SparkConf(loadDefaults = false)
    val e = intercept[SparkException] {
      Master.toAkkaAddress("spark://1.2. 3.4:1234", "akka.tcp")
    }
    assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage)
  }
}
Example 79
Source File: KryoSerializerResizableOutputSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.scalatest.FunSuite import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends FunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer.mb", "1") conf.set("spark.kryoserializer.buffer.max.mb", "1") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer.mb", "1") conf.set("spark.kryoserializer.buffer.max.mb", "2") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 80
Source File: ProactiveClosureSerializationSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.scalatest.FunSuite import org.apache.spark.{SharedSparkContext, SparkException} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T) = x.toString def pred[T](x: T) = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContext { def fixture = (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y=>uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y=>Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y=>uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y=>uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y=>uc.op(y))) }
Example 81
Source File: CoarseGrainedSchedulerBackendSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkException, SparkContext} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} import org.scalatest.FunSuite class CoarseGrainedSchedulerBackendSuite extends FunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize","1") conf.set("spark.default.parallelism","1") sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 82
Source File: MutableURLClassLoaderSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.net.URLClassLoader import org.scalatest.FunSuite import org.apache.spark.{LocalSparkContext, SparkContext, SparkException, TestUtils} import org.apache.spark.util.Utils class MutableURLClassLoaderSuite extends FunSuite { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), toStringValue = "2")).toArray val urls = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1"), classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent toStringValue = "1", classpathUrls = urls2)).toArray test("child first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("parent first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new MutableURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("child first can fall back") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass3").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") } test("child first can fail") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) intercept[java.lang.ClassNotFoundException] { classLoader.loadClass("FakeClassDoesNotExist").newInstance() } } test("driver sets context class loader in local mode") { // Test the case where the driver program sets a context classloader and then runs a job // in local mode. This is what happens when ./spark-submit is called with "local" as the // master. val original = Thread.currentThread().getContextClassLoader val className = "ClassForDriverTest" val jar = TestUtils.createJarWithClasses(Seq(className)) val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader) Thread.currentThread().setContextClassLoader(contextLoader) val sc = new SparkContext("local", "driverLoaderTest") try { sc.makeRDD(1 to 5, 2).mapPartitions { x => val loader = Thread.currentThread().getContextClassLoader Class.forName(className, true, loader).newInstance() Seq().iterator }.count() } catch { case e: SparkException if e.getMessage.contains("ClassNotFoundException") => fail("Local executor could not find class", e) case t: Throwable => fail("Unexpected exception ", t) } sc.stop() Thread.currentThread().setContextClassLoader(original) } }
Example 83
Source File: Application.scala From ZparkIO with MIT License | 5 votes |
package com.leobenkel.zparkioProfileExampleMoreComplex import com.leobenkel.zparkio.Services._ import com.leobenkel.zparkio.ZparkioApp import com.leobenkel.zparkioProfileExampleMoreComplex.Application.APP_ENV import com.leobenkel.zparkioProfileExampleMoreComplex.Services.Database.Database import com.leobenkel.zparkioProfileExampleMoreComplex.Services.FileIO.FileIO import com.leobenkel.zparkioProfileExampleMoreComplex.Services._ import com.leobenkel.zparkioProfileExampleMoreComplex.Transformations.UserTransformations import izumi.reflect.Tag import org.apache.spark.SparkException import zio.{ZIO, ZLayer} trait Application extends ZparkioApp[Arguments, APP_ENV, Unit] { implicit lazy final override val tagC: Tag[Arguments] = Tag.tagFromTagMacro implicit lazy final override val tagEnv: Tag[APP_ENV] = Tag.tagFromTagMacro override protected def env: ZLayer[ZPARKIO_ENV, Throwable, APP_ENV] = FileIO.Live ++ Database.Live override protected def sparkFactory: FACTORY_SPARK = SparkBuilder override protected def loggerFactory: FACTORY_LOG = Logger.Factory(Log) override protected def makeCli(args: List[String]): Arguments = Arguments(args) override def runApp(): ZIO[COMPLETE_ENV, Throwable, Unit] = { for { _ <- Logger.info(s"--Start--") authors <- UserTransformations.getAuthors _ <- Logger.info(s"There are ${authors.count()} authors") } yield () } override def processErrors(f: Throwable): Option[Int] = { println(f) f.printStackTrace(System.out) f match { case _: SparkException => Some(10) case _: InterruptedException => Some(0) case _ => Some(1) } } } object Application { type APP_ENV = FileIO with Database }
Example 84
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.util.collection.OpenHashMap class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel] with StringIndexerBase { def this() = this(Identifiable.randomUID("strShortIdx")) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def fit(dataset: DataFrame): StringToShortIndexerModel = { val counts = dataset.select(col($(inputCol)).cast(StringType)) .map(_.getString(0)) .countByValue() val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") copyValues(new StringToShortIndexerModel(uid, labels).setParent(this)) } override def transformSchema(schema: StructType): StructType = { validateAndTransformSchema(schema) } override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra) } class StringToShortIndexerModel ( override val uid: String, val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase { def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels) require(labels.length <= Short.MaxValue, s"Unique labels count (${labels.length}) should be less then Short.MaxValue (${Short.MaxValue})") private val labelToIndex: OpenHashMap[String, Short] = { val n = labels.length.toShort val map = new OpenHashMap[String, Short](n) var i: Short = 0 while (i < n) { map.update(labels(i), i) i = (i + 1).toShort } map } def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: DataFrame): DataFrame = { if (!dataset.schema.fieldNames.contains($(inputCol))) { logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " + "Skip StringToShortIndexerModel.") return dataset } val indexer = udf { label: String => if (labelToIndex.contains(label)) { labelToIndex(label) } else { // TODO: handle unseen labels throw new SparkException(s"Unseen label: $label.") } } val outputColName = $(outputCol) val metadata = NominalAttribute.defaultAttr .withName(outputColName).withValues(labels).toMetadata() dataset.select(col("*"), indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata)) } override def transformSchema(schema: StructType): StructType = { if (schema.fieldNames.contains($(inputCol))) { validateAndTransformSchema(schema) } else { // If the input column does not exist during transformation, we skip StringToShortIndexerModel. schema } } override def copy(extra: ParamMap): StringToShortIndexerModel = { val copied = new StringToShortIndexerModel(uid, labels) copyValues(copied, extra).setParent(parent) } }
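A hedged usage sketch for the estimator above, assuming a Spark 1.x SQLContext named sqlContext and illustrative column names: fit on a string column, then append the Short-valued index column described by the attached NominalAttribute metadata.

import org.apache.spark.ml.feature.StringToShortIndexer

val df = sqlContext.createDataFrame(Seq(
  (0, "a"), (1, "b"), (2, "a"), (3, "c"))).toDF("id", "category")

val indexer = new StringToShortIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val model = indexer.fit(df)   // labels are ordered by descending frequency
model.transform(df).show()    // labels unseen at fit time throw SparkException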
Example 85
Source File: DruidQueriesTab.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.sparklinedata.ui import org.apache.spark.sql.hive.thriftserver.sparklinedata.ui.DruidQueriesTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.sql.SPLLogging private[thriftserver] class DruidQueriesTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "druid") with SPLLogging { override val name = "Druid Query Details" val parent = getSparkUI(sparkContext) attachPage(new DruidQueriesPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[spark] object DruidQueriesTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 86
Source File: PowerBiSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import java.io.File import com.microsoft.ml.spark.Secrets import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.io.powerbi.PowerBIWriter import org.apache.spark.SparkException import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.{current_timestamp, lit} import scala.collection.JavaConverters._ class PowerBiSuite extends TestBase with FileReaderUtils { lazy val url: String = sys.env.getOrElse("MML_POWERBI_URL", Secrets.PowerbiURL) lazy val df: DataFrame = session .createDataFrame(Seq( (Some(0), "a"), (Some(1), "b"), (Some(2), "c"), (Some(3), ""), (None, "bad_row"))) .toDF("bar", "foo") .withColumn("baz", current_timestamp()) lazy val bigdf: DataFrame = (1 to 5).foldRight(df) { case (_, ldf) => ldf.union(df) }.repartition(2) lazy val delayDF: DataFrame = { val rows = Array.fill(100){df.collect()}.flatten.toList.asJava val df2 = session .createDataFrame(rows, df.schema) .coalesce(1).cache() df2.count() df2.map({x => Thread.sleep(10); x})(RowEncoder(df2.schema)) } test("write to powerBi", TestBase.BuildServer) { PowerBIWriter.write(df, url) } test("write to powerBi with delays"){ PowerBIWriter.write(delayDF, url) } test("using dynamic minibatching"){ PowerBIWriter.write(delayDF, url, Map("minibatcher"->"dynamic", "maxBatchSize"->"50")) } test("using timed minibatching"){ PowerBIWriter.write(delayDF, url, Map("minibatcher"->"timed")) } test("using consolidated timed minibatching"){ PowerBIWriter.write(delayDF, url, Map( "minibatcher"->"timed", "consolidate"->"true")) } test("using buffered batching"){ PowerBIWriter.write(delayDF, url, Map("buffered"->"true")) } ignore("throw useful error message when given an improper dataset") { //TODO figure out why this does not throw errors on the build machine assertThrows[SparkException] { PowerBIWriter.write(df.withColumn("bad", lit("foo")), url) } } test("stream to powerBi", TestBase.BuildServer) { bigdf.write.parquet(tmpDir + File.separator + "powerBI.parquet") val sdf = session.readStream.schema(df.schema).parquet(tmpDir + File.separator + "powerBI.parquet") val q1 = PowerBIWriter.stream(sdf, url).start() q1.processAllAvailable() } }
Example 87
Source File: MesosClusterManager.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.config._ import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class MesosClusterManager extends ExternalClusterManager { private val MESOS_REGEX = """mesos://(.*)""".r override def canCreate(masterURL: String): Boolean = { masterURL.startsWith("mesos") } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { new TaskSchedulerImpl(sc) } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { require(!sc.conf.get(IO_ENCRYPTION_ENABLED), "I/O encryption is currently not supported in Mesos.") val mesosUrl = MESOS_REGEX.findFirstMatchIn(masterURL).get.group(1) val coarse = sc.conf.getBoolean("spark.mesos.coarse", defaultValue = true) if (coarse) { new MesosCoarseGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl, sc.env.securityManager) } else { new MesosFineGrainedSchedulerBackend( scheduler.asInstanceOf[TaskSchedulerImpl], sc, mesosUrl) } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 88
Source File: RWrappers.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 89
Source File: NumericParser.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util

import java.util.StringTokenizer

import scala.collection.mutable.{ArrayBuilder, ListBuffer}

import org.apache.spark.SparkException

private[mllib] object NumericParser {

  def parse(s: String): Any = {
    val tokenizer = new StringTokenizer(s, "()[],", true)
    if (tokenizer.hasMoreTokens()) {
      val token = tokenizer.nextToken()
      if (token == "(") {
        parseTuple(tokenizer)
      } else if (token == "[") {
        parseArray(tokenizer)
      } else {
        // expecting a number
        parseDouble(token)
      }
    } else {
      throw new SparkException(s"Cannot find any token from the input string.")
    }
  }

  private def parseArray(tokenizer: StringTokenizer): Array[Double] = {
    val values = ArrayBuilder.make[Double]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "]") {
        parsing = false
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else {
        // expecting a number
        values += parseDouble(token)
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"An array must end with ']'.")
    }
    values.result()
  }

  private def parseTuple(tokenizer: StringTokenizer): Seq[_] = {
    val items = ListBuffer.empty[Any]
    var parsing = true
    var allowComma = false
    var token: String = null
    while (parsing && tokenizer.hasMoreTokens()) {
      token = tokenizer.nextToken()
      if (token == "(") {
        items.append(parseTuple(tokenizer))
        allowComma = true
      } else if (token == "[") {
        items.append(parseArray(tokenizer))
        allowComma = true
      } else if (token == ",") {
        if (allowComma) {
          allowComma = false
        } else {
          throw new SparkException("Found a ',' at a wrong position.")
        }
      } else if (token == ")") {
        parsing = false
      } else if (token.trim.isEmpty) {
        // ignore whitespaces between delim chars, e.g. ", ["
      } else {
        // expecting a number
        items.append(parseDouble(token))
        allowComma = true
      }
    }
    if (parsing) {
      throw new SparkException(s"A tuple must end with ')'.")
    }
    items
  }

  private def parseDouble(s: String): Double = {
    try {
      java.lang.Double.parseDouble(s)
    } catch {
      case e: NumberFormatException =>
        throw new SparkException(s"Cannot parse a double from: $s", e)
    }
  }
}
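NumericParser is private[mllib], so the sketch below assumes it runs from code compiled inside an org.apache.spark.mllib package; the input string is the same textual form used for MLlib vectors and labeled points:

val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]          // 1.0
val values = parsed(1).asInstanceOf[Array[Double]]  // Array(2.0, 3.0)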
Example 90
Source File: LabeledPoint.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

@Since("0.8.0")
@BeanInfo
case class LabeledPoint @Since("1.0.0") (
    @Since("0.8.0") label: Double,
    @Since("1.0.0") features: Vector) {

  override def toString: String = {
    s"($label,$features)"
  }

  private[spark] def asML: NewLabeledPoint = {
    NewLabeledPoint(label, features)
  }
}

@Since("1.1.0")
object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
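Both textual formats handled by parse in one short sketch (the values are illustrative):

import org.apache.spark.mllib.regression.LabeledPoint

val v1 = LabeledPoint.parse("(1.0,[0.5,0.0,2.0])")  // tuple form, routed through NumericParser
val v0 = LabeledPoint.parse("0.0, 1.1 2.2 3.3")     // dense format used before v1.0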
Example 91
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 92
Source File: NumericParserSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 93
Source File: CommitFailureTestRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 94
Source File: ThriftServerTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.monitor.ThriftServerMonitor import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(userName: String, sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) // ThriftServerTab renders by different listener's content, identified by user. val listener = ThriftServerMonitor.getListener(userName) attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 95
Source File: ThriftServerMonitor.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.monitor import scala.collection.mutable.HashMap import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab object ThriftServerMonitor extends Logging { private[this] val uiTabs = new HashMap[String, ThriftServerTab]() private[this] val listeners = new HashMap[String, ThriftServerListener]() def setListener(user: String, sparkListener: ThriftServerListener): Unit = { listeners.put(user, sparkListener) } def getListener(user: String): ThriftServerListener = { listeners.getOrElse(user, throw new SparkException(s"Listener does not init for user[$user]")) } def addUITab(user: String, ui: ThriftServerTab): Unit = { uiTabs.put(user, ui) } def detachUITab(user: String): Unit = { listeners.remove(user) uiTabs.get(user).foreach(_.detach()) } def detachAllUITabs(): Unit = { uiTabs.values.foreach(_.detach()) } }
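The registry is keyed by user name, so a session setup path would roughly do the following (the user name is illustrative, and `listener` is assumed to be a ThriftServerListener created elsewhere):

ThriftServerMonitor.setListener("alice", listener)      // register when alice's session starts
val perUser = ThriftServerMonitor.getListener("alice")  // later lookups; throws SparkException if never set
ThriftServerMonitor.detachUITab("alice")                // drop the listener and detach the UI tab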
Example 96
Source File: UDTRegistration.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types

import scala.collection.mutable

import org.apache.spark.SparkException
import org.apache.spark.internal.Logging
import org.apache.spark.util.Utils

private[spark] object UDTRegistration extends Serializable with Logging {

  // Default mappings from user classes to their UDTs (the ml.linalg types
  // exercised in UDTRegistrationSuite below).
  private lazy val udtMap: mutable.Map[String, String] = mutable.Map(
    ("org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.VectorUDT"),
    ("org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.MatrixUDT"),
    ("org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.MatrixUDT"),
    ("org.apache.spark.ml.linalg.SparseMatrix", "org.apache.spark.ml.linalg.MatrixUDT"))

  def exists(userClassName: String): Boolean = udtMap.contains(userClassName)

  def register(userClass: String, udtClass: String): Unit = {
    if (udtMap.contains(userClass)) {
      logWarning(s"Cannot register UDT for ${userClass}, which is already registered.")
    } else {
      // Registration works on class names only, so whether udtClass really is a
      // UserDefinedType is checked later, in getUDTFor.
      udtMap += ((userClass, udtClass))
    }
  }

  def getUDTFor(userClass: String): Option[Class[_]] = {
    udtMap.get(userClass).map { udtClassName =>
      if (Utils.classIsLoadable(udtClassName)) {
        val udtClass = Utils.classForName(udtClassName)
        if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) {
          udtClass
        } else {
          throw new SparkException(
            s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " +
              s"an UserDefinedType for ${userClass}")
        }
      } else {
        throw new SparkException(
          s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.")
      }
    }
  }
}
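Because the registry maps plain class names, registration is a one-liner; a sketch with hypothetical class names (the UDT class is only validated later, inside getUDTFor):

UDTRegistration.register("com.example.GeoPoint", "com.example.GeoPointUDT")
assert(UDTRegistration.exists("com.example.GeoPoint"))
// getUDTFor throws SparkException if com.example.GeoPointUDT is not loadable
// or does not extend UserDefinedType.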
Example 97
Source File: ScalaUDFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase, StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } }
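The NPE wrapped by the "Failed to execute user defined function" message comes from calling toLowerCase on a null input; a small sketch of the null-safe version of the same UDF:

import org.apache.spark.sql.functions.udf

// Returns null for null input instead of throwing, so the wrapped
// SparkException from the test above never occurs.
val safeLower = udf((s: String) => Option(s).map(_.toLowerCase).orNull)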
Example 98
Source File: UDTRegistrationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 99
Source File: YarnClusterManager.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 100
Source File: HDFSCredentialProviderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn.security

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.scalatest.{Matchers, PrivateMethodTester}

import org.apache.spark.{SparkConf, SparkException, SparkFunSuite}

class HDFSCredentialProviderSuite extends SparkFunSuite with PrivateMethodTester with Matchers {
  private val _getTokenRenewer = PrivateMethod[String]('getTokenRenewer)

  private def getTokenRenewer(
      hdfsCredentialProvider: HDFSCredentialProvider, conf: Configuration): String = {
    hdfsCredentialProvider invokePrivate _getTokenRenewer(conf)
  }

  private var hdfsCredentialProvider: HDFSCredentialProvider = null

  override def beforeAll() {
    super.beforeAll()
    if (hdfsCredentialProvider == null) {
      hdfsCredentialProvider = new HDFSCredentialProvider()
    }
  }

  override def afterAll() {
    if (hdfsCredentialProvider != null) {
      hdfsCredentialProvider = null
    }
    super.afterAll()
  }

  test("check token renewer") {
    val hadoopConf = new Configuration()
    hadoopConf.set("yarn.resourcemanager.address", "myrm:8033")
    hadoopConf.set("yarn.resourcemanager.principal", "yarn/myrm:8032@SPARKTEST.COM")
    val renewer = getTokenRenewer(hdfsCredentialProvider, hadoopConf)
    renewer should be ("yarn/myrm:8032@SPARKTEST.COM")
  }

  test("check token renewer default") {
    val hadoopConf = new Configuration()
    val caught = intercept[SparkException] {
      getTokenRenewer(hdfsCredentialProvider, hadoopConf)
    }
    assert(caught.getMessage === "Can't get Master Kerberos principal for use as renewer")
  }
}
Example 101
Source File: UnionDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 102
Source File: TransformedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 103
Source File: StreamingTab.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 104
Source File: RpcEndpointAddress.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
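RpcEndpointAddress is private[spark], so the sketch below reproduces only its URL validation with plain java.net.URI; it is an approximation for illustration, not the Spark implementation.

import java.net.{URI, URISyntaxException}

// A Spark RPC URL must look like spark://name@host:port with no path, query or fragment.
def isValidSparkUrl(sparkUrl: String): Boolean =
  try {
    val uri = new URI(sparkUrl)
    uri.getScheme == "spark" && uri.getHost != null && uri.getPort >= 0 &&
      uri.getUserInfo != null && Option(uri.getPath).forall(_.isEmpty) &&
      uri.getFragment == null && uri.getQuery == null
  } catch {
    case _: URISyntaxException => false
  }
// isValidSparkUrl("spark://driver@192.168.0.1:7077")  // true
// isValidSparkUrl("spark://192.168.0.1:7077")         // false: endpoint name missing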
Example 105
Source File: RUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.hadoop.security.UserGroupInformation import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 106
Source File: RpcAddressSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 107
Source File: KryoSerializerResizableOutputSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
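The failing case above is reproduced by capping spark.kryoserializer.buffer.max at the initial buffer size. A minimal configuration sketch of the fix; the sizes are illustrative, not recommendations:

import org.apache.spark.SparkConf

val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "64k")     // initial per-task buffer
  .set("spark.kryoserializer.buffer.max", "64m") // must exceed the largest serialized object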
Example 108
Source File: ProactiveClosureSerializationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
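The suite relies on the closure cleaner proactively raising a "Task not serializable" SparkException. A sketch of the two usual remedies, assuming the same UnserializableClass shape as above:

import org.apache.spark.rdd.RDD

// Remedy 1: make the helper Serializable so the captured reference can ship to executors.
class SerializableOp extends Serializable {
  def op[T](x: T): String = x.toString
}

// Remedy 2: capture only what the closure needs in a local val, not the enclosing instance.
def fixedMap(data: RDD[String]): RDD[String] = {
  val op = new SerializableOp
  data.map(x => op.op(x))
}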
Example 109
Source File: CoarseGrainedSchedulerBackendSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
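The exception message checked above points at the standard remedy: ship large read-only values as broadcast variables instead of capturing them in the task closure. A short sketch, with names chosen for illustration:

import org.apache.spark.SparkContext

def withBroadcast(sc: SparkContext, big: Array[Byte]): Long = {
  val bc = sc.broadcast(big)                        // sent once per executor, not per task
  sc.parallelize(1 to 4).map(_ => bc.value.length).count()
}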
Example 110
Source File: NumericParser.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
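NumericParser is private[mllib], so the usage sketch below only compiles from inside the org.apache.spark.mllib package; malformed input surfaces as the SparkException shown above.

// "(label,[values])" parses into a Seq whose elements are Doubles and Array[Double]s.
val parsed = NumericParser.parse("(1.0,[2.0,3.0])").asInstanceOf[Seq[_]]
val label = parsed(0).asInstanceOf[Double]          // 1.0
val values = parsed(1).asInstanceOf[Array[Double]]  // Array(2.0, 3.0)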
Example 111
Source File: LabeledPoint.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @BeanInfo case class LabeledPoint(label: Double, features: Vector) object LabeledPoint { def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
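A short usage sketch of the public LabeledPoint.parse API: both the tuple form and the pre-1.0 dense form are accepted; anything else ends in the SparkException above.

import org.apache.spark.mllib.regression.LabeledPoint

val p1 = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")  // label 1.0, dense features [1.0, 0.0, 3.0]
val p2 = LabeledPoint.parse("1.0, 2.5 3.0 4.5")     // legacy dense format used before v1.0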
Example 112
Source File: VectorAssemblerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.attribute.{AttributeGroup, NominalAttribute, NumericAttribute} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col class VectorAssemblerSuite extends SparkFunSuite with MLlibTestSparkContext { test("params") { ParamsSuite.checkParams(new VectorAssembler) } test("assemble") { import org.apache.spark.ml.feature.VectorAssembler.assemble assert(assemble(0.0) === Vectors.sparse(1, Array.empty, Array.empty)) assert(assemble(0.0, 1.0) === Vectors.sparse(2, Array(1), Array(1.0))) val dv = Vectors.dense(2.0, 0.0) assert(assemble(0.0, dv, 1.0) === Vectors.sparse(4, Array(1, 3), Array(2.0, 1.0))) val sv = Vectors.sparse(2, Array(0, 1), Array(3.0, 4.0)) assert(assemble(0.0, dv, 1.0, sv) === Vectors.sparse(6, Array(1, 3, 4, 5), Array(2.0, 1.0, 3.0, 4.0))) for (v <- Seq(1, "a", null)) { intercept[SparkException](assemble(v)) intercept[SparkException](assemble(1.0, v)) } } test("assemble should compress vectors") { import org.apache.spark.ml.feature.VectorAssembler.assemble val v1 = assemble(0.0, 0.0, 0.0, Vectors.dense(4.0)) assert(v1.isInstanceOf[SparseVector]) val v2 = assemble(1.0, 2.0, 3.0, Vectors.sparse(1, Array(0), Array(4.0))) assert(v2.isInstanceOf[DenseVector]) } test("VectorAssembler") { val df = sqlContext.createDataFrame(Seq( (0, 0.0, Vectors.dense(1.0, 2.0), "a", Vectors.sparse(2, Array(1), Array(3.0)), 10L) )).toDF("id", "x", "y", "name", "z", "n") val assembler = new VectorAssembler() .setInputCols(Array("x", "y", "z", "n")) .setOutputCol("features") assembler.transform(df).select("features").collect().foreach { case Row(v: Vector) => assert(v === Vectors.sparse(6, Array(1, 2, 4, 5), Array(1.0, 2.0, 3.0, 10.0))) } } test("ML attributes") { val browser = NominalAttribute.defaultAttr.withValues("chrome", "firefox", "safari") val hour = NumericAttribute.defaultAttr.withMin(0.0).withMax(24.0) val user = new AttributeGroup("user", Array( NominalAttribute.defaultAttr.withName("gender").withValues("male", "female"), NumericAttribute.defaultAttr.withName("salary"))) val row = (1.0, 0.5, 1, Vectors.dense(1.0, 1000.0), Vectors.sparse(2, Array(1), Array(2.0))) val df = sqlContext.createDataFrame(Seq(row)).toDF("browser", "hour", "count", "user", "ad") .select( col("browser").as("browser", browser.toMetadata()), col("hour").as("hour", hour.toMetadata()), col("count"), // "count" is an integer column without ML attribute col("user").as("user", user.toMetadata()), col("ad")) // "ad" is a vector column without ML attribute val assembler = new VectorAssembler() .setInputCols(Array("browser", "hour", "count", "user", "ad")) .setOutputCol("features") val output = assembler.transform(df) val schema = output.schema val features = AttributeGroup.fromStructField(schema("features")) assert(features.size === 7) val browserOut = features.getAttr(0) assert(browserOut === browser.withIndex(0).withName("browser")) val hourOut = features.getAttr(1) assert(hourOut === hour.withIndex(1).withName("hour")) val countOut = features.getAttr(2) assert(countOut === NumericAttribute.defaultAttr.withName("count").withIndex(2)) val userGenderOut = features.getAttr(3) assert(userGenderOut === user.getAttr("gender").withName("user_gender").withIndex(3)) val userSalaryOut = features.getAttr(4) 
assert(userSalaryOut === user.getAttr("salary").withName("user_salary").withIndex(4)) assert(features.getAttr(5) === NumericAttribute.defaultAttr.withIndex(5)) assert(features.getAttr(6) === NumericAttribute.defaultAttr.withIndex(6)) } }
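A minimal VectorAssembler sketch mirroring the suite, assuming an existing SQLContext named sqlContext; non-numeric or null inputs fail at assembly time with a SparkException, which is what the intercepts above check.

import org.apache.spark.ml.feature.VectorAssembler

val df = sqlContext.createDataFrame(Seq((0, 1.0, 3.0))).toDF("id", "x", "y")
val assembler = new VectorAssembler()
  .setInputCols(Array("x", "y"))
  .setOutputCol("features")
assembler.transform(df).select("features").show()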
Example 113
Source File: NumericParserSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) println(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 114
Source File: ThriftServerTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sql") with Logging { override val name = "SQL" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 115
Source File: StreamingTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{JettyUtils, SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) var staticHandler: ServletContextHandler = null def attach() { getSparkUI(ssc).attachTab(this) staticHandler = JettyUtils.createStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") getSparkUI(ssc).attachHandler(staticHandler) } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).detachHandler(staticHandler) staticHandler = null } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 116
Source File: KryoSerializerResizableOutputSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
Example 117
Source File: ProactiveClosureSerializationSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 118
Source File: CoarseGrainedSchedulerBackendSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2 , 1 , 512]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 119
Source File: MutableURLClassLoaderSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.net.URLClassLoader import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils} class MutableURLClassLoaderSuite extends SparkFunSuite { val urls2 = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"), toStringValue = "2")).toArray val urls = List(TestUtils.createJarWithClasses( classNames = Seq("FakeClass1"), classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent toStringValue = "1", classpathUrls = urls2)).toArray test("child first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass2").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "1") val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("parent first") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new MutableURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass1").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance() assert(fakeClass.getClass === fakeClass2.getClass) } test("child first can fall back") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) val fakeClass = classLoader.loadClass("FakeClass3").newInstance() val fakeClassVersion = fakeClass.toString assert(fakeClassVersion === "2") } test("child first can fail") { val parentLoader = new URLClassLoader(urls2, null) val classLoader = new ChildFirstURLClassLoader(urls, parentLoader) intercept[java.lang.ClassNotFoundException] { classLoader.loadClass("FakeClassDoesNotExist").newInstance() } } test("driver sets context class loader in local mode") { // Test the case where the driver program sets a context classloader and then runs a job // in local mode. This is what happens when ./spark-submit is called with "local" as the // master. val original = Thread.currentThread().getContextClassLoader val className = "ClassForDriverTest" val jar = TestUtils.createJarWithClasses(Seq(className)) val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader) Thread.currentThread().setContextClassLoader(contextLoader) val sc = new SparkContext("local", "driverLoaderTest") try { sc.makeRDD(1 to 5, 2).mapPartitions { x => val loader = Thread.currentThread().getContextClassLoader Class.forName(className, true, loader).newInstance() Seq().iterator }.count() } catch { case e: SparkException if e.getMessage.contains("ClassNotFoundException") => fail("Local executor could not find class", e) case t: Throwable => fail("Unexpected exception ", t) } sc.stop() Thread.currentThread().setContextClassLoader(original) } }
Example 120
Source File: DataFrameToFileWriter.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import ai.deepsense.commons.utils.LoggerForCallerClass import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.WriteFileException import ai.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import ai.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import ai.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import ai.deepsense.deeplang.exceptions.DeepLangException import ai.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
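The writer above translates executor-side failures (surfaced on the driver as SparkException) into a domain exception. A library-agnostic sketch of the same pattern with plain Spark SQL APIs; the path handling and wrapper exception are illustrative:

import org.apache.spark.SparkException
import org.apache.spark.sql.{DataFrame, SaveMode}

def writeCsvOrFail(df: DataFrame, path: String): Unit =
  try {
    df.write.mode(SaveMode.Overwrite).option("header", "true").csv(path)
  } catch {
    case e: SparkException => throw new RuntimeException(s"Unable to write file to $path", e)
  }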
Example 121
Source File: CSVToAvroTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.io.csv import com.salesforce.op.test.{Passenger, TestSparkContext} import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CSVToAvroTest extends FlatSpec with TestSparkContext { val avroSchema: String = loadFile(s"$resourceDir/PassengerSchemaModifiedDataTypes.avsc") val csvReader: CSVInOut = new CSVInOut(CSVOptions(header = true)) lazy val csvRDD: RDD[Seq[String]] = csvReader.readRDD(s"$resourceDir/PassengerDataModifiedDataTypes.csv") lazy val csvFileRecordCount: Long = csvRDD.count Spec(CSVToAvro.getClass) should "convert RDD[Seq[String]] to RDD[GenericRecord]" in { val res = CSVToAvro.toAvro(csvRDD, avroSchema) res shouldBe a[RDD[_]] res.count shouldBe csvFileRecordCount } it should "convert RDD[Seq[String]] to RDD[T]" in { val res = CSVToAvro.toAvroTyped[Passenger](csvRDD, avroSchema) res shouldBe a[RDD[_]] res.count shouldBe csvFileRecordCount } it should "throw an error for nested schema" in { val invalidAvroSchema = loadFile(s"$resourceDir/PassengerSchemaNestedTypeCSV.avsc") val exceptionMsg = "CSV should be a flat file and not have nested records (unsupported column(Sex schemaType=ENUM)" val error = intercept[SparkException](CSVToAvro.toAvro(csvRDD, invalidAvroSchema).count()) error.getCause.getMessage shouldBe exceptionMsg } it should "throw an error for mis-matching schema fields" in { val invalidAvroSchema = loadFile(s"$resourceDir/PassengerSchemaInvalidField.avsc") val error = intercept[SparkException](CSVToAvro.toAvro(csvRDD, invalidAvroSchema).count()) error.getCause.getMessage shouldBe "Mismatch number of fields in csv record and avro schema" } it should "throw an error for bad data" in { val invalidDataRDD = csvReader.readRDD(s"$resourceDir/PassengerDataContentTypeMisMatch.csv") val error = intercept[SparkException](CSVToAvro.toAvro(invalidDataRDD, avroSchema).count()) error.getCause.getMessage shouldBe "Boolean column not actually a boolean. Invalid value: 'fail'" } }
Example 122
Source File: PredictionDescalerTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.SparkException import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) class PredictionDescalerTransformerTest extends OpTransformerSpec[Real, PredictionDescaler[Real, Real]] { val predictionData = Seq(-1.0, 0.0, 1.0, 2.0).map(Prediction(_)) val featureData = Seq(0.0, 1.0, 2.0, 3.0).map(_.toReal) val (testData, p, f1) = TestFeatureBuilder[Prediction, Real](predictionData zip featureData) val scalerMetadata = ScalerMetadata(ScalingType.Linear, LinearScalerArgs(slope = 4.0, intercept = 1.0)).toMetadata() val colWithMetadata = testData.col(f1.name).as(f1.name, scalerMetadata) val inputData = testData.withColumn(f1.name, colWithMetadata) val transformer = new PredictionDescaler[Real, Real]().setInput(p, f1) val expectedResult: Seq[Real] = Seq(-0.5, -0.25, 0.0, 0.25).map(_.toReal) it should "error on missing scaler metadata" in { val (df, p, f1) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(Prediction(_)) zip Seq(0.0, 0.0, 0.0).map(Real(_))) val error = intercept[SparkException]( new PredictionDescaler[Real, Real]().setInput(p, f1).transform(df).collect() ) error.getCause should not be null error.getCause shouldBe a[RuntimeException] error.getCause.getMessage shouldBe s"Failed to extract scaler metadata for input feature '${f1.name}'" } it should "descale and serialize log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Prediction](v => Prediction(v.value.getOrElse(Double.NaN) + 1), operationName = "shift") val descaledPrediction = new PredictionDescaler[Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledPrediction) val wfModel = workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(0.0, 1.0, 2.0, 3.0).map(_ * math.E) all(actual.zip(expected).map(x => math.abs(x._2 - x._1))) should be < 0.0001 } it should "descale and serialize linear-scaling workflow" in { val scalingArgs = LinearScalerArgs(slope = 2.0, intercept = 0.0) val linearScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Linear, scalingArgs = scalingArgs ).setInput(f1) val scaledResponse = linearScaler.getOutput() val metadata = linearScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Linear, scalingArgs) } val shifted = scaledResponse.map[Prediction](v => Prediction(v.value.getOrElse(Double.NaN) + 1), operationName = "shift") val descaledPrediction = new PredictionDescaler[Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledPrediction) val wfModel = 
workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(0.5, 1.5, 2.5, 3.5) actual shouldBe expected } it should "work with its shortcut" in { val descaled = p.descale[Real, Real](f1) val transformed = descaled.originStage.asInstanceOf[PredictionDescaler[Real, Real]].transform(inputData) val actual = transformed.collect(descaled) actual shouldEqual expectedResult.toArray } }
Example 123
Source File: DescalerTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.OpWorkflow import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.SparkException import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import scala.util.{Failure, Success} @RunWith(classOf[JUnitRunner]) class DescalerTransformerTest extends OpTransformerSpec[Real, DescalerTransformer[Real, Real, Real]] { val (testData, f1) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(_.toReal)) val scalerMetadata = ScalerMetadata(ScalingType.Linear, LinearScalerArgs(slope = 2.0, intercept = 3.0)).toMetadata() val colWithMetadata = testData.col(f1.name).as(f1.name, scalerMetadata) val inputData = testData.withColumn(f1.name, colWithMetadata) val transformer = new DescalerTransformer[Real, Real, Real]().setInput(f1, f1) val expectedResult: Seq[Real] = Seq(0.5, -1.0, -1.5).map(_.toReal) it should "error on missing scaler metadata" in { val (df, f) = TestFeatureBuilder(Seq(4.0, 1.0, 0.0).map(_.toReal)) val error = intercept[SparkException]( new DescalerTransformer[Real, Real, Real]().setInput(f, f).transform(df).collect() ) error.getCause should not be null error.getCause shouldBe a[RuntimeException] error.getCause.getMessage shouldBe s"Failed to extract scaler metadata for input feature '${f1.name}'" } it should "descale and work in log-scaling workflow" in { val logScaler = new ScalerTransformer[Real, Real]( scalingType = ScalingType.Logarithmic, scalingArgs = EmptyScalerArgs() ).setInput(f1) val scaledResponse = logScaler.getOutput() val metadata = logScaler.transform(inputData).schema(scaledResponse.name).metadata ScalerMetadata(metadata) match { case Failure(err) => fail(err) case Success(meta) => meta shouldBe ScalerMetadata(ScalingType.Logarithmic, EmptyScalerArgs()) } val shifted = scaledResponse.map[Real](v => v.value.map(_ + 1).toReal, operationName = "shift") val descaledResponse = new DescalerTransformer[Real, Real, Real]().setInput(shifted, scaledResponse).getOutput() val workflow = new OpWorkflow().setResultFeatures(descaledResponse) val wfModel = workflow.setInputDataset(inputData).train() val transformed = wfModel.score() val actual = transformed.collect().map(_.getAs[Double](1)) val expected = Array(4.0, 1.0, 0.0).map(_ * math.E) all(actual.zip(expected).map(x => math.abs(x._2 - x._1))) should be < 0.0001 } it should "work with its shortcut" in { val descaled = f1.descale[Real, Real](f1) val transformed = descaled.originStage.asInstanceOf[DescalerTransformer[Real, Real, Real]].transform(inputData) val actual = transformed.collect(descaled) actual shouldEqual expectedResult.toArray } }
Example 124
Source File: KyuubiSessionTab.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.ui.KyuubiSessionTab._ import yaooqinn.kyuubi.ui.{KyuubiServerListener, KyuubiServerMonitor} class KyuubiSessionTab(userName: String, sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), userName) { override val name = s"Kyuubi Tab 4 $userName" val parent = getSparkUI(sparkContext) // KyuubiSessionTab renders by different listener's content, identified by user. val listener = KyuubiServerMonitor.getListener(userName).getOrElse { val lr = new KyuubiServerListener(sparkContext.conf) KyuubiServerMonitor.setListener(userName, lr) lr } attachPage(new KyuubiSessionPage(this)) attachPage(new KyuubiSessionSubPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } object KyuubiSessionTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 125
Source File: SparkContextReflectionSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi import org.apache.spark.{SparkConf, SparkContext, SparkException, SparkFunSuite} import yaooqinn.kyuubi.utils.ReflectUtils class SparkContextReflectionSuite extends SparkFunSuite { test("SparkContext initialization with default constructor") { val conf = new SparkConf(loadDefaults = true).setMaster("local").setAppName("sc_init") val sc = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf)) .asInstanceOf[SparkContext] assert(sc.isInstanceOf[SparkContext]) sc.stop() } test("SparkContext initialization with this()") { intercept[SparkException](ReflectUtils .instantiateClassByName(classOf[SparkContext].getName) .asInstanceOf[SparkContext]) } test("SparkContext initialization with app name & master & conf") { val conf = new SparkConf(loadDefaults = true) val sc = ReflectUtils .newInstance( classOf[SparkContext].getName, Seq(classOf[String], classOf[String], classOf[SparkConf]), Seq("local", "sc_init", conf)) .asInstanceOf[SparkContext] assert(sc.isInstanceOf[SparkContext]) sc.stop() } test("Initializing 2 SparkContext with Reflection") { val conf1 = new SparkConf(loadDefaults = true) .setMaster("local").setAppName("sc1").set("spark.driver.allowMultipleContexts", "true") val sc1 = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf1)) .asInstanceOf[SparkContext] val conf2 = new SparkConf(loadDefaults = true) .setMaster("local").setAppName("sc2").set("spark.driver.allowMultipleContexts", "true") val sc2 = ReflectUtils .newInstance(classOf[SparkContext].getName, Seq(classOf[SparkConf]), Seq(conf2)) .asInstanceOf[SparkContext] assert(sc1 !== sc2) sc1.stop() sc2.stop() } }
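ReflectUtils is Kyuubi-specific; the calls above reduce to plain Java reflection, roughly as sketched below. Invoking the no-arg constructor fails because no master or app name is configured, which is the wrapped SparkException the second test expects.

import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf().setMaster("local").setAppName("reflect-demo")
val sc = classOf[SparkContext]
  .getConstructor(classOf[SparkConf])
  .newInstance(conf)
sc.stop()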
Example 126
Source File: NumericParser.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 127
Source File: LabeledPoint.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @BeanInfo case class LabeledPoint(label: Double, features: Vector) object LabeledPoint { @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 128
Source File: NumericParserSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { // parse val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { // parsing with whitespace val s = "(0.0, [1.0, 2.0])" // numeric parsing val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 129
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils { override def _sqlContext: SQLContext = TestHive private val sqlContext = _sqlContext // When committing a task, `CommitFailureTestSource` throws an exception for testing purposes. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName // commitTask() failure should fall back to abortTask() test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce the partition number to 1 to ensure that only a single task is issued. This // prevents a race condition that happens when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 130
Source File: ThriftServerTab.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 131
Source File: StreamingTab.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 132
Source File: RUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import scala.collection.JavaConversions._ import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Seq("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 133
Source File: LocalRDDCheckpointData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark.{Logging, SparkEnv, SparkException, TaskContext} import org.apache.spark.storage.{RDDBlockId, StorageLevel} import org.apache.spark.util.Utils private[spark] object LocalRDDCheckpointData { def transformStorageLevel(level: StorageLevel): StorageLevel = { // If this RDD is to be cached off-heap, fail fast since we cannot provide any // correctness guarantees about subsequent computations after the first one if (level.useOffHeap) { throw new SparkException("Local checkpointing is not compatible with off-heap caching.") } StorageLevel(useDisk = true, level.useMemory, level.deserialized, level.replication) } }
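Local checkpointing is driven from the public RDD API; the helper above rewrites the storage level and rejects off-heap caching with a SparkException. A minimal sketch, assuming an existing SparkContext:

import org.apache.spark.SparkContext

def localCheckpointExample(sc: SparkContext): Long = {
  val rdd = sc.parallelize(1 to 1000)
  rdd.localCheckpoint() // marked now, materialized on the first action; off-heap levels are rejected
  rdd.count()
}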
Example 134
Source File: ProactiveClosureSerializationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
Example 135
Source File: StarryClosureCleaner.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.util import java.util import org.apache.spark.internal.Logging import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable object StarryClosureCleaner extends Logging { val serializableMap: LRUCache[String, Boolean] = new LRUCache[String, Boolean](100000) // Check whether a class represents a Scala closure private def isClosure(cls: Class[_]): Boolean = { cls.getName.contains("$anonfun$") } def clean( closure: AnyRef, checkSerializable: Boolean = true, cleanTransitively: Boolean = true): Unit = { clean(closure, checkSerializable, cleanTransitively, mutable.Map.empty) } private def clean( func: AnyRef, checkSerializable: Boolean, cleanTransitively: Boolean, accessedFields: mutable.Map[Class[_], mutable.Set[String]]): Unit = { if (!isClosure(func.getClass)) { logWarning("Expected a closure; got " + func.getClass.getName) return } if (func == null) { return } if (checkSerializable) { ensureSerializable(func) } } private def ensureSerializable(func: AnyRef) { if (!serializableMap.containsKey(func.getClass.getCanonicalName)) { try { if (SparkEnv.get != null) { SparkEnv.get.closureSerializer.newInstance().serialize(func) serializableMap.put(func.getClass.getCanonicalName, true) } } catch { case ex: Exception => throw new SparkException("Task not serializable", ex) } } } case class LRUCache[K, V](cacheSize: Int) extends util.LinkedHashMap[K, V] { override def removeEldestEntry(eldest: util.Map.Entry[K, V]): Boolean = size > cacheSize } }
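The ensureSerializable check above goes through SparkEnv's closure serializer, which is internal API. A library-agnostic approximation using plain Java serialization, for illustration only:

import java.io.{ByteArrayOutputStream, NotSerializableException, ObjectOutputStream}
import org.apache.spark.SparkException

def ensureJavaSerializable(func: AnyRef): Unit =
  try {
    new ObjectOutputStream(new ByteArrayOutputStream()).writeObject(func)
  } catch {
    case e: NotSerializableException => throw new SparkException("Task not serializable", e)
  }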
Example 136
Source File: QuerySuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import org.scalatest.FunSuite import scala.collection.mutable.ArrayBuffer abstract class QuerySuite extends FunSuite with Logging { case class TestCase(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String], answersSize: Int) { def this(program: String, query: String, data: Map[String, Seq[String]], answersSize: Int) = this(program, query, data, null, answersSize) def this(program: String, query: String, data: Map[String, Seq[String]], answers: Seq[String]) = this(program, query, data, answers, answers.size) } def runTest(testCase: TestCase): Unit = runTests(Seq(testCase)) def runTests(testCases: Seq[TestCase]): Unit = { val sparkCtx = new SparkContext("local[*]", "QuerySuite", new SparkConf() .set("spark.eventLog.enabled", "true") //.set("spark.eventLog.dir", "../logs") .set("spark.ui.enabled", "false") .set("spark.sql.shuffle.partitions", "5") .setAll(Map.empty[String, String]) ) val bigDatalogCtx = new BigDatalogContext(sparkCtx) var count: Int = 1 for (testCase <- testCases) { bigDatalogCtx.loadProgram(testCase.program) for ((relationName, data) <- testCase.data) { val relationInfo = bigDatalogCtx.relationCatalog.getRelationInfo(relationName) if (relationInfo == null) throw new SparkException("You are attempting to load an unknown relation.") bigDatalogCtx.registerAndLoadTable(relationName, data, bigDatalogCtx.conf.numShufflePartitions) } val query = testCase.query val answers = testCase.answers logInfo("========== START BigDatalog Query " + count + " START ==========") val program = bigDatalogCtx.query(query) val results = program.execute().collect() // for some test cases we will only know the size of the answer set, not the actual answers if (answers == null) { assert(results.size == testCase.answersSize) } else { if (results.size != answers.size) { displayDifferences(results.map(_.toString), answers) // yes this will fail assert(results.size == answers.size) } else { for (result <- results) assert(answers.contains(result.toString())) } val resultStrings = results.map(_.toString).toSet for (answer <- answers) assert(resultStrings.contains(answer.toString())) } logInfo("========== END BigDatalog Query " + count + " END ==========\n") count += 1 bigDatalogCtx.reset() } sparkCtx.stop() } private def displayDifferences(results: Seq[String], answers: Seq[String]): Unit = { val missingAnswers = new ArrayBuffer[String] val missingResults = new ArrayBuffer[String] for (result <- results) if (!answers.contains(result)) missingAnswers += result for (answer <- answers) if (!results.contains(answer)) missingResults += answer if (missingAnswers.nonEmpty) logInfo("Results not in Answers: " + missingAnswers.mkString(", ")) if (missingResults.nonEmpty) logInfo("Answers not in Results: " + missingResults.mkString(", ")) } }
Example 137
Source File: NumericParser.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty){ // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 138
Source File: LabeledPoint.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.annotation.Since
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.util.NumericParser
import org.apache.spark.SparkException

// Represents the label and feature vector of a single data point.
@Since("0.8.0")
@BeanInfo
case class LabeledPoint(label: Double, features: Vector) {
  override def toString: String = {
    s"($label,$features)"
  }
}

object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }
}
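Both input formats handled by parse can be exercised directly; a short sketch:

// Tuple form delegating to NumericParser: "(label,[features])".
val p1 = LabeledPoint.parse("(1.0,[0.5,0.25])")   // LabeledPoint(1.0, [0.5,0.25])

// Legacy dense form used before v1.0: "label, f1 f2 f3".
val p2 = LabeledPoint.parse("0.0, 1.0 2.0 3.0")   // LabeledPoint(0.0, [1.0,2.0,3.0])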
Example 139
Source File: NumericParserSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 140
Source File: CommitFailureTestRelationSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 141
Source File: ThriftServerTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SparkSQLEnv} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.{SparkContext, Logging, SparkException} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 142
Source File: UnionDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) } else { None } } }
Example 143
Source File: TransformedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.length > 0, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 144
Source File: StreamingTab.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.{Logging, SparkException} import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} import StreamingTab._ private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(getSparkUI(ssc), "streaming") with Logging { private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 145
Source File: RpcEndpointAddress.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc.netty import org.apache.spark.SparkException import org.apache.spark.rpc.RpcAddress private[netty] case class RpcEndpointAddress(val rpcAddress: RpcAddress, val name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[netty] object RpcEndpointAddress { def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
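A brief sketch of the companion's apply, called from within the org.apache.spark.rpc.netty package since the class is package-private; only URLs of the exact form spark://name@host:port are accepted:

// Parses into RpcEndpointAddress(RpcAddress("example.com", 7077), "driver").
val ok = RpcEndpointAddress("spark://driver@example.com:7077")
assert(ok.toString == "spark://driver@example.com:7077")

// No user-info part means no endpoint name, so validation fails.
try {
  RpcEndpointAddress("spark://example.com:7077")
} catch {
  case e: org.apache.spark.SparkException => println(e.getMessage) // Invalid Spark URL: ...
}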
Example 146
Source File: RUtils.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 147
Source File: MemoryCheckpointRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.storage.RDDBlockId
import org.apache.spark.{Partition, SparkContext, SparkException, TaskContext}

import scala.reflect.ClassTag

// We use a different class than LocalCheckpointRDD, but the same functionality,
// so that we can easily identify (e.g., pattern match) this class in DAGScheduler.
class MemoryCheckpointRDD[T: ClassTag](sc: SparkContext, rddId: Int, numPartitions: Int)
  extends LocalCheckpointRDD[T](sc, rddId, numPartitions) {

  def this(rdd: RDD[T]) {
    this(rdd.context, rdd.id, rdd.partitions.size)
  }

  override def compute(partition: Partition, context: TaskContext): Iterator[T] = {
    throw new SparkException(
      s"Checkpoint block ${RDDBlockId(rddId, partition.index)} not found! Either the executor " +
      s"that originally checkpointed this partition is no longer alive, or the original RDD is " +
      s"unpersisted. If this problem persists, you may consider using `rdd.checkpoint()` " +
      s"or `rdd.localCheckpoint()` instead, which are slower than memory checkpointing but " +
      s"more fault-tolerant.")
  }
}
Example 148
Source File: RpcAddressSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 149
Source File: KryoSerializerResizableOutputSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.SparkContext import org.apache.spark.LocalSparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") val sc = new SparkContext("local", "test", conf) intercept[SparkException](sc.parallelize(x).collect()) LocalSparkContext.stop(sc) } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") val sc = new SparkContext("local", "test", conf) assert(sc.parallelize(x).collect() === x) LocalSparkContext.stop(sc) } }
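The two tests differ only in spark.kryoserializer.buffer.max; in application code the same failure is avoided by raising that ceiling. A minimal configuration sketch (the 64m value is an arbitrary example):

import org.apache.spark.{SparkConf, SparkContext}

// Let the Kryo output buffer start small but grow well past the serialized payload size.
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "1m")
  .set("spark.kryoserializer.buffer.max", "64m")
val sc = new SparkContext("local", "kryo-buffer-example", conf)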
Example 150
Source File: CoarseGrainedSchedulerBackendSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{RpcUtils, SerializableBuffer} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than max RPC message size") { val conf = new SparkConf conf.set("spark.rpc.message.maxSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = RpcUtils.maxMessageSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
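The error message asserted on above points to the usual remedy: ship a large read-only value to executors once as a broadcast variable instead of capturing it in every task closure. A minimal sketch, assuming an existing SparkContext sc:

// Build the large lookup once on the driver and broadcast it to executors.
val lookup: Map[Int, String] = (1 to 100000).map(i => i -> i.toString).toMap
val bcLookup = sc.broadcast(lookup)

// Tasks reference bcLookup.value instead of serializing the map into every closure.
val resolved = sc.parallelize(1 to 4).map(i => bcLookup.value.getOrElse(i, "missing")).collect()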
Example 151
Source File: CoarseGrainedSchedulerBackendSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite} import org.apache.spark.util.{SerializableBuffer, AkkaUtils} class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext { test("serialized task larger than akka frame size") { val conf = new SparkConf conf.set("spark.akka.frameSize", "1") conf.set("spark.default.parallelism", "1") sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf) val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf) val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize)) val larger = sc.parallelize(Seq(buffer)) val thrown = intercept[SparkException] { larger.collect() } assert(thrown.getMessage.contains("using broadcast variables for large values")) val smaller = sc.parallelize(1 to 4).collect() assert(smaller.size === 4) } }
Example 152
Source File: ThriftServerTabSeq.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2Seq.HiveThriftServer2ListenerSeq import org.apache.spark.sql.hive.thriftserver.{HiveThriftServer2, SequilaThriftServer} import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTabSeq(sparkContext: SparkContext, list: HiveThriftServer2ListenerSeq) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "SeQuiLa JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = list attachPage(new ThriftServerPageSeq(this)) attachPage(new ThriftServerSessionPageSeq(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 153
Source File: SimhashIndexing.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.story import java.net.URL import com.datastax.spark.connector._ import io.gzet.story.model.Article import io.gzet.story.util.SimhashUtils._ import io.gzet.story.util.{HtmlFetcher, Tokenizer} import io.gzet.utils.spark.gdelt.GKGParser import org.apache.lucene.analysis.en.EnglishAnalyzer import org.apache.spark.{Logging, SparkConf, SparkContext, SparkException} import scala.util.Try object SimhashIndexing extends SimpleConfig with Logging { def main(args: Array[String]) = { val sc = new SparkContext(new SparkConf().setAppName("GDELT Indexing")) if (args.isEmpty) throw new SparkException("usage: <gdeltInputDir>") val gdeltInputDir = args.head val gkgRDD = sc.textFile(gdeltInputDir) .map(GKGParser.toJsonGKGV2) .map(GKGParser.toCaseClass2) val urlRDD = gkgRDD.map(g => g.documentId.getOrElse("NA")) .filter(url => Try(new URL(url)).isSuccess) .distinct() .repartition(partitions) val contentRDD = urlRDD.mapPartitions({ it => val html = new HtmlFetcher(gooseConnectionTimeout, gooseSocketTimeout) it map html.fetch }) val corpusRDD = contentRDD.mapPartitions({ it => val analyzer = new EnglishAnalyzer() it.map(content => (content, Tokenizer.lucene(content.body, analyzer))) }).filter({ case (content, corpus) => corpus.length > minWords }) //CREATE TABLE gzet.articles ( hash int PRIMARY KEY, url text, title text, body text ); corpusRDD.mapValues(_.mkString(" ").simhash).map({ case (content, simhash) => Article(simhash, content.body, content.title, content.url) }).saveToCassandra(cassandraKeyspace, cassandraTable) } }
Example 154
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuilder def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val schema = dataset.schema val assembleFunc = udf { r: Row => SimpleVectorAssembler.assemble(r.toSeq: _*) } val args = $(inputCols).map { c => schema(c).dataType match { case DoubleType => dataset(c) case _: VectorUDT => dataset(c) case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") } } dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol))) } override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) val inputDataTypes = inputColNames.map(name => schema(name).dataType) inputDataTypes.foreach { case _: NumericType | BooleanType => case t if t.isInstanceOf[VectorUDT] => case other => throw new IllegalArgumentException(s"Data type $other is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) } override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra) } object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] { override def load(path: String): SimpleVectorAssembler = super.load(path) def assemble(vv: Any*): Vector = { val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case null => // TODO: output Double.NaN? throw new SparkException("Values to assemble cannot be null.") case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } Vectors.sparse(cur, indices.result(), values.result()).compressed } }
Example 155
Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.n1ql import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.error.QueryExecutionException import com.couchbase.client.java.query.N1qlQuery import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.scalatest._ import com.couchbase.spark._ import com.couchbase.spark.connection.CouchbaseConnection import com.couchbase.spark.sql.N1QLRelation import org.apache.spark.sql.types.{StringType, StructField, StructType} import scala.util.control.NonFatal class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = _ override def beforeAll(): Unit = { spark = SparkSession .builder() .master(master) .appName(appName) .config("spark.couchbase.username", "Administrator") .config("spark.couchbase.password", "password") // Open 2 buckets as tests below rely on it .config("com.couchbase.bucket.default", "") .config("com.couchbase.bucket.travel-sample", "") .getOrCreate() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") { assertThrows[IllegalStateException] { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } } test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .option("bucket", "travel-sample") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } test("N1QL failures should fail the Observable") { try { spark.sparkContext .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default") .collect() .foreach(println) fail() } catch { case e: SparkException => assert (e.getCause.isInstanceOf[QueryExecutionException]) val err = e.getCause.asInstanceOf[QueryExecutionException] assert (err.getMessage == "syntax error - at QUERY") case NonFatal(e) => println(e) fail() } } }
Example 156
Source File: StructTypeToMleap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.converter import com.truecar.mleap.runtime.types import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.VectorUDT import org.apache.spark.sql.types._ case class StructTypeToMleap(schema: StructType) { def toMleap: types.StructType = { val leapFields = schema.fields.map { field => val sparkType = field.dataType val sparkTypeName = sparkType.typeName val dataType = sparkType match { case _: NumericType | BooleanType => types.DoubleType case _: StringType => types.StringType case _: VectorUDT => types.VectorType case dataType: ArrayType if dataType.elementType == StringType => types.StringArrayType case _ => throw new SparkException(s"unsupported MLeap datatype: $sparkTypeName") } types.StructField(field.name, dataType) } types.StructType(leapFields) } }
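Since the conversion is a pure schema mapping, it can be sketched on any Spark StructType; the field names here are placeholders:

import org.apache.spark.sql.types._

val sparkSchema = StructType(Seq(
  StructField("price", DoubleType),
  StructField("model", StringType)))

// DoubleType maps to types.DoubleType and StringType to types.StringType;
// an unsupported field type would raise the SparkException above.
val mleapSchema = StructTypeToMleap(sparkSchema).toMleap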
Example 157
Source File: StringIndexerModelSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase} import com.opendatagroup.hadrian.errors.PFAUserException import org.apache.spark.SparkException import org.apache.spark.ml.feature.StringIndexer class StringIndexerModelSuite extends SparkFeaturePFASuiteBase[StringIndexerResult] { import spark.implicits._ val df = Seq( (0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c") ).toDF("id", "category") val testHandleInvalidDF = Seq( (0, "a"), (1, "b"), (2, "c"), (3, "d"), (4, "e"), (5, "c") ).toDF("id", "category") val indexer = new StringIndexer() .setInputCol("category") .setOutputCol("categoryIndex") override val sparkTransformer = indexer.fit(df) val result = sparkTransformer.transform(df) val sparkOutput = result.select(indexer.getOutputCol).toDF() override val input = result.select(indexer.getInputCol).toJSON.collect() override val expectedOutput = sparkOutput.toJSON.collect() // Additional test for handleInvalid test("StringIndexer with handleInvalid=keep") { val sparkTransformer = indexer.setHandleInvalid("keep").fit(df) val result = sparkTransformer.transform(testHandleInvalidDF) val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect() val expectedOutput = result.select(indexer.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StringIndexer with handleInvalid=error") { val sparkTransformer = indexer.setHandleInvalid("error").fit(df) intercept[SparkException] { val result = sparkTransformer.transform(testHandleInvalidDF) result.foreach(_ => Unit) } intercept[PFAUserException] { val input = testHandleInvalidDF.select(indexer.getInputCol).toJSON.collect() // we transform on df here to avoid Spark throwing the error and to ensure we match // the sizes of expected input / output. The error should be thrown before the comparison // would fail val expectedOutput = sparkTransformer.transform(df).select(indexer.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } } } case class StringIndexerResult(categoryIndex: Double) extends Result
Example 158
Source File: LabeledPoint.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import org.apache.spark.SparkException import org.apache.spark.linalg.{NumericParser, Vector, Vectors} import scala.beans.BeanInfo /** * * Class that represents the features and label of a data point. * * @param label Label for this data point. * @param features List of features for this data point. */ @BeanInfo case class LabeledPoint(label: Double, features: Vector) extends Serializable { override def toString: String = { s"($label,$features)" } } object LabeledPoint { /** * Parses a string resulted from `LabeledPoint#toString` into * an [[LabeledPoint]]. * */ def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } }
Example 159
Source File: PSVectorPool.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.context import com.tencent.angel.ml.math2.utils.RowType import com.tencent.angel.sona.models.PSVector import com.tencent.angel.sona.models.impl.PSVectorImpl import org.apache.spark.SparkException import sun.misc.Cleaner /** * PSVectorPool delegate a memory space on PS servers, * which hold `capacity` number vectors with `numDimensions` dimension. * The dimension of PSVectors in one PSVectorPool is the same. * * A PSVectorPool is like a Angel Matrix. * * @param id PSVectorPool unique id * @param dimension Dimension of vectors * @param capacity Capacity of pool */ class PSVectorPool( val id: Int, val dimension: Long, val capacity: Int, val rowType: RowType) { val cleaners = new java.util.WeakHashMap[PSVector, Cleaner] val bitSet = new java.util.BitSet(capacity) var destroyed = false var size = 0 def allocate(): PSVector = { if (destroyed) { throw new SparkException("This vector pool has been destroyed!") } if (size > math.max(capacity * 0.9, 4)) { System.gc() } tryOnce match { case Some(toReturn) => return toReturn case None => } System.gc() Thread.sleep(100L) // Try again tryOnce match { case Some(toReturn) => toReturn case None => throw new SparkException("This vector pool is full!") } } private def tryOnce: Option[PSVector] = { bitSet.synchronized { if (size < capacity) { val index = bitSet.nextClearBit(0) bitSet.set(index) size += 1 return Some(doCreateOne(index)) } } None } private def doCreateOne(index: Int): PSVector = { val vector = new PSVectorImpl(id, index, dimension, rowType) val task = new CleanTask(id, index) cleaners.put(vector, Cleaner.create(vector, task)) vector } private class CleanTask(poolId: Int, index: Int) extends Runnable { def run(): Unit = { bitSet.synchronized { bitSet.clear(index) size -= 1 } } } def delete(key: PSVector): Unit = { cleaners.remove(key).clean() } def destroy(): Unit = { destroyed = true } } object PSVectorPool { val DEFAULT_POOL_CAPACITY = 10 }
Example 160
Source File: PSVector.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.models

import java.util.concurrent.Future

import scala.collection.Map

import org.apache.spark.SparkException

import com.tencent.angel.ml.math2.vector.Vector
import com.tencent.angel.ml.math2.utils.RowType
import com.tencent.angel.ml.matrix.psf.get.base.{GetFunc, GetResult}
import com.tencent.angel.ml.matrix.psf.update.base.{UpdateFunc, VoidResult}
import com.tencent.angel.sona.context.PSContext

// Factory methods for allocating sparse PSVectors through the PSContext.
object PSVector {

  def longKeySparse(dim: Long,
                    maxRange: Long,
                    capacity: Int = 20,
                    rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
                    additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dim, capacity, maxRange, rowType, additionalConfiguration)
  }

  def sparse(dimension: Long,
             capacity: Int,
             range: Long,
             rowType: RowType,
             additionalConfiguration: Map[String, String]): PSVector = {
    PSContext.instance().createVector(dimension, rowType, capacity, range, additionalConfiguration)
  }

  def sparse(dimension: Long,
             capacity: Int = 20,
             rowType: RowType = RowType.T_DOUBLE_SPARSE_LONGKEY,
             additionalConfiguration: Map[String, String] = Map()): PSVector = {
    sparse(dimension, capacity, dimension, rowType, additionalConfiguration)
  }
}
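Assuming a PSContext has already been started on the driver, the factory methods above allocate sparse PS vectors; the dimensions and capacity below are illustrative:

import com.tencent.angel.ml.math2.utils.RowType

// Sparse long-key vector over a 2^30 key space with the default capacity of 20 rows.
val vecA = PSVector.sparse(1L << 30)

// Same allocation spelled out through longKeySparse.
val vecB = PSVector.longKeySparse(
  dim = 1L << 30,
  maxRange = 1L << 30,
  capacity = 10,
  rowType = RowType.T_DOUBLE_SPARSE_LONGKEY)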
Example 161
Source File: TextPiperSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.transformers.pipe import scala.collection.JavaConverters._ import org.apache.spark.SparkException import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{StringType, StructField, StructType} import io.projectglow.Glow import io.projectglow.sql.GlowBaseTest class TextPiperSuite extends GlowBaseTest { override def afterEach(): Unit = { Glow.transform("pipe_cleanup", spark.emptyDataFrame) super.afterEach() } def pipeText(df: DataFrame): DataFrame = { val options = Map("inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["cat", "-"]""") new PipeTransformer().transform(df, options) } test("text input and output") { val sess = spark import sess.implicits._ val output = pipeText(Seq("hello", "world").toDF()) assert(output.count() == 2) assert(output.schema == StructType(Seq(StructField("text", StringType)))) assert(output.orderBy("text").as[String].collect.toSeq == Seq("hello", "world")) } test("text input requires one column") { val sess = spark import sess.implicits._ val df = Seq(Seq("hello", "world"), Seq("foo", "bar")).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("text input requires string column") { val sess = spark import sess.implicits._ val df = Seq(Seq(5), Seq(6)).toDF() assertThrows[IllegalArgumentException](pipeText(df)) } test("does not break on null row") { val sess = spark import sess.implicits._ val df = Seq("hello", null, "hello").toDF() val output = pipeText(df) assert(output.count() == 2) assert(output.filter("text = 'hello'").count == 2) } test("command fails") { val sess = spark import sess.implicits._ val df = Seq("hello", "world").toDF() val options = Map( "inputFormatter" -> "text", "outputFormatter" -> "text", "cmd" -> """["bash", "-c", "exit 1"]""") val ex = intercept[SparkException] { new PipeTransformer().transform(df, options) } assert(ex.getMessage.contains("Subprocess exited with status 1")) // threads should still be cleaned up eventually { assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDIN_WRITER_THREAD_PREFIX))) assert( !Thread .getAllStackTraces .asScala .keySet .exists(_.getName.startsWith(ProcessHelper.STDERR_READER_THREAD_PREFIX))) } } }
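Outside the test, the same pipe can be driven with the options map shown above; a sketch assuming an existing SparkSession named spark and a cat binary on the PATH:

import spark.implicits._

val df = Seq("hello", "world").toDF()
val options = Map(
  "inputFormatter" -> "text",
  "outputFormatter" -> "text",
  "cmd" -> """["cat", "-"]""")

// Each row is written to the subprocess and its stdout is read back as the result.
val piped = new PipeTransformer().transform(df, options)
piped.show()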
Example 162
Source File: CrailBroadcast.scala From crail-spark-io with Apache License 2.0 | 5 votes |
package org.apache.spark.broadcast import java.io._ import org.apache.spark.storage._ import org.apache.spark.{SparkEnv, SparkException} import scala.collection.mutable import scala.reflect.ClassTag import scala.util.control.NonFatal private[spark] class CrailBroadcast[T: ClassTag](obj: T, id: Long) extends Broadcast[T](id) with Serializable { @transient private lazy val _value: T = readBroadcastBlock() private val broadcastId = BroadcastBlockId(id) writeBlocks(obj) override protected def getValue() = { _value } override protected def doUnpersist(blocking: Boolean): Unit = { logWarning(" called doUnpersist on broadcastId: " + id + " (NYI)") } override protected def doDestroy(blocking: Boolean): Unit = { val obj = x.asInstanceOf[T] if(CrailBroadcast.useLocalCache) { CrailBroadcast.broadcastCache(id) = Some(x) } else { SparkEnv.get.blockManager.putSingle(broadcastId, obj, StorageLevel.MEMORY_ONLY, tellMaster = false) } obj case None => throw new SparkException(s"Failed to get broadcast " + broadcastId) } } } } } private object CrailBroadcast { //FIXME: (atr) I am not completely sure about if this gives us the best performance. val broadcastCache:mutable.HashMap[Long, Option[Any]] = new mutable.HashMap[Long, Option[Any]] private val useLocalCache = false def unbroadcast(id: Long, removeFromDriver: Boolean, blocking: Boolean): Unit = { this.synchronized { if(useLocalCache) { broadcastCache.remove(id) } else { SparkEnv.get.blockManager.master.removeBroadcast(id, removeFromDriver, blocking) SparkEnv.get.blockManager.removeBroadcast(id, false) } } } def cleanCache(): Unit = { this.synchronized { broadcastCache.clear() } } } object Utils { def tryOrIOException[T](block: => T): T = { try { block } catch { case e: IOException => throw e case NonFatal(e) => throw new IOException(e) } } }
Example 163
Source File: IOCommon.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.common import java.io.{File, FileInputStream, IOException, InputStreamReader} import java.util.Properties import org.apache.hadoop.io.compress.CompressionCodec import org.apache.hadoop.io.{NullWritable, Text} import org.apache.hadoop.mapred.SequenceFileOutputFormat import org.apache.spark.rdd.RDD import org.apache.spark.{SparkContext, SparkException} import scala.collection.JavaConversions._ import scala.collection.mutable.HashMap import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag class IOCommon(val sc:SparkContext) { def load[T:ClassTag:TypeTag](filename:String, force_format:Option[String]=None) = { val input_format = force_format.getOrElse( IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text")) input_format match { case "Text" => sc.textFile(filename) case "Sequence" => sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString) case _ => throw new UnsupportedOperationException(s"Unknown inpout format: $input_format") } } def save(filename:String, data:RDD[_], prefix:String) = { val output_format = IOCommon.getProperty(prefix).getOrElse("Text") val output_format_codec = loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec")) output_format match { case "Text" => if (output_format_codec.isEmpty) data.saveAsTextFile(filename) else data.saveAsTextFile(filename, output_format_codec.get) case "Sequence" => val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString))) if (output_format_codec.isEmpty) { sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename) } else { sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename, output_format_codec.get) } case _ => throw new UnsupportedOperationException(s"Unknown output format: $output_format") } } def save(filename:String, data:RDD[_]):Unit = save(filename, data, "sparkbench.outputformat") private def loadClassByName[T](name:Option[String]) = { if (!name.isEmpty) Some(Class.forName(name.get) .newInstance.asInstanceOf[T].getClass) else None } private def callMethod[T, R](obj:T, method_name:String) = obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R] } object IOCommon { private val sparkbench_conf: HashMap[String, String] = getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES")) def getPropertiesFromFile(filenames: String): HashMap[String, String] = { val result = new HashMap[String, String] filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename => val file = new File(filename) require(file.exists, s"Properties file $file does not exist") require(file.isFile, s"Properties file $file is not a normal file") val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8") try { val properties = new Properties() properties.load(inReader) result ++= properties.stringPropertyNames() .map(k => (k, properties(k).trim)).toMap } catch { case e: IOException => val message = s"Failed when loading Sparkbench properties file $file" throw new SparkException(message, e) } finally { inReader.close() } } result.filter{case (key, value) => value.toLowerCase != "none"} } def getProperty(key:String):Option[String] = sparkbench_conf.get(key) def dumpProperties(): Unit = sparkbench_conf .foreach{case (key, value)=> println(s"$key\t\t$value")} }
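A typical round trip with the helper above loads a text or sequence file and writes it back out; the HDFS paths are placeholders and the output format is resolved from the sparkbench properties:

import org.apache.spark.{SparkConf, SparkContext}

val sc = new SparkContext(new SparkConf().setAppName("io-common-example"))
val io = new IOCommon(sc)

// Force the Text input format instead of reading sparkbench.inputformat.
val lines = io.load[String]("hdfs:///tmp/io-common/input", force_format = Some("Text"))

// Output format and codec come from the "sparkbench.outputformat" properties.
io.save("hdfs:///tmp/io-common/output", lines)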
Example 164
Source File: SQLServerTab.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.server.SQLServerListener import org.apache.spark.sql.server.ui.SQLServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} case class SQLServerTab( sparkContext: SparkContext, listener: SQLServerListener) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" private val parent = getSparkUI(sparkContext) attachPage(new SQLServerPage(this)) attachPage(new SQLServerSessionPage(this)) parent.attachTab(this) def detach() { parent.detachTab(this) } } object SQLServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 165
Source File: DataFrameToFileWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import io.deepsense.commons.utils.LoggerForCallerClass import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.WriteFileException import io.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import io.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import io.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import io.deepsense.deeplang.exceptions.DeepLangException import io.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
Example 166
Source File: ThriftServerTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.ui import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2 import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab._ import org.apache.spark.ui.{SparkUI, SparkUITab} private[thriftserver] class ThriftServerTab(sparkContext: SparkContext) extends SparkUITab(getSparkUI(sparkContext), "sqlserver") with Logging { override val name = "JDBC/ODBC Server" val parent = getSparkUI(sparkContext) val listener = HiveThriftServer2.listener attachPage(new ThriftServerPage(this)) attachPage(new ThriftServerSessionPage(this)) parent.attachTab(this) def detach() { getSparkUI(sparkContext).detachTab(this) } } private[thriftserver] object ThriftServerTab { def getSparkUI(sparkContext: SparkContext): SparkUI = { sparkContext.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 167
Source File: KryoSerializerResizableOutputSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.SparkContext
import org.apache.spark.LocalSparkContext
import org.apache.spark.SparkException

class KryoSerializerResizableOutputSuite extends SparkFunSuite {

  // trial and error showed this will not serialize with 1mb buffer
  val x = (1 to 400000).toArray

  // Kryo without a resizable output buffer should fail on a large array
  test("kryo without resizable output buffer should fail on large array") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer", "1m")
    conf.set("spark.kryoserializer.buffer.max", "1m")
    val sc = new SparkContext("local", "test", conf)
    intercept[SparkException](sc.parallelize(x).collect())
    LocalSparkContext.stop(sc)
  }

  // Kryo with a resizable output buffer should succeed on a large array
  test("kryo with resizable output buffer should succeed on large array") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.set("spark.kryoserializer.buffer", "1m")
    conf.set("spark.kryoserializer.buffer.max", "2m")
    val sc = new SparkContext("local", "test", conf)
    assert(sc.parallelize(x).collect() === x)
    LocalSparkContext.stop(sc)
  }
}
Example 168
Source File: ProactiveClosureSerializationSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString
  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  // Actions should proactively throw the expected serialization exceptions
  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation
    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 169
Source File: CoarseGrainedSchedulerBackendSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkException, SparkFunSuite}
import org.apache.spark.util.{SerializableBuffer, AkkaUtils}

class CoarseGrainedSchedulerBackendSuite extends SparkFunSuite with LocalSparkContext {

  // A task serialized to something larger than the Akka frame size should be rejected
  ignore("serialized task larger than akka frame size") {
    val conf = new SparkConf
    // Maximum size (in MB) of messages exchanged between the driver and executors;
    // the larger the value, the larger the results the driver can accept
    conf.set("spark.akka.frameSize", "1")
    // Default parallelism
    conf.set("spark.default.parallelism", "1")
    //sc = new SparkContext("local-cluster[2, 1, 1024]", "test", conf)
    sc = new SparkContext("local[*]", "test", conf)
    // Maximum Akka frame size in bytes (1 MB here; 10 MB by default)
    val frameSize = AkkaUtils.maxFrameSizeBytes(sc.conf)
    // Allocate a serializable buffer twice the frame size so the task is too large;
    // ByteBuffer.allocate reserves the backing buffer before any reads or writes
    val buffer = new SerializableBuffer(java.nio.ByteBuffer.allocate(2 * frameSize))
    val larger = sc.parallelize(Seq(buffer))
    val thrown = intercept[SparkException] {
      larger.collect()
    }
    // The exception should recommend using broadcast variables for large values
    assert(thrown.getMessage.contains("using broadcast variables for large values"))
    val smaller = sc.parallelize(1 to 4).collect()
    assert(smaller.size === 4)
  }
}
Example 170
Source File: MutableURLClassLoaderSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.util

import java.net.URLClassLoader

import org.apache.spark.{SparkContext, SparkException, SparkFunSuite, TestUtils}

class MutableURLClassLoaderSuite extends SparkFunSuite {

  val urls2 = List(TestUtils.createJarWithClasses(
      classNames = Seq("FakeClass1", "FakeClass2", "FakeClass3"),
      toStringValue = "2")).toArray
  val urls = List(TestUtils.createJarWithClasses(
      classNames = Seq("FakeClass1"),
      classNamesWithBase = Seq(("FakeClass2", "FakeClass3")), // FakeClass3 is in parent
      toStringValue = "1",
      classpathUrls = urls2)).toArray

  // Child-first loading resolves the class from the child loader
  test("child first") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass2").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "1")
    val fakeClass2 = classLoader.loadClass("FakeClass2").newInstance()
    assert(fakeClass.getClass === fakeClass2.getClass)
  }

  // Parent-first loading resolves the class from the parent loader
  test("parent first") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new MutableURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass1").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "2")
    val fakeClass2 = classLoader.loadClass("FakeClass1").newInstance()
    assert(fakeClass.getClass === fakeClass2.getClass)
  }

  // Child-first loading can fall back to the parent
  test("child first can fall back") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    val fakeClass = classLoader.loadClass("FakeClass3").newInstance()
    val fakeClassVersion = fakeClass.toString
    assert(fakeClassVersion === "2")
  }

  // Child-first loading can still fail when the class exists nowhere
  test("child first can fail") {
    val parentLoader = new URLClassLoader(urls2, null)
    val classLoader = new ChildFirstURLClassLoader(urls, parentLoader)
    intercept[java.lang.ClassNotFoundException] {
      classLoader.loadClass("FakeClassDoesNotExist").newInstance()
    }
  }

  // The driver sets the context class loader when running in local mode
  test("driver sets context class loader in local mode") {
    // Test the case where the driver program sets a context classloader and then runs a job
    // in local mode. This is what happens when ./spark-submit is called with "local" as the
    // master.
    val original = Thread.currentThread().getContextClassLoader

    val className = "ClassForDriverTest"
    val jar = TestUtils.createJarWithClasses(Seq(className))
    val contextLoader = new URLClassLoader(Array(jar), Utils.getContextOrSparkClassLoader)
    Thread.currentThread().setContextClassLoader(contextLoader)

    val sc = new SparkContext("local", "driverLoaderTest")

    try {
      sc.makeRDD(1 to 5, 2).mapPartitions { x =>
        val loader = Thread.currentThread().getContextClassLoader
        // scalastyle:off classforname
        Class.forName(className, true, loader).newInstance()
        // scalastyle:on classforname
        Seq().iterator
      }.count()
    } catch {
      case e: SparkException if e.getMessage.contains("ClassNotFoundException") =>
        fail("Local executor could not find class", e)
      case t: Throwable => fail("Unexpected exception ", t)
    }
    sc.stop()
    Thread.currentThread().setContextClassLoader(original)
  }
}
Example 171
Source File: RWrappers.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s.DefaultFormats import org.json4s.jackson.JsonMethods._ import org.apache.spark.SparkException import org.apache.spark.ml.util.MLReader private[r] object RWrappers extends MLReader[Object] { override def load(path: String): Object = { implicit val format = DefaultFormats val rMetadataPath = new Path(path, "rMetadata").toString val rMetadataStr = sc.textFile(rMetadataPath, 1).first() val rMetadata = parse(rMetadataStr) val className = (rMetadata \ "class").extract[String] className match { case "org.apache.spark.ml.r.NaiveBayesWrapper" => NaiveBayesWrapper.load(path) case "org.apache.spark.ml.r.AFTSurvivalRegressionWrapper" => AFTSurvivalRegressionWrapper.load(path) case "org.apache.spark.ml.r.GeneralizedLinearRegressionWrapper" => GeneralizedLinearRegressionWrapper.load(path) case "org.apache.spark.ml.r.KMeansWrapper" => KMeansWrapper.load(path) case "org.apache.spark.ml.r.MultilayerPerceptronClassifierWrapper" => MultilayerPerceptronClassifierWrapper.load(path) case "org.apache.spark.ml.r.LDAWrapper" => LDAWrapper.load(path) case "org.apache.spark.ml.r.IsotonicRegressionWrapper" => IsotonicRegressionWrapper.load(path) case "org.apache.spark.ml.r.GaussianMixtureWrapper" => GaussianMixtureWrapper.load(path) case "org.apache.spark.ml.r.ALSWrapper" => ALSWrapper.load(path) case "org.apache.spark.ml.r.LogisticRegressionWrapper" => LogisticRegressionWrapper.load(path) case "org.apache.spark.ml.r.RandomForestRegressorWrapper" => RandomForestRegressorWrapper.load(path) case "org.apache.spark.ml.r.RandomForestClassifierWrapper" => RandomForestClassifierWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeRegressorWrapper" => DecisionTreeRegressorWrapper.load(path) case "org.apache.spark.ml.r.DecisionTreeClassifierWrapper" => DecisionTreeClassifierWrapper.load(path) case "org.apache.spark.ml.r.GBTRegressorWrapper" => GBTRegressorWrapper.load(path) case "org.apache.spark.ml.r.GBTClassifierWrapper" => GBTClassifierWrapper.load(path) case "org.apache.spark.ml.r.BisectingKMeansWrapper" => BisectingKMeansWrapper.load(path) case "org.apache.spark.ml.r.LinearSVCWrapper" => LinearSVCWrapper.load(path) case "org.apache.spark.ml.r.FPGrowthWrapper" => FPGrowthWrapper.load(path) case _ => throw new SparkException(s"SparkR read.ml does not support load $className") } } }
Example 172
Source File: NumericParser.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import java.util.StringTokenizer import scala.collection.mutable.{ArrayBuilder, ListBuffer} import org.apache.spark.SparkException def parse(s: String): Any = { val tokenizer = new StringTokenizer(s, "()[],", true) if (tokenizer.hasMoreTokens()) { val token = tokenizer.nextToken() if (token == "(") { parseTuple(tokenizer) } else if (token == "[") { parseArray(tokenizer) } else { // expecting a number parseDouble(token) } } else { throw new SparkException(s"Cannot find any token from the input string.") } } private def parseArray(tokenizer: StringTokenizer): Array[Double] = { val values = ArrayBuilder.make[Double] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "]") { parsing = false } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else { // expecting a number values += parseDouble(token) allowComma = true } } if (parsing) { throw new SparkException(s"An array must end with ']'.") } values.result() } private def parseTuple(tokenizer: StringTokenizer): Seq[_] = { val items = ListBuffer.empty[Any] var parsing = true var allowComma = false var token: String = null while (parsing && tokenizer.hasMoreTokens()) { token = tokenizer.nextToken() if (token == "(") { items.append(parseTuple(tokenizer)) allowComma = true } else if (token == "[") { items.append(parseArray(tokenizer)) allowComma = true } else if (token == ",") { if (allowComma) { allowComma = false } else { throw new SparkException("Found a ',' at a wrong position.") } } else if (token == ")") { parsing = false } else if (token.trim.isEmpty) { // ignore whitespaces between delim chars, e.g. ", [" } else { // expecting a number items.append(parseDouble(token)) allowComma = true } } if (parsing) { throw new SparkException(s"A tuple must end with ')'.") } items } private def parseDouble(s: String): Double = { try { java.lang.Double.parseDouble(s) } catch { case e: NumberFormatException => throw new SparkException(s"Cannot parse a double from: $s", e) } } }
Example 173
Source File: LabeledPoint.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression

import scala.beans.BeanInfo

import org.apache.spark.SparkException
import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.util.NumericParser

// Represents the label and feature vector of a single data point.
@Since("0.8.0")
@BeanInfo
case class LabeledPoint(label: Double, features: Vector) {
  override def toString: String = {
    s"($label,$features)"
  }
}

object LabeledPoint {

  @Since("1.1.0")
  def parse(s: String): LabeledPoint = {
    if (s.startsWith("(")) {
      NumericParser.parse(s) match {
        case Seq(label: Double, numeric: Any) =>
          LabeledPoint(label, Vectors.parseNumeric(numeric))
        case other =>
          throw new SparkException(s"Cannot parse $other.")
      }
    } else { // dense format used before v1.0
      val parts = s.split(',')
      val label = java.lang.Double.parseDouble(parts(0))
      val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble))
      LabeledPoint(label, features)
    }
  }

  private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = {
    LabeledPoint(point.label, Vectors.fromML(point.features))
  }
}
Example 174
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import java.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.stat.test.ChiSqTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSquareTestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("test DataFrame of labeled points") { // labels: 1.0 (2 / 6), 0.0 (4 / 6) // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) val data = Seq( LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) assert(degreesOfFreedom === Array(2, 3)) assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) } } test("large number of features (SPARK-3087)") { // Test that the right number of results is returned val numCols = 1001 val sparseData = Array( LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) val df = spark.createDataFrame(sparseData) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues.size === numCols) assert(degreesOfFreedom.length === numCols) assert(statistics.size === numCols) assert(pValues(1000) !== null) // SPARK-3087 } test("fail on continuous features or labels") { val tooManyCategories: Int = 100000 assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") val random = new Random(11L) val continuousLabel = Seq.fill(tooManyCategories)( LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) withClue("ChiSquare should throw an exception when given a continuous-valued label") { intercept[SparkException] { val df = spark.createDataFrame(continuousLabel) ChiSquareTest.test(df, "features", "label") } } val continuousFeature = Seq.fill(tooManyCategories)( LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) withClue("ChiSquare should throw an exception when given continuous-valued features") { intercept[SparkException] { val df = spark.createDataFrame(continuousFeature) ChiSquareTest.test(df, "features", "label") } } } }
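A standalone sketch of the API exercised by the suite, reusing the same toy data; it only assumes a local SparkSession and prints the three result columns.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest
import org.apache.spark.sql.SparkSession

object ChiSquareDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("chi-square-demo").getOrCreate()
    import spark.implicits._

    val data = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (0.0, Vectors.dense(3.5, 40.0)),
      (1.0, Vectors.dense(3.5, 40.0))).toDF("label", "features")

    // One p-value, degrees-of-freedom entry and statistic per feature column.
    val result = ChiSquareTest.test(data, "features", "label").head()
    println(s"pValues = ${result.getAs[Vector](0)}")
    println(s"degreesOfFreedom = ${result.getSeq[Int](1)}")
    println(s"statistics = ${result.getAs[Vector](2)}")

    spark.stop()
  }
}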
Example 175
Source File: NumericParserSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.apache.spark.{SparkException, SparkFunSuite} class NumericParserSuite extends SparkFunSuite { test("parser") { val s = "((1.0,2e3),-4,[5e-6,7.0E8],+9)" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Seq[_]] === Seq(1.0, 2.0e3)) assert(parsed(1).asInstanceOf[Double] === -4.0) assert(parsed(2).asInstanceOf[Array[Double]] === Array(5.0e-6, 7.0e8)) assert(parsed(3).asInstanceOf[Double] === 9.0) val malformatted = Seq("a", "[1,,]", "0.123.4", "1 2", "3+4") malformatted.foreach { s => intercept[SparkException] { NumericParser.parse(s) throw new RuntimeException(s"Didn't detect malformatted string $s.") } } } test("parser with whitespaces") { val s = "(0.0, [1.0, 2.0])" val parsed = NumericParser.parse(s).asInstanceOf[Seq[_]] assert(parsed(0).asInstanceOf[Double] === 0.0) assert(parsed(1).asInstanceOf[Array[Double]] === Array(1.0, 2.0)) } }
Example 176
Source File: InitContainerConfigOrchestrator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s.submit.steps.initcontainer import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.{InitContainerBootstrap, KubernetesUtils, MountSecretsBootstrap} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ private[spark] class InitContainerConfigOrchestrator( sparkJars: Seq[String], sparkFiles: Seq[String], jarsDownloadPath: String, filesDownloadPath: String, imagePullPolicy: String, configMapName: String, configMapKey: String, sparkConf: SparkConf) { private val initContainerImage = sparkConf .get(INIT_CONTAINER_IMAGE) .getOrElse(throw new SparkException( "Must specify the init-container image when there are remote dependencies")) def getAllConfigurationSteps: Seq[InitContainerConfigurationStep] = { val initContainerBootstrap = new InitContainerBootstrap( initContainerImage, imagePullPolicy, jarsDownloadPath, filesDownloadPath, configMapName, configMapKey, SPARK_POD_DRIVER_ROLE, sparkConf) val baseStep = new BasicInitContainerConfigurationStep( sparkJars, sparkFiles, jarsDownloadPath, filesDownloadPath, initContainerBootstrap) val secretNamesToMountPaths = KubernetesUtils.parsePrefixedKeyValuePairs( sparkConf, KUBERNETES_DRIVER_SECRETS_PREFIX) // Mount user-specified driver secrets also into the driver's init-container. The // init-container may need credentials in the secrets to be able to download remote // dependencies. The driver's main container and its init-container share the secrets // because the init-container is sort of an implementation details and this sharing // avoids introducing a dedicated configuration property just for the init-container. val mountSecretsStep = if (secretNamesToMountPaths.nonEmpty) { Seq(new InitContainerMountSecretsStep(new MountSecretsBootstrap(secretNamesToMountPaths))) } else { Nil } Seq(baseStep) ++ mountSecretsStep } }
Example 177
Source File: InitContainerBootstrap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.k8s import scala.collection.JavaConverters._ import io.fabric8.kubernetes.api.model.{ContainerBuilder, EmptyDirVolumeSource, EnvVarBuilder, PodBuilder, VolumeMount, VolumeMountBuilder} import org.apache.spark.{SparkConf, SparkException} import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.deploy.k8s.Constants._ def bootstrapInitContainer( original: PodWithDetachedInitContainer): PodWithDetachedInitContainer = { val sharedVolumeMounts = Seq[VolumeMount]( new VolumeMountBuilder() .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) .withMountPath(jarsDownloadPath) .build(), new VolumeMountBuilder() .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) .withMountPath(filesDownloadPath) .build()) val customEnvVarKeyPrefix = sparkRole match { case SPARK_POD_DRIVER_ROLE => KUBERNETES_DRIVER_ENV_KEY case SPARK_POD_EXECUTOR_ROLE => "spark.executorEnv." case _ => throw new SparkException(s"$sparkRole is not a valid Spark pod role") } val customEnvVars = sparkConf.getAllWithPrefix(customEnvVarKeyPrefix).toSeq.map { case (key, value) => new EnvVarBuilder() .withName(key) .withValue(value) .build() } val initContainer = new ContainerBuilder(original.initContainer) .withName("spark-init") .withImage(initContainerImage) .withImagePullPolicy(imagePullPolicy) .addAllToEnv(customEnvVars.asJava) .addNewVolumeMount() .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) .withMountPath(INIT_CONTAINER_PROPERTIES_FILE_DIR) .endVolumeMount() .addToVolumeMounts(sharedVolumeMounts: _*) .addToArgs("init") .addToArgs(INIT_CONTAINER_PROPERTIES_FILE_PATH) .build() val podWithBasicVolumes = new PodBuilder(original.pod) .editSpec() .addNewVolume() .withName(INIT_CONTAINER_PROPERTIES_FILE_VOLUME) .withNewConfigMap() .withName(configMapName) .addNewItem() .withKey(configMapKey) .withPath(INIT_CONTAINER_PROPERTIES_FILE_NAME) .endItem() .endConfigMap() .endVolume() .addNewVolume() .withName(INIT_CONTAINER_DOWNLOAD_JARS_VOLUME_NAME) .withEmptyDir(new EmptyDirVolumeSource()) .endVolume() .addNewVolume() .withName(INIT_CONTAINER_DOWNLOAD_FILES_VOLUME_NAME) .withEmptyDir(new EmptyDirVolumeSource()) .endVolume() .endSpec() .build() val mainContainer = new ContainerBuilder(original.mainContainer) .addToVolumeMounts(sharedVolumeMounts: _*) .addNewEnv() .withName(ENV_MOUNTED_FILES_DIR) .withValue(filesDownloadPath) .endEnv() .build() PodWithDetachedInitContainer( podWithBasicVolumes, initContainer, mainContainer) } }
Example 178
Source File: MesosProtoUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster.mesos import scala.collection.JavaConverters._ import org.apache.mesos.Protos import org.apache.spark.SparkException import org.apache.spark.internal.Logging object MesosProtoUtils extends Logging { def mesosLabels(labelsStr: String): Protos.Labels.Builder = { val labels: Seq[Protos.Label] = if (labelsStr == "") { Seq() } else { labelsStr.split("""(?<!\\),""").toSeq.map { labelStr => val parts = labelStr.split("""(?<!\\):""") if (parts.length != 2) { throw new SparkException(s"Malformed label: ${labelStr}") } val cleanedParts = parts .map(part => part.replaceAll("""\\,""", ",")) .map(part => part.replaceAll("""\\:""", ":")) Protos.Label.newBuilder() .setKey(cleanedParts(0)) .setValue(cleanedParts(1)) .build() } } Protos.Labels.newBuilder().addAllLabels(labels.asJava) } }
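The label string format ("k1:v1,k2:v2", with "\," and "\:" escaping literal delimiters) is easy to exercise without a Mesos dependency. Below is a standalone sketch of the same escape-aware parsing, with illustrative label values; it returns plain key/value pairs instead of Protos.Labels.

object LabelParseDemo {
  def parseLabels(labelsStr: String): Seq[(String, String)] = {
    if (labelsStr.isEmpty) Seq.empty
    else labelsStr.split("""(?<!\\),""").toSeq.map { labelStr =>
      val parts = labelStr.split("""(?<!\\):""")
      require(parts.length == 2, s"Malformed label: $labelStr")
      // Unescape the delimiters after splitting.
      val cleaned = parts.map(_.replaceAll("""\\,""", ",")).map(_.replaceAll("""\\:""", ":"))
      (cleaned(0), cleaned(1))
    }
  }

  def main(args: Array[String]): Unit = {
    // A literal comma inside a value is escaped with a backslash.
    println(parseLabels("""env:prod,owner:data\,platform"""))
    // -> List((env,prod), (owner,data,platform))
  }
}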
Example 179
Source File: YarnClusterManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster import org.apache.spark.{SparkContext, SparkException} import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl} private[spark] class YarnClusterManager extends ExternalClusterManager { override def canCreate(masterURL: String): Boolean = { masterURL == "yarn" } override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = { sc.deployMode match { case "cluster" => new YarnClusterScheduler(sc) case "client" => new YarnScheduler(sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def createSchedulerBackend(sc: SparkContext, masterURL: String, scheduler: TaskScheduler): SchedulerBackend = { sc.deployMode match { case "cluster" => new YarnClusterSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case "client" => new YarnClientSchedulerBackend(scheduler.asInstanceOf[TaskSchedulerImpl], sc) case _ => throw new SparkException(s"Unknown deploy mode '${sc.deployMode}' for Yarn") } } override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = { scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend) } }
Example 180
Source File: InsertIntoHiveDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.language.existentials import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.hive.common.FileUtils import org.apache.hadoop.hive.ql.plan.TableDesc import org.apache.hadoop.hive.serde.serdeConstants import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.mapred._ import org.apache.spark.SparkException import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.catalog.{CatalogStorageFormat, CatalogTable} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.hive.client.HiveClientImpl case class InsertIntoHiveDirCommand( isLocal: Boolean, storage: CatalogStorageFormat, query: LogicalPlan, overwrite: Boolean, outputColumns: Seq[Attribute]) extends SaveAsHiveFile { override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { assert(storage.locationUri.nonEmpty) val hiveTable = HiveClientImpl.toHiveTable(CatalogTable( identifier = TableIdentifier(storage.locationUri.get.toString, Some("default")), tableType = org.apache.spark.sql.catalyst.catalog.CatalogTableType.VIEW, storage = storage, schema = query.schema )) hiveTable.getMetadata.put(serdeConstants.SERIALIZATION_LIB, storage.serde.getOrElse(classOf[LazySimpleSerDe].getName)) val tableDesc = new TableDesc( hiveTable.getInputFormatClass, hiveTable.getOutputFormatClass, hiveTable.getMetadata ) val hadoopConf = sparkSession.sessionState.newHadoopConf() val jobConf = new JobConf(hadoopConf) val targetPath = new Path(storage.locationUri.get) val writeToPath = if (isLocal) { val localFileSystem = FileSystem.getLocal(jobConf) localFileSystem.makeQualified(targetPath) } else { val qualifiedPath = FileUtils.makeQualified(targetPath, hadoopConf) val dfs = qualifiedPath.getFileSystem(jobConf) if (!dfs.exists(qualifiedPath)) { dfs.mkdirs(qualifiedPath.getParent) } qualifiedPath } val tmpPath = getExternalTmpPath(sparkSession, hadoopConf, writeToPath) val fileSinkConf = new org.apache.spark.sql.hive.HiveShim.ShimFileSinkDesc( tmpPath.toString, tableDesc, false) try { saveAsHiveFile( sparkSession = sparkSession, plan = child, hadoopConf = hadoopConf, fileSinkConf = fileSinkConf, outputLocation = tmpPath.toString, allColumns = outputColumns) val fs = writeToPath.getFileSystem(hadoopConf) if (overwrite && fs.exists(writeToPath)) { fs.listStatus(writeToPath).foreach { existFile => if (Option(existFile.getPath) != createdTempDir) fs.delete(existFile.getPath, true) } } fs.listStatus(tmpPath).foreach { tmpFile => fs.rename(tmpFile.getPath, writeToPath) } } catch { case e: Throwable => throw new SparkException( "Failed inserting overwrite directory " + storage.locationUri.get, e) } finally { deleteExternalTmpPath(hadoopConf) } Seq.empty[Row] } }
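The command above backs Hive-style directory inserts issued through SQL. A minimal sketch, assuming a Hive-enabled SparkSession and a hypothetical /tmp output path; a failure during the final move surfaces as the "Failed inserting overwrite directory" SparkException.

import org.apache.spark.sql.SparkSession

object InsertHiveDirDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("insert-hive-dir-demo")
      .enableHiveSupport()
      .getOrCreate()

    spark.range(10).createOrReplaceTempView("nums")

    // Writes the query result into the directory through a Hive SerDe.
    spark.sql(
      """INSERT OVERWRITE LOCAL DIRECTORY '/tmp/hive_dir_demo'
        |ROW FORMAT DELIMITED FIELDS TERMINATED BY ','
        |SELECT id, id * 2 FROM nums""".stripMargin)

    spark.stop()
  }
}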
Example 181
Source File: CommitFailureTestRelationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 182
Source File: RpcAddressSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") {// host and port val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") {// parse from a Spark URL val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") {// a malformed Spark URL val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234")// contains a space in the middle } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") {// a Spark URL with an invalid scheme val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") {// convert back to the Spark URL format val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 183
Source File: UDTRegistration.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.util.Utils def getUDTFor(userClass: String): Option[Class[_]] = { udtMap.get(userClass).map { udtClassName => if (Utils.classIsLoadable(udtClassName)) { val udtClass = Utils.classForName(udtClassName) if (classOf[UserDefinedType[_]].isAssignableFrom(udtClass)) { udtClass } else { throw new SparkException( s"${udtClass.getName} is not an UserDefinedType. Please make sure registering " + s"an UserDefinedType for ${userClass}") } } else { throw new SparkException( s"Can not load in UserDefinedType ${udtClassName} for user class ${userClass}.") } } } }
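A minimal registration sketch. Two caveats, both assumptions of the sketch rather than facts from the example: UDTRegistration and UserDefinedType are package-private to org.apache.spark in this Spark version, so the code is placed under a spark subpackage, and PointUDT below is a stub whose serialize/deserialize are not wired to a real catalyst representation.

package org.apache.spark.demo

import org.apache.spark.sql.types._

class Point(val x: Double, val y: Double)

// Placeholder UDT, in the spirit of the test suite further down this page.
class PointUDT extends UserDefinedType[Point] {
  override def sqlType: DataType = ArrayType(DoubleType, containsNull = false)
  override def serialize(p: Point): Any = Array(p.x, p.y)
  override def deserialize(datum: Any): Point = new Point(0.0, 0.0)  // stub only
  override def userClass: Class[Point] = classOf[Point]
}

object UDTRegistrationDemo {
  def main(args: Array[String]): Unit = {
    UDTRegistration.register(classOf[Point].getName, classOf[PointUDT].getName)
    println(UDTRegistration.exists(classOf[Point].getName))     // true
    println(UDTRegistration.getUDTFor(classOf[Point].getName))  // Some(class ...PointUDT)
    // Registering a class that is not a UserDefinedType makes getUDTFor throw SparkException.
  }
}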
Example 184
Source File: ScalaUDFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import java.util.Locale import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext import org.apache.spark.sql.types.{IntegerType, StringType} class ScalaUDFSuite extends SparkFunSuite with ExpressionEvalHelper { test("basic") { val intUdf = ScalaUDF((i: Int) => i + 1, IntegerType, Literal(1) :: Nil) checkEvaluation(intUdf, 2) val stringUdf = ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil) checkEvaluation(stringUdf, "ax") } test("better error message for NPE") { val udf = ScalaUDF( (s: String) => s.toLowerCase(Locale.ROOT), StringType, Literal.create(null, StringType) :: Nil) val e1 = intercept[SparkException](udf.eval()) assert(e1.getMessage.contains("Failed to execute user defined function")) val e2 = intercept[SparkException] { checkEvalutionWithUnsafeProjection(udf, null) } assert(e2.getMessage.contains("Failed to execute user defined function")) } test("SPARK-22695: ScalaUDF should not use global variables") { val ctx = new CodegenContext ScalaUDF((s: String) => s + "x", StringType, Literal("a") :: Nil).genCode(ctx) assert(ctx.inlinedMutableStates.isEmpty) } }
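A user-facing sketch of the behaviour the suite checks: a Scala UDF that NPEs on a null input. When the query runs, the failure typically reaches the driver wrapped in a SparkException whose message chain mentions "Failed to execute user defined function"; the column values are illustrative.

import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object UdfNpeDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udf-npe-demo").getOrCreate()
    import spark.implicits._

    // String arguments are passed through as-is, so a null input NPEs inside the UDF.
    val lower = udf((s: String) => s.toLowerCase)
    val df = Seq("A", null, "b").toDF("s")

    try {
      df.select(lower(col("s"))).collect()
    } catch {
      case e: SparkException => println(e.getMessage)
    }
    spark.stop()
  }
}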
Example 185
Source File: FailureSafeParser.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
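A user-facing sketch of the fail-fast branch above: reading malformed JSON with mode=FAILFAST. The records and schema are illustrative; with the default PERMISSIVE mode the bad record would simply be nulled out (or routed to the corrupt-record column).

import org.apache.spark.SparkException
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StructType}

object FailFastDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("failfast-demo").getOrCreate()
    import spark.implicits._

    val records = Seq("""{"a": 1}""", """{"a":""").toDS()   // second record is malformed
    val schema = new StructType().add("a", IntegerType)

    try {
      spark.read.schema(schema).option("mode", "FAILFAST").json(records).collect()
    } catch {
      case e: SparkException => println(e.getMessage)
    }
    spark.stop()
  }
}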
Example 186
Source File: InsertIntoDataSourceDirCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.SparkException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.catalog._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.datasources._ case class InsertIntoDataSourceDirCommand( storage: CatalogStorageFormat, provider: String, query: LogicalPlan, overwrite: Boolean) extends RunnableCommand { override protected def innerChildren: Seq[LogicalPlan] = query :: Nil override def run(sparkSession: SparkSession): Seq[Row] = { assert(storage.locationUri.nonEmpty, "Directory path is required") assert(provider.nonEmpty, "Data source is required") // Create the relation based on the input logical plan: `query`. val pathOption = storage.locationUri.map("path" -> CatalogUtils.URIToString(_)) val dataSource = DataSource( sparkSession, className = provider, options = storage.properties ++ pathOption, catalogTable = None) val isFileFormat = classOf[FileFormat].isAssignableFrom(dataSource.providingClass) if (!isFileFormat) { throw new SparkException( "Only Data Sources providing FileFormat are supported: " + dataSource.providingClass) } val saveMode = if (overwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { sparkSession.sessionState.executePlan(dataSource.planForWriting(saveMode, query)).toRdd } catch { case ex: AnalysisException => logError(s"Failed to write to directory " + storage.locationUri.toString, ex) throw ex } Seq.empty[Row] } }
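The data-source flavour of the directory insert is driven by the USING clause. A minimal sketch with a hypothetical output directory; a provider that is not a FileFormat (for example jdbc) is rejected with the SparkException shown above.

import org.apache.spark.sql.SparkSession

object InsertDataSourceDirDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("insert-datasource-dir-demo")
      .getOrCreate()

    spark.range(5).createOrReplaceTempView("nums")

    // Writes the query result as parquet files into the directory.
    spark.sql("INSERT OVERWRITE DIRECTORY '/tmp/nums_parquet' USING parquet SELECT id FROM nums")

    spark.stop()
  }
}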
Example 187
Source File: UDTRegistrationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.sql.types._ private[sql] class TestUserClass { } private[sql] class TestUserClass2 { } private[sql] class TestUserClass3 { } private[sql] class NonUserDefinedType { } private[sql] class TestUserClassUDT extends UserDefinedType[TestUserClass] { override def sqlType: DataType = IntegerType override def serialize(input: TestUserClass): Int = 1 override def deserialize(datum: Any): TestUserClass = new TestUserClass override def userClass: Class[TestUserClass] = classOf[TestUserClass] private[spark] override def asNullable: TestUserClassUDT = this override def hashCode(): Int = classOf[TestUserClassUDT].getName.hashCode() override def equals(other: Any): Boolean = other match { case _: TestUserClassUDT => true case _ => false } } class UDTRegistrationSuite extends SparkFunSuite { test("register non-UserDefinedType") { UDTRegistration.register(classOf[TestUserClass].getName, "org.apache.spark.sql.NonUserDefinedType") intercept[SparkException] { UDTRegistration.getUDTFor(classOf[TestUserClass].getName) } } test("default UDTs") { val userClasses = Seq( "org.apache.spark.ml.linalg.Vector", "org.apache.spark.ml.linalg.DenseVector", "org.apache.spark.ml.linalg.SparseVector", "org.apache.spark.ml.linalg.Matrix", "org.apache.spark.ml.linalg.DenseMatrix", "org.apache.spark.ml.linalg.SparseMatrix") userClasses.foreach { c => assert(UDTRegistration.exists(c)) } } test("query registered user class") { UDTRegistration.register(classOf[TestUserClass2].getName, classOf[TestUserClassUDT].getName) assert(UDTRegistration.exists(classOf[TestUserClass2].getName)) assert( classOf[UserDefinedType[_]].isAssignableFrom(( UDTRegistration.getUDTFor(classOf[TestUserClass2].getName).get))) } test("query unregistered user class") { assert(!UDTRegistration.exists(classOf[TestUserClass3].getName)) assert(!UDTRegistration.getUDTFor(classOf[TestUserClass3].getName).isDefined) } }
Example 188
Source File: ParquetFileFormatSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkException import org.apache.spark.sql.QueryTest import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.test.SharedSQLContext class ParquetFileFormatSuite extends QueryTest with ParquetTest with SharedSQLContext { test("read parquet footers in parallel") { def testReadFooters(ignoreCorruptFiles: Boolean): Unit = { withTempDir { dir => val fs = FileSystem.get(sparkContext.hadoopConfiguration) val basePath = dir.getCanonicalPath val path1 = new Path(basePath, "first") val path2 = new Path(basePath, "second") val path3 = new Path(basePath, "third") spark.range(1).toDF("a").coalesce(1).write.parquet(path1.toString) spark.range(1, 2).toDF("a").coalesce(1).write.parquet(path2.toString) spark.range(2, 3).toDF("a").coalesce(1).write.json(path3.toString) val fileStatuses = Seq(fs.listStatus(path1), fs.listStatus(path2), fs.listStatus(path3)).flatten val footers = ParquetFileFormat.readParquetFootersInParallel( sparkContext.hadoopConfiguration, fileStatuses, ignoreCorruptFiles) assert(footers.size == 2) } } testReadFooters(true) val exception = intercept[java.io.IOException] { testReadFooters(false) } assert(exception.getMessage().contains("Could not read footer for file")) } }
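A user-facing sketch of the same flag via spark.sql.files.ignoreCorruptFiles: write one parquet directory and one non-parquet directory under a hypothetical /tmp base path, then read both as parquet. The exact exception type can differ by code path, so this is a sketch rather than a test.

import org.apache.spark.sql.SparkSession

object IgnoreCorruptFilesDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("corrupt-files-demo").getOrCreate()
    val base = "/tmp/corrupt_files_demo"   // hypothetical scratch directory

    spark.range(3).write.mode("overwrite").parquet(s"$base/good")
    spark.range(3).write.mode("overwrite").json(s"$base/bad")    // not parquet

    // With the flag off, reading both directories as parquet fails while footers are read.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "false")
    try {
      spark.read.option("mergeSchema", "true").parquet(s"$base/good", s"$base/bad").count()
    } catch {
      case e: Exception => println(s"read failed: ${e.getMessage}")
    }

    // With the flag on, unreadable files are skipped with a warning.
    spark.conf.set("spark.sql.files.ignoreCorruptFiles", "true")
    println(spark.read.option("mergeSchema", "true").parquet(s"$base/good", s"$base/bad").count())

    spark.stop()
  }
}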
Example 189
Source File: UnionDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
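The user-facing entry point for this class is StreamingContext.union (or DStream.union). A minimal sketch driven by queue streams, which keeps it runnable without an external source; all parents must share the same context and slide duration, as the require calls above enforce.

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object UnionDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("union-demo")
    val ssc = new StreamingContext(conf, Seconds(1))

    val q1 = Queue(ssc.sparkContext.makeRDD(1 to 3))
    val q2 = Queue(ssc.sparkContext.makeRDD(4 to 6))
    val s1 = ssc.queueStream(q1)
    val s2 = ssc.queueStream(q2)

    // Builds a UnionDStream under the hood and prints 1..6 in the first batch.
    ssc.union(Seq(s1, s2)).print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(3000)
    ssc.stop()
  }
}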
Example 190
Source File: TransformedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { // For each parent stream, get its RDD for the current batch time. val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
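The user-facing entry point is DStream.transform, which builds this class under the hood. A minimal sketch using a queue stream; note the empty-RDD fallback, since returning null from the transform function triggers the SparkException above.

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object TransformDemo {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(
      new SparkConf().setMaster("local[2]").setAppName("transform-demo"), Seconds(1))

    val queue = Queue(ssc.sparkContext.makeRDD(Seq("hello streaming", "hello spark")))
    val lines = ssc.queueStream(queue)

    // Never return null from the function; use an empty RDD to represent "no output".
    val words = lines.transform { (rdd: RDD[String], time: Time) =>
      if (rdd.isEmpty()) rdd.sparkContext.emptyRDD[String]
      else rdd.flatMap(_.split(" "))
    }
    words.print()

    ssc.start()
    ssc.awaitTerminationOrTimeout(3000)
    ssc.stop()
  }
}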
Example 191
Source File: StreamingTab.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.streaming.StreamingContext import org.apache.spark.ui.{SparkUI, SparkUITab} private[spark] class StreamingTab(val ssc: StreamingContext) extends SparkUITab(StreamingTab.getSparkUI(ssc), "streaming") with Logging { import StreamingTab._ private val STATIC_RESOURCE_DIR = "org/apache/spark/streaming/ui/static" val parent = getSparkUI(ssc) val listener = ssc.progressListener ssc.addStreamingListener(listener) ssc.sc.addSparkListener(listener) parent.setStreamingJobProgressListener(listener) attachPage(new StreamingPage(this)) attachPage(new BatchPage(this)) def attach() { getSparkUI(ssc).attachTab(this) getSparkUI(ssc).addStaticHandler(STATIC_RESOURCE_DIR, "/static/streaming") } def detach() { getSparkUI(ssc).detachTab(this) getSparkUI(ssc).removeStaticHandler("/static/streaming") } } private object StreamingTab { def getSparkUI(ssc: StreamingContext): SparkUI = { ssc.sc.ui.getOrElse { throw new SparkException("Parent SparkUI to attach this tab to not found!") } } }
Example 192
Source File: StagesResource.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.util.{List => JList} import javax.ws.rs._ import javax.ws.rs.core.MediaType import org.apache.spark.SparkException import org.apache.spark.scheduler.StageInfo import org.apache.spark.status.api.v1.StageStatus._ import org.apache.spark.status.api.v1.TaskSorting._ import org.apache.spark.ui.SparkUI @Produces(Array(MediaType.APPLICATION_JSON)) private[v1] class StagesResource extends BaseAppResource { @GET def stageList(@QueryParam("status") statuses: JList[StageStatus]): Seq[StageData] = { withUI(_.store.stageList(statuses)) } @GET @Path("{stageId: \\d+}") def stageData( @PathParam("stageId") stageId: Int, @QueryParam("details") @DefaultValue("true") details: Boolean): Seq[StageData] = { withUI { ui => val ret = ui.store.stageData(stageId, details = details) if (ret.nonEmpty) { ret } else { throw new NotFoundException(s"unknown stage: $stageId") } } } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}") def oneAttemptData( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @QueryParam("details") @DefaultValue("true") details: Boolean): StageData = withUI { ui => try { ui.store.stageAttempt(stageId, stageAttemptId, details = details) } catch { case _: NoSuchElementException => // Change the message depending on whether there are any attempts for the requested stage. val all = ui.store.stageData(stageId) val msg = if (all.nonEmpty) { val ids = all.map(_.attemptId) s"unknown attempt for stage $stageId. Found attempts: [${ids.mkString(",")}]" } else { s"unknown stage: $stageId" } throw new NotFoundException(msg) } } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}/taskSummary") def taskSummary( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @DefaultValue("0.05,0.25,0.5,0.75,0.95") @QueryParam("quantiles") quantileString: String) : TaskMetricDistributions = withUI { ui => val quantiles = quantileString.split(",").map { s => try { s.toDouble } catch { case nfe: NumberFormatException => throw new BadParameterException("quantiles", "double", s) } } ui.store.taskSummary(stageId, stageAttemptId, quantiles).getOrElse( throw new NotFoundException(s"No tasks reported metrics for $stageId / $stageAttemptId yet.")) } @GET @Path("{stageId: \\d+}/{stageAttemptId: \\d+}/taskList") def taskList( @PathParam("stageId") stageId: Int, @PathParam("stageAttemptId") stageAttemptId: Int, @DefaultValue("0") @QueryParam("offset") offset: Int, @DefaultValue("20") @QueryParam("length") length: Int, @DefaultValue("ID") @QueryParam("sortBy") sortBy: TaskSorting): Seq[TaskData] = { withUI(_.store.taskList(stageId, stageAttemptId, offset, length, sortBy)) } }
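The resource above is served under the application's REST API root. A minimal client-side sketch; the UI address and application id are hypothetical placeholders (the id can be read from sc.applicationId on a running application).

import scala.io.Source

object StagesRestDemo {
  def main(args: Array[String]): Unit = {
    val uiBase = "http://localhost:4040/api/v1"   // hypothetical driver UI address
    val appId = "local-1530000000000"             // hypothetical application id

    // Maps to stageList(); append /{stageId} or /{stageId}/{attemptId}/taskSummary
    // to reach the other endpoints defined above.
    val stagesJson = Source.fromURL(s"$uiBase/applications/$appId/stages").mkString
    println(stagesJson.take(500))
  }
}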
Example 193
Source File: RpcEndpointAddress.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.SparkException private[spark] case class RpcEndpointAddress(rpcAddress: RpcAddress, name: String) { require(name != null, "RpcEndpoint name must be provided.") def this(host: String, port: Int, name: String) = { this(RpcAddress(host, port), name) } override val toString = if (rpcAddress != null) { s"spark://$name@${rpcAddress.host}:${rpcAddress.port}" } else { s"spark-client://$name" } } private[spark] object RpcEndpointAddress { def apply(host: String, port: Int, name: String): RpcEndpointAddress = { new RpcEndpointAddress(host, port, name) } def apply(sparkUrl: String): RpcEndpointAddress = { try { val uri = new java.net.URI(sparkUrl) val host = uri.getHost val port = uri.getPort val name = uri.getUserInfo if (uri.getScheme != "spark" || host == null || port < 0 || name == null || (uri.getPath != null && !uri.getPath.isEmpty) || // uri.getPath returns "" instead of null uri.getFragment != null || uri.getQuery != null) { throw new SparkException("Invalid Spark URL: " + sparkUrl) } new RpcEndpointAddress(host, port, name) } catch { case e: java.net.URISyntaxException => throw new SparkException("Invalid Spark URL: " + sparkUrl, e) } } }
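RpcEndpointAddress is private to Spark, so the standalone sketch below mirrors the core of the same java.net.URI-based validation of the spark://name@host:port form; the endpoint name and address are illustrative.

import java.net.URI

object SparkUrlParseDemo {
  def parse(sparkUrl: String): (String, Int, String) = {
    val uri = new URI(sparkUrl)
    val (host, port, name) = (uri.getHost, uri.getPort, uri.getUserInfo)
    require(uri.getScheme == "spark" && host != null && port >= 0 && name != null,
      s"Invalid Spark URL: $sparkUrl")
    (host, port, name)
  }

  def main(args: Array[String]): Unit = {
    // -> (192.168.1.10,7077,CoarseGrainedScheduler)
    println(parse("spark://CoarseGrainedScheduler@192.168.1.10:7077"))
  }
}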
Example 194
Source File: RUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.api.r import java.io.File import java.util.Arrays import org.apache.spark.{SparkEnv, SparkException} private[spark] object RUtils { // Local path where R binary packages built from R source code contained in the spark // packages specified with "--packages" or "--jars" command line option reside. var rPackages: Option[String] = None def isRInstalled: Boolean = { try { val builder = new ProcessBuilder(Arrays.asList("R", "--version")) builder.start().waitFor() == 0 } catch { case e: Exception => false } } }
Example 195
Source File: RpcAddressSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rpc import org.apache.spark.{SparkException, SparkFunSuite} class RpcAddressSuite extends SparkFunSuite { test("hostPort") { val address = RpcAddress("1.2.3.4", 1234) assert(address.host == "1.2.3.4") assert(address.port == 1234) assert(address.hostPort == "1.2.3.4:1234") } test("fromSparkURL") { val address = RpcAddress.fromSparkURL("spark://1.2.3.4:1234") assert(address.host == "1.2.3.4") assert(address.port == 1234) } test("fromSparkURL: a typo url") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("spark://1.2. 3.4:1234") } assert("Invalid master URL: spark://1.2. 3.4:1234" === e.getMessage) } test("fromSparkURL: invalid scheme") { val e = intercept[SparkException] { RpcAddress.fromSparkURL("invalid://1.2.3.4:1234") } assert("Invalid master URL: invalid://1.2.3.4:1234" === e.getMessage) } test("toSparkURL") { val address = RpcAddress("1.2.3.4", 1234) assert(address.toSparkURL == "spark://1.2.3.4:1234") } }
Example 196
Source File: KryoSerializerResizableOutputSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.LocalSparkContext._ import org.apache.spark.SparkContext import org.apache.spark.SparkException class KryoSerializerResizableOutputSuite extends SparkFunSuite { // trial and error showed this will not serialize with 1mb buffer val x = (1 to 400000).toArray test("kryo without resizable output buffer should fail on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "1m") withSpark(new SparkContext("local", "test", conf)) { sc => intercept[SparkException](sc.parallelize(x).collect()) } } test("kryo with resizable output buffer should succeed on large array") { val conf = new SparkConf(false) conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") conf.set("spark.kryoserializer.buffer", "1m") conf.set("spark.kryoserializer.buffer.max", "2m") withSpark(new SparkContext("local", "test", conf)) { sc => assert(sc.parallelize(x).collect() === x) } } }
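The same configuration knobs from application code: a minimal sketch that collects a record larger than the initial 1m Kryo buffer after raising spark.kryoserializer.buffer.max (the 64m ceiling is just an illustrative value).

import org.apache.spark.{SparkConf, SparkContext}

object KryoBufferDemo {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("kryo-buffer-demo")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .set("spark.kryoserializer.buffer", "1m")
      .set("spark.kryoserializer.buffer.max", "64m")  // leaving this at 1m reproduces the failure

    val sc = new SparkContext(conf)
    try {
      val big = (1 to 400000).toArray
      // With buffer.max pinned to 1m this collect fails with a SparkException (buffer overflow);
      // with a larger ceiling the buffer resizes and the job succeeds.
      println(sc.parallelize(Seq(big)).collect().head.length)
    } finally {
      sc.stop()
    }
  }
}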
Example 197
Source File: ProactiveClosureSerializationSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite} import org.apache.spark.rdd.RDD class UnserializableClass { def op[T](x: T): String = x.toString def pred[T](x: T): Boolean = x.toString.length % 2 == 0 } class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext { def fixture: (RDD[String], UnserializableClass) = { (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass) } test("throws expected serialization exceptions on actions") { val (data, uc) = fixture val ex = intercept[SparkException] { data.map(uc.op(_)).count() } assert(ex.getMessage.contains("Task not serializable")) } // There is probably a cleaner way to eliminate boilerplate here, but we're // iterating over a map from transformation names to functions that perform that // transformation on a given RDD, creating one test case for each for (transformation <- Map("map" -> xmap _, "flatMap" -> xflatMap _, "filter" -> xfilter _, "mapPartitions" -> xmapPartitions _, "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) { val (name, xf) = transformation test(s"$name transformations throw proactive serialization exceptions") { val (data, uc) = fixture val ex = intercept[SparkException] { xf(data, uc) } assert(ex.getMessage.contains("Task not serializable"), s"RDD.$name doesn't proactively throw NotSerializableException") } } private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.map(y => uc.op(y)) private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] = x.flatMap(y => Seq(uc.op(y))) private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] = x.filter(y => uc.pred(y)) private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitions(_.map(y => uc.op(y))) private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] = x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y))) }
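A minimal, self-contained sketch of the behaviour under test: a transformation whose closure drags in a non-serializable object is rejected eagerly, before any task runs, with a "Task not serializable" SparkException.

import org.apache.spark.{SparkConf, SparkContext, SparkException}

class Unserializable {                 // deliberately does not extend Serializable
  def op(x: Int): Int = x + 1
}

object ClosureDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("closure-demo"))
    val helper = new Unserializable

    try {
      // The closure captures `helper`, so the ClosureCleaner rejects it at map() time.
      sc.parallelize(1 to 10).map(helper.op).count()
    } catch {
      case e: SparkException => println(e.getMessage)   // mentions "Task not serializable"
    } finally {
      sc.stop()
    }
  }
}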