org.apache.spark.deploy.SparkHadoopUtil Scala Examples
The following examples show how to use org.apache.spark.deploy.SparkHadoopUtil.
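Across the examples that follow, SparkHadoopUtil appears in a handful of recurring roles: building a Hadoop Configuration with SparkHadoopUtil.get.conf or SparkHadoopUtil.get.newConfiguration(sparkConf), resolving a FileSystem from that configuration, and listing files with SparkHadoopUtil.get.listLeafStatuses. The sketch below condenses those patterns into one place; it is illustrative only. The object name SparkHadoopUtilQuickstart and the path /tmp/spark-hadoop-util-demo are placeholders, and in Spark versions where SparkHadoopUtil is marked private[spark], code like this has to live under an org.apache.spark package, as most of the suites below do.

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object SparkHadoopUtilQuickstart {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkHadoopUtil demo")

    // Build a Hadoop Configuration that also picks up spark.hadoop.* entries from the SparkConf.
    val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf)

    // Resolve the FileSystem for a path, much like the relation suites below do via
    // SparkHadoopUtil.get.conf.
    val path = new Path("/tmp/spark-hadoop-util-demo") // placeholder path
    val fs = path.getFileSystem(hadoopConf)

    // List leaf files under the path, skipping directories and metadata files,
    // mirroring the listLeafStatuses usage in the OrcFileOperator examples.
    if (fs.exists(path)) {
      SparkHadoopUtil.get.listLeafStatuses(fs, path)
        .filterNot(_.isDirectory)
        .map(_.getPath)
        .filterNot(p => p.getName.startsWith("_") || p.getName.startsWith("."))
        .foreach(p => println(s"found: $p"))
    }
  }
}

Most of the test suites below use SparkHadoopUtil.get.conf simply as a convenient, already-initialized Hadoop Configuration for resolving a FileSystem; the YARN and scheduler examples lean on the richer helpers such as listFilesSorted and getTimeFromNowToRenewal.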
Example 1
Source File: OrcFileOperator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 2
Source File: CommitFailureTestRelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 3
Source File: ExecutorDelegationTokenUpdater.scala From iolap with Apache License 2.0
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(hadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 4
Source File: SimrSchedulerBackend.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 5
Source File: WholeTextFileRecordReader.scala From iolap with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 6
Source File: OrcHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.{Row, SQLConf}
import org.apache.spark.sql.sources.HadoopFsRelationTest
import org.apache.spark.sql.types._

class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = classOf[DefaultSource].getCanonicalName

  import sqlContext._
  import sqlContext.implicits._

  // ORC does not play well with NullType and UDT.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: NullType => false
    case _: CalendarIntervalType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  // Partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
          .toDF("a", "b", "p1")
          .write
          .orc(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.options(Map(
          "path" -> file.getCanonicalPath,
          "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load())
    }
  }

  // 'Not' is included in ORC filter pushdown
  test("SPARK-12218: 'Not' is included in ORC filter pushdown") {
    import testImplicits._

    withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") {
      withTempPath { dir =>
        val path = s"${dir.getCanonicalPath}/table1"
        (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path)

        checkAnswer(
          sqlContext.read.orc(path).where("not (a = 2) or not(b in ('1'))"),
          (1 to 5).map(i => Row(i, (i % 2).toString)))

        checkAnswer(
          sqlContext.read.orc(path).where("not (a = 2 and b in ('1'))"),
          (1 to 5).map(i => Row(i, (i % 2).toString)))
      }
    }
  }
}
Example 7
Source File: JsonHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import java.math.BigDecimal

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = "json"

  import sqlContext._

  // JSON does not write data of NullType and does not play well with BinaryType.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: NullType => false
    case _: BinaryType => false
    case _: CalendarIntervalType => false
    case _ => true
  }

  // save()/load() - partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  // Save complex types to JSON
  test("SPARK-9894: save complex types to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("array", ArrayType(LongType))
        .add("map", MapType(StringType, new StructType().add("innerField", LongType)))

      val data =
        Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) ::
        Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil
      val df = createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }

  // Save decimal type to JSON
  test("SPARK-10196: save decimal type to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("decimal", DecimalType(7, 2))

      val data =
        Row(new BigDecimal("10.02")) ::
        Row(new BigDecimal("20000.99")) ::
        Row(new BigDecimal("10000")) :: Nil
      val df = createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }
}
Example 8
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.test.SQLTestUtils

class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
  override def _sqlContext: SQLContext = TestHive
  private val sqlContext = _sqlContext

  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName

  // commitTask() failure should fall back to abortTask()
  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
    withTempPath { file =>
      // Here we coalesce partition number to 1 to ensure that only a single task is issued. This
      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
      // directory while committing/aborting the job. See SPARK-8513 for more details.
      val df = sqlContext.range(0, 10).coalesce(1)
      intercept[SparkException] {
        df.write.format(dataSourceName).save(file.getCanonicalPath)
      }

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }
}
Example 9
Source File: SimpleTextHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  import sqlContext._

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  // save()/load() - partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }
}
Example 10
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache(
    hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor(
    ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable = new Runnable {
    override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
  }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
          val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
          if (suffix > lastCredentialsFileSuffix) {
            logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
            val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
            lastCredentialsFileSuffix = suffix
            UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
            logInfo("Tokens updated from credentials file.")
          } else {
            // Check every hour to see if new credentials arrived.
            logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
              "tokens yet, will check again in an hour.")
            delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
            return
          }
        }
      val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal(
        sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        executorUpdaterRunnable.run()
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }
}
Example 11
Source File: EventLogDownloadResource.scala From spark1.52 with Apache License 2.0
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }

      // Implement the StreamingOutput interface
      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          // Package the event logs with a ZipOutputStream
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }
        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
}
Example 12
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // Hostname or IP address where the driver runs
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
Example 13
Source File: WholeTextFileRecordReader.scala From spark1.52 with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 14
Source File: SparkPodInitContainer.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.k8s import java.io.File import java.util.concurrent.TimeUnit import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{SecurityManager => SparkSecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.internal.Logging import org.apache.spark.util.{ThreadUtils, Utils} private[spark] class SparkPodInitContainer( sparkConf: SparkConf, fileFetcher: FileFetcher) extends Logging { private val maxThreadPoolSize = sparkConf.get(INIT_CONTAINER_MAX_THREAD_POOL_SIZE) private implicit val downloadExecutor = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("download-executor", maxThreadPoolSize)) private val jarsDownloadDir = new File(sparkConf.get(JARS_DOWNLOAD_LOCATION)) private val filesDownloadDir = new File(sparkConf.get(FILES_DOWNLOAD_LOCATION)) private val remoteJars = sparkConf.get(INIT_CONTAINER_REMOTE_JARS) private val remoteFiles = sparkConf.get(INIT_CONTAINER_REMOTE_FILES) private val downloadTimeoutMinutes = sparkConf.get(INIT_CONTAINER_MOUNT_TIMEOUT) def run(): Unit = { logInfo(s"Downloading remote jars: $remoteJars") downloadFiles( remoteJars, jarsDownloadDir, s"Remote jars download directory specified at $jarsDownloadDir does not exist " + "or is not a directory.") logInfo(s"Downloading remote files: $remoteFiles") downloadFiles( remoteFiles, filesDownloadDir, s"Remote files download directory specified at $filesDownloadDir does not exist " + "or is not a directory.") downloadExecutor.shutdown() downloadExecutor.awaitTermination(downloadTimeoutMinutes, TimeUnit.MINUTES) } private def downloadFiles( filesCommaSeparated: Option[String], downloadDir: File, errMessage: String): Unit = { filesCommaSeparated.foreach { files => require(downloadDir.isDirectory, errMessage) Utils.stringToSeq(files).foreach { file => Future[Unit] { fileFetcher.fetchFile(file, downloadDir) } } } } } private class FileFetcher(sparkConf: SparkConf, securityManager: SparkSecurityManager) { def fetchFile(uri: String, targetDir: File): Unit = { Utils.fetchFile( url = uri, targetDir = targetDir, conf = sparkConf, securityMgr = securityManager, hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf), timestamp = System.currentTimeMillis(), useCache = false) } } object SparkPodInitContainer extends Logging { def main(args: Array[String]): Unit = { logInfo("Starting init-container to download Spark application dependencies.") val sparkConf = new SparkConf(true) if (args.nonEmpty) { Utils.loadDefaultSparkProperties(sparkConf, args(0)) } val securityManager = new SparkSecurityManager(sparkConf) val fileFetcher = new FileFetcher(sparkConf, securityManager) new SparkPodInitContainer(sparkConf, fileFetcher).run() logInfo("Finished downloading application dependencies.") } }
Example 15
Source File: HiveUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive import java.net.URL import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("newTemporaryConfiguration overwrites listener configurations") { Seq(true, false).foreach { useInMemoryDerby => val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "") } } test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") { sys.props.put("spark.hadoop.foo", "bar") Seq(true, false) foreach { useInMemoryDerby => val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(!hiveConf.contains("spark.hadoop.foo")) assert(hiveConf("foo") === "bar") } } test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) try { Thread.currentThread().setContextClassLoader(loader) HiveUtils.newClientForMetadata( conf, SparkHadoopUtil.newConfiguration(conf), HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) } finally { Thread.currentThread().setContextClassLoader(contextClassLoader) } } test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") } }
Example 16
Source File: OrcHadoopFsRelationSuite.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[DefaultSource].getCanonicalName import sqlContext._ import sqlContext.implicits._ test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") .write .format("orc") .save(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( load( source = dataSourceName, options = Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json))) } } }
Example 17
Source File: HiveCliSessionStateSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.cli.CliSessionState import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.HiveUtils class HiveCliSessionStateSuite extends SparkFunSuite { def withSessionClear(f: () => Unit): Unit = { try f finally SessionState.detachSession() } test("CliSessionState will be reused") { withSessionClear { () => val hiveConf = new HiveConf(classOf[SessionState]) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hiveConf.set(key, value) } val sessionState: SessionState = new CliSessionState(hiveConf) SessionState.start(sessionState) val s1 = SessionState.get val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) val s2 = HiveUtils.newClientForMetadata(sparkConf, hadoopConf).getState assert(s1 === s2) assert(s2.isInstanceOf[CliSessionState]) } } test("SessionState will not be reused") { withSessionClear { () => val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hadoopConf.set(key, value) } val hiveClient = HiveUtils.newClientForMetadata(sparkConf, hadoopConf) val s1 = hiveClient.getState val s2 = hiveClient.newSession().getState assert(s1 !== s2) } } }
Example 18
Source File: DriverWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.worker import java.io.File import org.apache.commons.lang3.StringUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{DependencyUtils, SparkHadoopUtil, SparkSubmit} import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() val host: String = Utils.localHostName() val port: Int = sys.props.getOrElse("spark.driver.port", "0").toInt val rpcEnv = RpcEnv.create("Driver", host, port, conf, new SecurityManager(conf)) logInfo(s"Driver address: ${rpcEnv.address}") rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl)) val currentLoader = Thread.currentThread.getContextClassLoader val userJarUrl = new File(userJar).toURI().toURL() val loader = if (sys.props.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { new ChildFirstURLClassLoader(Array(userJarUrl), currentLoader) } else { new MutableURLClassLoader(Array(userJarUrl), currentLoader) } Thread.currentThread.setContextClassLoader(loader) setupDependencies(loader, userJar) // Delegate to supplied main class val clazz = Utils.classForName(mainClass) val mainMethod = clazz.getMethod("main", classOf[Array[String]]) mainMethod.invoke(null, extraArgs.toArray[String]) rpcEnv.shutdown() case _ => // scalastyle:off println System.err.println("Usage: DriverWrapper <workerUrl> <userJar> <driverMainClass> [options]") // scalastyle:on println System.exit(-1) } } private def setupDependencies(loader: MutableURLClassLoader, userJar: String): Unit = { val sparkConf = new SparkConf() val secMgr = new SecurityManager(sparkConf) val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = Seq( "spark.jars.excludes", "spark.jars.packages", "spark.jars.repositories", "spark.jars.ivy", "spark.jars.ivySettings" ).map(sys.props.get(_).orNull) val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(packagesExclusions, packages, repositories, ivyRepoPath, Option(ivySettingsPath)) val jars = { val jarsProp = sys.props.get("spark.jars").orNull if (!StringUtils.isBlank(resolvedMavenCoordinates)) { SparkSubmit.mergeFileLists(jarsProp, resolvedMavenCoordinates) } else { jarsProp } } val localJars = DependencyUtils.resolveAndDownloadJars(jars, userJar, sparkConf, hadoopConf, secMgr) DependencyUtils.addJarsToClassPath(localJars, loader) } }
Example 19
Source File: OrcFileOperator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 20
Source File: OrcHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.{Row, SQLConf} import org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { import testImplicits._ override val dataSourceName: String = classOf[DefaultSource].getCanonicalName // ORC does not play well with NullType and UDT. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") .write .orc(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.options(Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load()) } } test("SPARK-12218: 'Not' is included in ORC filter pushdown") { import testImplicits._ withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { withTempPath { dir => val path = s"${dir.getCanonicalPath}/table1" (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path) checkAnswer( sqlContext.read.orc(path).where("not (a = 2) or not(b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) checkAnswer( sqlContext.read.orc(path).where("not (a = 2 and b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) } } } }
Example 21
Source File: JsonHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 22
Source File: CommitFailureTestRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 23
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 24
Source File: EventLogDownloadResource.scala From BigDatalog with Apache License 2.0
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 25
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 26
Source File: WholeTextFileRecordReader.scala From BigDatalog with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 27
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.las import org.apache.spark.sql.types._ import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext } import java.io.DataOutputStream import org.apache.spark.sql.sources.OutputWriter import org.apache.spark.deploy.SparkHadoopUtil import org.apache.hadoop.io.{ NullWritable, BytesWritable } import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.fs.Path import java.text.NumberFormat import org.apache.spark.sql.{ Row, SQLContext, sources } import fr.ign.spark.iqmulus.RowOutputStream class LasOutputWriter( name: String, context: TaskAttemptContext, dataSchema: StructType, formatOpt: Option[Byte] = None, version: Version = Version(), offset: Array[Double] = Array(0F, 0F, 0F), scale: Array[Double] = Array(0.01F, 0.01F, 0.01F) ) extends OutputWriter { private val file = { val path = getDefaultWorkFile("/1.pdr") val fs = path.getFileSystem(context.getConfiguration) fs.create(path) } private val pmin = Array.fill[Double](3)(Double.PositiveInfinity) private val pmax = Array.fill[Double](3)(Double.NegativeInfinity) private val countByReturn = Array.fill[Long](15)(0) private def count = countByReturn.sum private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema)) // todo, extra bytes private val schema = LasHeader.schema(format) private def header = new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn) private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema) def getDefaultWorkFile(extension: String): Path = { val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") val taskAttemptId: TaskAttemptID = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(name, f"$split%05d-$uniqueWriteJobId$extension") } override def write(row: Row): Unit = { recordWriter.write(row) // gather statistics for the header val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble val ret = row.getAs[Byte]("flags") & 0x3 countByReturn(ret) += 1 pmin(0) = Math.min(pmin(0), x) pmin(1) = Math.min(pmin(1), y) pmin(2) = Math.min(pmin(2), z) pmax(0) = Math.max(pmax(0), x) pmax(1) = Math.max(pmax(1), y) pmax(2) = Math.max(pmax(2), z) } override def close(): Unit = { recordWriter.close // write header val path = getDefaultWorkFile("/0.header") val fs = path.getFileSystem(context.getConfiguration) val dos = new java.io.DataOutputStream(fs.create(path)) header.write(dos) dos.close // copy header and pdf to a final las file (1 per split) org.apache.hadoop.fs.FileUtil.copyMerge( fs, getDefaultWorkFile("/"), fs, getDefaultWorkFile(".las"), true, context.getConfiguration, "" ) } }
Example 28
Source File: AngelClientFactory.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.context

import com.tencent.angel.client.AngelClient
import com.tencent.angel.client.local.AngelLocalClient
import com.tencent.angel.client.yarn.AngelYarnClient
import org.apache.spark.SparkContext
import org.apache.spark.deploy.SparkHadoopUtil

object AngelClientFactory {
  def get(sc: SparkContext): AngelClient = {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.getConf)

    if (sc.isLocal) {
      new AngelLocalClient(conf)
    } else {
      new AngelYarnClient(conf)
    }
  }
}
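A minimal usage sketch, assuming a SparkContext is already set up; the application name and master URL are placeholders, not part of the example:

import org.apache.spark.{SparkConf, SparkContext}
import com.tencent.angel.sona.context.AngelClientFactory

// Hypothetical caller: the factory returns AngelLocalClient for a local master,
// AngelYarnClient otherwise, both configured via SparkHadoopUtil.
val sc = new SparkContext(new SparkConf().setAppName("angel-demo").setMaster("local[2]"))
val client = AngelClientFactory.get(sc)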
Example 29
Source File: TreeUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.tree

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object TreeUtils {
  def getFileSystem(conf: SparkConf, path: Path): FileSystem = {
    val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
    if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) {
      val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) {
        sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml"
      } else {
        sys.env.get("YARN_CONF_DIR").get + "/core-site.xml"
      }
      hadoopConf.addResource(new Path(hdfsConfPath))
    }
    path.getFileSystem(hadoopConf)
  }

  def getPartitionOffsets(upper: Int, numPartitions: Int): (Array[Int], Array[Int]) = {
    val npp = upper / numPartitions
    val nppp = npp + 1
    val residual = upper - npp * numPartitions
    val boundary = residual * nppp
    val startPP = new Array[Int](numPartitions)
    val lcLenPP = new Array[Int](numPartitions)
    var i = 0
    while (i < numPartitions) {
      if (i < residual) {
        startPP(i) = nppp * i
        lcLenPP(i) = nppp
      } else {
        startPP(i) = boundary + (i - residual) * npp
        lcLenPP(i) = npp
      }
      i += 1
    }
    (startPP, lcLenPP)
  }
}
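As a quick sanity check of getPartitionOffsets, a small worked example with values computed from the code above:

// 10 items over 3 partitions: the single leftover item goes to the first partition.
val (starts, lengths) = TreeUtils.getPartitionOffsets(10, 3)
// starts  == Array(0, 4, 7)
// lengths == Array(4, 3, 3)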
Example 30
Source File: SparkUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.util import breeze.linalg.{Vector => BV, SparseVector => BSV, DenseVector => BDV} import breeze.storage.Zero import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mllib.linalg.{DenseVector => SDV, Vector => SV, SparseVector => SSV} import scala.language.implicitConversions import scala.reflect.ClassTag private[zen] object SparkUtils { implicit def toBreeze(sv: SV): BV[Double] = { sv match { case SDV(data) => new BDV(data) case SSV(size, indices, values) => new BSV(indices, values, size) } } implicit def fromBreeze(breezeVector: BV[Double]): SV = { breezeVector match { case v: BDV[Double] => if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { new SDV(v.data) } else { new SDV(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[Double] => if (v.index.length == v.used) { new SSV(v.length, v.index, v.data) } else { new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) } case v: BV[_] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } } def toBreezeConv[T: ClassTag](sv: SV)(implicit num: Numeric[T]): BV[T] = { val zero = num.zero implicit val conv: Array[Double] => Array[T] = (data) => { data.map(ele => (zero match { case zero: Double => ele case zero: Float => ele.toFloat case zero: Int => ele.toInt case zero: Long => ele.toLong }).asInstanceOf[T]).array } sv match { case SDV(data) => new BDV[T](data) case SSV(size, indices, values) => new BSV[T](indices, values, size)(Zero[T](zero)) } } def fromBreezeConv[T: ClassTag](breezeVector: BV[T])(implicit num: Numeric[T]): SV = { implicit val conv: Array[T] => Array[Double] = (data) => { data.map(num.toDouble).array } breezeVector match { case v: BDV[T] => if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { new SDV(v.data) } else { new SDV(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[T] => if (v.index.length == v.used) { new SSV(v.length, v.index, v.data) } else { new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) } case v: BV[T] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } } def getFileSystem(conf: SparkConf, path: Path): FileSystem = { val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) { val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) { sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml" } else { sys.env.get("YARN_CONF_DIR").get + "/core-site.xml" } hadoopConf.addResource(new Path(hdfsConfPath)) } path.getFileSystem(hadoopConf) } def deleteChkptDirs(conf: SparkConf, dirs: Array[String]): Unit = { val fs = getFileSystem(conf, new Path(dirs(0))) dirs.foreach(dir => { fs.delete(new Path(dir), true) }) } }
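A short sketch of the toBreeze/fromBreeze conversions above; note the object is private[zen], so this only compiles from code inside that package:

import breeze.linalg.{Vector => BV}
import org.apache.spark.mllib.linalg.Vectors

// Round-trip an MLlib sparse vector through Breeze using the conversions above.
val sv = Vectors.sparse(5, Array(1, 3), Array(2.0, 4.0))
val bv: BV[Double] = SparkUtils.toBreeze(sv)
val back = SparkUtils.fromBreeze(bv)   // an MLlib SparseVector with the same contents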
Example 31
Source File: OrcFileOperator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 32
Source File: JsonHadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 33
Source File: CommitFailureTestRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 34
Source File: SimpleTextHadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 35
Source File: IOEncryptionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io._ import java.nio.charset.StandardCharsets import java.security.PrivilegedExceptionAction import java.util.UUID import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Matchers} import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.config._ import org.apache.spark.serializer._ import org.apache.spark.storage._ class IOEncryptionSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach { private[this] val blockId = new TempShuffleBlockId(UUID.randomUUID()) private[this] val conf = new SparkConf() private[this] val ugi = UserGroupInformation.createUserForTesting("testuser", Array("testgroup")) private[this] val serializer = new KryoSerializer(conf) override def beforeAll(): Unit = { System.setProperty("SPARK_YARN_MODE", "true") ugi.doAs(new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) val creds = new Credentials() SecurityManager.initIOEncryptionKey(conf, creds) SparkHadoopUtil.get.addCurrentUserCredentials(creds) } }) } override def afterAll(): Unit = { SparkEnv.set(null) System.clearProperty("SPARK_YARN_MODE") } override def beforeEach(): Unit = { super.beforeEach() } override def afterEach(): Unit = { super.afterEach() conf.set("spark.shuffle.compress", false.toString) conf.set("spark.shuffle.spill.compress", false.toString) } test("IO encryption read and write") { ugi.doAs(new PrivilegedExceptionAction[Unit] { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) conf.set("spark.shuffle.compress", false.toString) conf.set("spark.shuffle.spill.compress", false.toString) testYarnIOEncryptionWriteRead() } }) } test("IO encryption read and write with shuffle compression enabled") { ugi.doAs(new PrivilegedExceptionAction[Unit] { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) conf.set("spark.shuffle.compress", true.toString) conf.set("spark.shuffle.spill.compress", true.toString) testYarnIOEncryptionWriteRead() } }) } private[this] def testYarnIOEncryptionWriteRead(): Unit = { val plainStr = "hello world" val outputStream = new ByteArrayOutputStream() val serializerManager = new SerializerManager(serializer, conf) val wrappedOutputStream = serializerManager.wrapStream(blockId, outputStream) wrappedOutputStream.write(plainStr.getBytes(StandardCharsets.UTF_8)) wrappedOutputStream.close() val encryptedBytes = outputStream.toByteArray val encryptedStr = new String(encryptedBytes) assert(plainStr !== encryptedStr) val inputStream = new ByteArrayInputStream(encryptedBytes) val wrappedInputStream = serializerManager.wrapStream(blockId, inputStream) val decryptedBytes = new Array[Byte](1024) val len = wrappedInputStream.read(decryptedBytes) val decryptedStr = new String(decryptedBytes, 0, len, StandardCharsets.UTF_8) assert(decryptedStr === plainStr) } }
Example 36
Source File: CryptoStreamUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.security

import java.io.{InputStream, OutputStream}
import java.util.Properties
import javax.crypto.spec.{IvParameterSpec, SecretKeySpec}

import org.apache.commons.crypto.random._
import org.apache.commons.crypto.stream._
import org.apache.hadoop.io.Text

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

  private[this] def createInitializationVector(properties: Properties): Array[Byte] = {
    val iv = new Array[Byte](IV_LENGTH_IN_BYTES)
    val initialIVStart = System.currentTimeMillis()
    CryptoRandomFactory.getCryptoRandom(properties).nextBytes(iv)
    val initialIVFinish = System.currentTimeMillis()
    val initialIVTime = initialIVFinish - initialIVStart
    if (initialIVTime > 2000) {
      logWarning(s"It costs ${initialIVTime} milliseconds to create the Initialization Vector " +
        s"used by CryptoStream")
    }
    iv
  }
}
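The listing above omits the enclosing object declaration, so only the IV helper is visible. For orientation, a standalone sketch of the same idea, under the assumption that IV_LENGTH_IN_BYTES is the 16-byte AES block size:

import java.util.Properties
import org.apache.commons.crypto.random.CryptoRandomFactory

// Assumption: 16 bytes, matching the AES block size.
val iv = new Array[Byte](16)
val start = System.currentTimeMillis()
CryptoRandomFactory.getCryptoRandom(new Properties()).nextBytes(iv)
val elapsed = System.currentTimeMillis() - start   // a large value here flags a slow entropy source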
Example 37
Source File: EventLogDownloadResource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 38
Source File: HdfsFileAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil
import org.archive.archivespark.sparkling.io.IOUtil

class HdfsFileAccessor(path: String, decompress: Boolean = true)
  extends CloseableDataAccessor[InputStream] {

  override def get: Option[InputStream] = {
    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    var stream: InputStream = null
    try {
      val raw = fs.open(new Path(path))
      stream = if (decompress) IOUtil.decompress(raw, Some(path)) else raw
      Some(stream)
    } catch {
      case e: Exception =>
        e.printStackTrace()
        if (stream != null) stream.close()
        None
    }
  }
}
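A hedged usage sketch; the HDFS path is a placeholder:

// Open, consume, and close the (optionally decompressed) stream.
val accessor = new HdfsFileAccessor("/data/archive/example.warc.gz")
accessor.get.foreach { in =>
  try {
    val firstByte = in.read()
    // ... process the stream ...
  } finally {
    in.close()
  }
}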
Example 39
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo)
  extends CloseableDataAccessor[InputStream] {

  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
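A usage sketch under the assumption that HdfsLocationInfo is a simple holder of the path, offset, and length fields referenced above; its actual constructor may differ:

// Hypothetical location value; parameter names are assumed, not confirmed by the example.
val location = HdfsLocationInfo(path = "/data/archive/example.warc.gz", offset = 1024L, length = 4096L)
val accessor = new HdfsStreamAccessor(location)
accessor.get.foreach { in =>
  try {
    // reads are bounded to the 4096-byte window starting at offset 1024
  } finally {
    in.close()
  }
}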
Example 40
Source File: FilePathMap.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.util

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

import scala.util.Try

case class FilePathMap(path: String, patterns: Seq[String] = Seq.empty) {
  val pathMap: Map[String, String] = {
    var map = collection.mutable.Map[String, String]()

    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    val files = fs.listFiles(new Path(path), true)
    while (files.hasNext) {
      val path = files.next.getPath
      val filename = path.getName
      if (patterns.isEmpty || patterns.exists(filename.matches)) {
        if (map.contains(filename)) throw new RuntimeException("duplicate filename: " + filename)
        map += filename -> path.getParent.toString.intern
      }
    }

    map.toMap
  }

  def pathToFile(file: String): Option[Path] = Try { new Path(file).getName }.toOption match {
    case Some(f) => pathMap.get(f).map(dir => new Path(dir, f))
    case None => None
  }
}
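An illustrative call, with a placeholder directory and pattern:

// Index every *.warc.gz under the base directory, then resolve a bare filename
// back to its full HDFS path (None if it was never indexed).
val index = FilePathMap("/data/archives", Seq(".*\\.warc\\.gz"))
val resolved: Option[org.apache.hadoop.fs.Path] = index.pathToFile("segment-00001.warc.gz")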
Example 41
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
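This variant adds an ignoreCorruptFiles flag that skips ORC files with unreadable footers instead of failing. A short sketch of the schema read it enables; the warehouse path is a placeholder, and since the object is private[hive] this only compiles from code inside that package:

import org.apache.hadoop.conf.Configuration

// Returns None if no readable, non-empty ORC file is found under the path.
val schema = OrcFileOperator.readSchema(
  Seq("/warehouse/db.db/table"), Some(new Configuration), ignoreCorruptFiles = true)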
Example 42
Source File: HiveUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.net.URL import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("newTemporaryConfiguration overwrites listener configurations") { Seq(true, false).foreach { useInMemoryDerby => val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "") } } test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") { sys.props.put("spark.hadoop.foo", "bar") Seq(true, false) foreach { useInMemoryDerby => val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(!hiveConf.contains("spark.hadoop.foo")) assert(hiveConf("foo") === "bar") } } test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) try { Thread.currentThread().setContextClassLoader(loader) HiveUtils.newClientForMetadata( conf, SparkHadoopUtil.newConfiguration(conf), HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) } finally { Thread.currentThread().setContextClassLoader(contextClassLoader) } } test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") } }
Example 43
Source File: CommitFailureTestRelationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 44
Source File: HiveCliSessionStateSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.cli.CliSessionState import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.HiveUtils class HiveCliSessionStateSuite extends SparkFunSuite { def withSessionClear(f: () => Unit): Unit = { try f finally SessionState.detachSession() } test("CliSessionState will be reused") { withSessionClear { () => val hiveConf = new HiveConf(classOf[SessionState]) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hiveConf.set(key, value) } val sessionState: SessionState = new CliSessionState(hiveConf) SessionState.start(sessionState) val s1 = SessionState.get val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) val s2 = HiveUtils.newClientForMetadata(sparkConf, hadoopConf).getState assert(s1 === s2) assert(s2.isInstanceOf[CliSessionState]) } } test("SessionState will not be reused") { withSessionClear { () => val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hadoopConf.set(key, value) } val hiveClient = HiveUtils.newClientForMetadata(sparkConf, hadoopConf) val s1 = hiveClient.getState val s2 = hiveClient.newSession().getState assert(s1 !== s2) } } }
Example 45
Source File: OrcFileOperator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 46
Source File: JsonHadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 47
Source File: CommitFailureTestRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 48
Source File: SimpleTextHadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 49
Source File: EventLogDownloadResource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 50
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.AkkaUtils

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = AkkaUtils.address(
      AkkaUtils.protocol(actorSystem),
      SparkEnv.driverActorSystemName,
      sc.conf.get("spark.driver.host"),
      sc.conf.get("spark.driver.port"),
      CoarseGrainedSchedulerBackend.ACTOR_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
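The temp-file-then-rename idiom used in start() above, isolated into a small hedged helper for clarity; the helper name is my own, not from the example:

import java.io.DataOutputStream
import org.apache.hadoop.fs.{FileSystem, Path}

// Write to "<target>_tmp" first, then rename, so readers never observe a partially written file.
def writeAtomically(fs: FileSystem, target: Path)(body: DataOutputStream => Unit): Unit = {
  val tmp = new Path(target.toString + "_tmp")
  val out = fs.create(tmp, true)
  try body(out) finally out.close()
  fs.rename(tmp, target)
}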
Example 51
Source File: WholeTextFileRecordReader.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 52
Source File: OrcFileOperator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 53
Source File: JsonHadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 54
Source File: CommitFailureTestRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 55
Source File: SimpleTextHadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 56
Source File: EventLogDownloadResource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }

      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }
        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
}
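This resource backs the REST endpoint that serves an application's event logs as a zip file (reachable under /api/v1/applications/<app-id>/logs on the history server). A rough client-side sketch, assuming a history server on localhost:18080 and a placeholder application id:

import java.io.{BufferedInputStream, FileOutputStream}
import java.net.URL

object DownloadEventLogs {
  def main(args: Array[String]): Unit = {
    val appId = "app-20180101000000-0000"  // placeholder application id
    val url = new URL(s"http://localhost:18080/api/v1/applications/$appId/logs")
    val in = new BufferedInputStream(url.openStream())
    val out = new FileOutputStream(s"eventLogs-$appId.zip")
    try {
      // Copy the zipped event logs byte by byte to a local file.
      Iterator.continually(in.read()).takeWhile(_ != -1).foreach(out.write)
    } finally {
      in.close()
      out.close()
    }
  }
}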
Example 57
Source File: HDFSConfigHelper.scala From sparklens with Apache License 2.0 | 5 votes |
package com.qubole.sparklens.helper

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object HDFSConfigHelper {

  def getHadoopConf(sparkConfOptional: Option[SparkConf]): Configuration = {
    if (sparkConfOptional.isDefined) {
      SparkHadoopUtil.get.newConfiguration(sparkConfOptional.get)
    } else {
      val sparkConf = new SparkConf()
      SparkHadoopUtil.get.newConfiguration(sparkConf)
    }
  }
}
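The helper above simply delegates to SparkHadoopUtil.get.newConfiguration, falling back to a fresh SparkConf when none is supplied; newConfiguration also copies any spark.hadoop.* properties into the resulting Hadoop Configuration with the prefix stripped. A minimal usage sketch, where the fs.defaultFS value and the HDFS path are illustrative assumptions:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import com.qubole.sparklens.helper.HDFSConfigHelper

object HDFSConfigHelperUsage {
  def main(args: Array[String]): Unit = {
    // Build a Hadoop Configuration from an explicit SparkConf;
    // spark.hadoop.* keys are forwarded with the prefix stripped.
    val sparkConf = new SparkConf().set("spark.hadoop.fs.defaultFS", "hdfs://namenode:8020")
    val hadoopConf = HDFSConfigHelper.getHadoopConf(Some(sparkConf))

    // Or let the helper fall back to a default SparkConf.
    val defaultHadoopConf = HDFSConfigHelper.getHadoopConf(None)
    println(defaultHadoopConf.get("fs.defaultFS"))

    // Either Configuration can be used to talk to the file system directly.
    val fs = FileSystem.get(hadoopConf)
    println(fs.exists(new Path("/tmp/sparklens")))  // hypothetical path
  }
}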
Example 58
Source File: CarbonCountStar.scala From carbondata with Apache License 2.0 | 4 votes |
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw = attributesRaw.head.dataType match {
      case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
        .asInstanceOf[Array[Any]]
      case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
    }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration, carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}
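The SparkHadoopUtil call of interest in this example is addCredentials, which copies the current user's Hadoop credentials (delegation tokens) into the JobConf before a Hadoop Job is created, so the input format can read from a secured cluster. A stripped-down sketch of that pattern outside CarbonData, with an assumed input path:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil

object SecureJobSetup {
  def newJob(inputPath: String): Job = {
    val jobConf = new JobConf(new Configuration())
    // Attach the current user's delegation tokens so tasks spawned from this
    // job can access kerberized HDFS.
    SparkHadoopUtil.get.addCredentials(jobConf)
    val job = Job.getInstance(jobConf)
    FileInputFormat.addInputPath(job, new Path(inputPath))
    job
  }

  def main(args: Array[String]): Unit = {
    val job = newJob("/data/carbon/store/default/t1")  // hypothetical table path
    println(job.getConfiguration.get("mapreduce.input.fileinputformat.inputdir"))
  }
}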