org.apache.spark.deploy.SparkHadoopUtil Scala Examples
The following examples show how to use org.apache.spark.deploy.SparkHadoopUtil.
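Across the examples that follow, SparkHadoopUtil appears in a handful of recurring roles: building a Hadoop Configuration with SparkHadoopUtil.get.conf or SparkHadoopUtil.get.newConfiguration(sparkConf), resolving a FileSystem from that configuration, and listing files with SparkHadoopUtil.get.listLeafStatuses. The sketch below condenses those patterns into one place; it is illustrative only. The object name SparkHadoopUtilQuickstart and the path /tmp/spark-hadoop-util-demo are placeholders, and in Spark versions where SparkHadoopUtil is marked private[spark], code like this has to live under an org.apache.spark package, as most of the suites below do.

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object SparkHadoopUtilQuickstart {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SparkHadoopUtil demo")

    // Build a Hadoop Configuration that also picks up spark.hadoop.* entries from the SparkConf.
    val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf)

    // Resolve the FileSystem for a path, much like the relation suites below do via
    // SparkHadoopUtil.get.conf.
    val path = new Path("/tmp/spark-hadoop-util-demo") // placeholder path
    val fs = path.getFileSystem(hadoopConf)

    // List leaf files under the path, skipping directories and metadata files,
    // mirroring the listLeafStatuses usage in the OrcFileOperator examples.
    if (fs.exists(path)) {
      SparkHadoopUtil.get.listLeafStatuses(fs, path)
        .filterNot(_.isDirectory)
        .map(_.getPath)
        .filterNot(p => p.getName.startsWith("_") || p.getName.startsWith("."))
        .foreach(p => println(s"found: $p"))
    }
  }
}

Most of the test suites below use SparkHadoopUtil.get.conf simply as a convenient, already-initialized Hadoop Configuration for resolving a FileSystem; the YARN and scheduler examples lean on the richer helpers such as listFilesSorted and getTimeFromNowToRenewal.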
Example 1
Source File: OrcFileOperator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 2
Source File: CommitFailureTestRelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 3
Source File: ExecutorDelegationTokenUpdater.scala From iolap with Apache License 2.0
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(hadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { executorUpdaterRunnable.run() } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 4
Source File: SimrSchedulerBackend.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) fs.delete(new Path(driverFilePath), false) super.stop() } }
Example 5
Source File: WholeTextFileRecordReader.scala From iolap with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 6
Source File: OrcHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.orc

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.{Row, SQLConf}
import org.apache.spark.sql.sources.HadoopFsRelationTest
import org.apache.spark.sql.types._

class OrcHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = classOf[DefaultSource].getCanonicalName

  import sqlContext._
  import sqlContext.implicits._

  // ORC does not play well with NullType and UDT.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: NullType => false
    case _: CalendarIntervalType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  // Partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1))
          .toDF("a", "b", "p1")
          .write
          .orc(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.options(Map(
          "path" -> file.getCanonicalPath,
          "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load())
    }
  }

  // 'Not' is included in ORC filter pushdown
  test("SPARK-12218: 'Not' is included in ORC filter pushdown") {
    import testImplicits._

    withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") {
      withTempPath { dir =>
        val path = s"${dir.getCanonicalPath}/table1"
        (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path)

        checkAnswer(
          sqlContext.read.orc(path).where("not (a = 2) or not(b in ('1'))"),
          (1 to 5).map(i => Row(i, (i % 2).toString)))

        checkAnswer(
          sqlContext.read.orc(path).where("not (a = 2 and b in ('1'))"),
          (1 to 5).map(i => Row(i, (i % 2).toString)))
      }
    }
  }
}
Example 7
Source File: JsonHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import java.math.BigDecimal

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

class JsonHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = "json"

  import sqlContext._

  // JSON does not write data of NullType and does not play well with BinaryType.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: NullType => false
    case _: BinaryType => false
    case _: CalendarIntervalType => false
    case _ => true
  }

  // save()/load() - partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }

  // Save complex types to JSON
  test("SPARK-9894: save complex types to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("array", ArrayType(LongType))
        .add("map", MapType(StringType, new StructType().add("innerField", LongType)))

      val data =
        Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) ::
        Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil
      val df = createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }

  // Save decimal type to JSON
  test("SPARK-10196: save decimal type to JSON") {
    withTempDir { file =>
      file.delete()

      val schema = new StructType()
        .add("decimal", DecimalType(7, 2))

      val data =
        Row(new BigDecimal("10.02")) ::
        Row(new BigDecimal("20000.99")) ::
        Row(new BigDecimal("10000")) :: Nil
      val df = createDataFrame(sparkContext.parallelize(data), schema)

      // Write the data out.
      df.write.format(dataSourceName).save(file.getCanonicalPath)

      // Read it back and check the result.
      checkAnswer(
        read.format(dataSourceName).schema(schema).load(file.getCanonicalPath),
        df
      )
    }
  }
}
Example 8
Source File: CommitFailureTestRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.test.TestHive
import org.apache.spark.sql.test.SQLTestUtils

class CommitFailureTestRelationSuite extends SparkFunSuite with SQLTestUtils {
  override def _sqlContext: SQLContext = TestHive
  private val sqlContext = _sqlContext

  // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose.
  val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName

  // commitTask() failure should fall back to abortTask()
  test("SPARK-7684: commitTask() failure should fallback to abortTask()") {
    withTempPath { file =>
      // Here we coalesce partition number to 1 to ensure that only a single task is issued. This
      // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary`
      // directory while committing/aborting the job. See SPARK-8513 for more details.
      val df = sqlContext.range(0, 10).coalesce(1)
      intercept[SparkException] {
        df.write.format(dataSourceName).save(file.getCanonicalPath)
      }

      val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf)
      assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary")))
    }
  }
}
Example 9
Source File: SimpleTextHadoopFsRelationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.sources

import org.apache.hadoop.fs.Path

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.sql.types._

class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest {
  override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName

  import sqlContext._

  // We have a very limited number of supported types at here since it is just for a
  // test relation and we do very basic testing at here.
  override protected def supportsDataType(dataType: DataType): Boolean = dataType match {
    case _: BinaryType => false
    // We are using random data generator and the generated strings are not really valid string.
    case _: StringType => false
    case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442
    case _: CalendarIntervalType => false
    case _: DateType => false
    case _: TimestampType => false
    case _: ArrayType => false
    case _: MapType => false
    case _: StructType => false
    case _: UserDefinedType[_] => false
    case _ => true
  }

  // save()/load() - partitioned table - simple queries - partition columns in data
  test("save()/load() - partitioned table - simple queries - partition columns in data") {
    withTempDir { file =>
      val basePath = new Path(file.getCanonicalPath)
      val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf)
      val qualifiedBasePath = fs.makeQualified(basePath)

      for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) {
        val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2")
        sparkContext
          .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1")
          .saveAsTextFile(partitionDir.toString)
      }

      val dataSchemaWithPartition =
        StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true))

      checkQueries(
        read.format(dataSourceName)
          .option("dataSchema", dataSchemaWithPartition.json)
          .load(file.getCanonicalPath))
    }
  }
}
Example 10
Source File: ExecutorDelegationTokenUpdater.scala From spark1.52 with Apache License 2.0
package org.apache.spark.deploy.yarn

import java.util.concurrent.{Executors, TimeUnit}

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.security.{Credentials, UserGroupInformation}

import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.util.{ThreadUtils, Utils}

import scala.util.control.NonFatal

private[spark] class ExecutorDelegationTokenUpdater(
    sparkConf: SparkConf,
    hadoopConf: Configuration) extends Logging {

  @volatile private var lastCredentialsFileSuffix = 0

  private val credentialsFile = sparkConf.get("spark.yarn.credentials.file")
  private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache(
    hadoopConf, new Path(credentialsFile).toUri.getScheme)

  private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor(
    ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread"))

  // On the executor, this thread wakes up and picks up new tokens from HDFS, if any.
  private val executorUpdaterRunnable = new Runnable {
    override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired())
  }

  def updateCredentialsIfRequired(): Unit = {
    try {
      val credentialsFilePath = new Path(credentialsFile)
      val remoteFs = FileSystem.get(freshHadoopConf)
      SparkHadoopUtil.get.listFilesSorted(
        remoteFs, credentialsFilePath.getParent,
        credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION)
        .lastOption.foreach { credentialsStatus =>
          val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath)
          if (suffix > lastCredentialsFileSuffix) {
            logInfo("Reading new delegation tokens from " + credentialsStatus.getPath)
            val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath)
            lastCredentialsFileSuffix = suffix
            UserGroupInformation.getCurrentUser.addCredentials(newCredentials)
            logInfo("Tokens updated from credentials file.")
          } else {
            // Check every hour to see if new credentials arrived.
            logInfo("Updated delegation tokens were expected, but the driver has not updated the " +
              "tokens yet, will check again in an hour.")
            delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
            return
          }
        }
      val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal(
        sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials)
      if (timeFromNowToRenewal <= 0) {
        executorUpdaterRunnable.run()
      } else {
        logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.")
        delegationTokenRenewer.schedule(
          executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS)
      }
    } catch {
      // Since the file may get deleted while we are reading it, catch the Exception and come
      // back in an hour to try again
      case NonFatal(e) =>
        logWarning("Error while trying to update credentials, will try again in 1 hour", e)
        delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS)
    }
  }

  private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = {
    val stream = remoteFs.open(tokenPath)
    try {
      val newCredentials = new Credentials()
      newCredentials.readTokenStorageStream(stream)
      newCredentials
    } finally {
      stream.close()
    }
  }

  def stop(): Unit = {
    delegationTokenRenewer.shutdown()
  }
}
Example 11
Source File: EventLogDownloadResource.scala From spark1.52 with Apache License 2.0
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkHadoopUtil

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }

      // Implement the StreamingOutput interface
      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          // Package the event logs with a ZipOutputStream
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }
        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
}
Example 12
Source File: SimrSchedulerBackend.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.rpc.RpcAddress
import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName,
      // Hostname or IP address where the driver runs
      RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt),
      CoarseGrainedSchedulerBackend.ENDPOINT_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
Example 13
Source File: WholeTextFileRecordReader.scala From spark1.52 with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 14
Source File: SparkPodInitContainer.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.k8s import java.io.File import java.util.concurrent.TimeUnit import scala.concurrent.{ExecutionContext, Future} import org.apache.spark.{SecurityManager => SparkSecurityManager, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.deploy.k8s.Config._ import org.apache.spark.internal.Logging import org.apache.spark.util.{ThreadUtils, Utils} private[spark] class SparkPodInitContainer( sparkConf: SparkConf, fileFetcher: FileFetcher) extends Logging { private val maxThreadPoolSize = sparkConf.get(INIT_CONTAINER_MAX_THREAD_POOL_SIZE) private implicit val downloadExecutor = ExecutionContext.fromExecutorService( ThreadUtils.newDaemonCachedThreadPool("download-executor", maxThreadPoolSize)) private val jarsDownloadDir = new File(sparkConf.get(JARS_DOWNLOAD_LOCATION)) private val filesDownloadDir = new File(sparkConf.get(FILES_DOWNLOAD_LOCATION)) private val remoteJars = sparkConf.get(INIT_CONTAINER_REMOTE_JARS) private val remoteFiles = sparkConf.get(INIT_CONTAINER_REMOTE_FILES) private val downloadTimeoutMinutes = sparkConf.get(INIT_CONTAINER_MOUNT_TIMEOUT) def run(): Unit = { logInfo(s"Downloading remote jars: $remoteJars") downloadFiles( remoteJars, jarsDownloadDir, s"Remote jars download directory specified at $jarsDownloadDir does not exist " + "or is not a directory.") logInfo(s"Downloading remote files: $remoteFiles") downloadFiles( remoteFiles, filesDownloadDir, s"Remote files download directory specified at $filesDownloadDir does not exist " + "or is not a directory.") downloadExecutor.shutdown() downloadExecutor.awaitTermination(downloadTimeoutMinutes, TimeUnit.MINUTES) } private def downloadFiles( filesCommaSeparated: Option[String], downloadDir: File, errMessage: String): Unit = { filesCommaSeparated.foreach { files => require(downloadDir.isDirectory, errMessage) Utils.stringToSeq(files).foreach { file => Future[Unit] { fileFetcher.fetchFile(file, downloadDir) } } } } } private class FileFetcher(sparkConf: SparkConf, securityManager: SparkSecurityManager) { def fetchFile(uri: String, targetDir: File): Unit = { Utils.fetchFile( url = uri, targetDir = targetDir, conf = sparkConf, securityMgr = securityManager, hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf), timestamp = System.currentTimeMillis(), useCache = false) } } object SparkPodInitContainer extends Logging { def main(args: Array[String]): Unit = { logInfo("Starting init-container to download Spark application dependencies.") val sparkConf = new SparkConf(true) if (args.nonEmpty) { Utils.loadDefaultSparkProperties(sparkConf, args(0)) } val securityManager = new SparkSecurityManager(sparkConf) val fileFetcher = new FileFetcher(sparkConf, securityManager) new SparkPodInitContainer(sparkConf, fileFetcher).run() logInfo("Finished downloading application dependencies.") } }
Example 15
Source File: HiveUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive import java.net.URL import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("newTemporaryConfiguration overwrites listener configurations") { Seq(true, false).foreach { useInMemoryDerby => val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "") } } test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") { sys.props.put("spark.hadoop.foo", "bar") Seq(true, false) foreach { useInMemoryDerby => val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(!hiveConf.contains("spark.hadoop.foo")) assert(hiveConf("foo") === "bar") } } test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) try { Thread.currentThread().setContextClassLoader(loader) HiveUtils.newClientForMetadata( conf, SparkHadoopUtil.newConfiguration(conf), HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) } finally { Thread.currentThread().setContextClassLoader(contextClassLoader) } } test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") } }
Example 16
Source File: OrcHadoopFsRelationSuite.scala From iolap with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = classOf[DefaultSource].getCanonicalName import sqlContext._ import sqlContext.implicits._ test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") .write .format("orc") .save(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( load( source = dataSourceName, options = Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json))) } } }
Example 17
Source File: HiveCliSessionStateSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.cli.CliSessionState import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.HiveUtils class HiveCliSessionStateSuite extends SparkFunSuite { def withSessionClear(f: () => Unit): Unit = { try f finally SessionState.detachSession() } test("CliSessionState will be reused") { withSessionClear { () => val hiveConf = new HiveConf(classOf[SessionState]) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hiveConf.set(key, value) } val sessionState: SessionState = new CliSessionState(hiveConf) SessionState.start(sessionState) val s1 = SessionState.get val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) val s2 = HiveUtils.newClientForMetadata(sparkConf, hadoopConf).getState assert(s1 === s2) assert(s2.isInstanceOf[CliSessionState]) } } test("SessionState will not be reused") { withSessionClear { () => val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hadoopConf.set(key, value) } val hiveClient = HiveUtils.newClientForMetadata(sparkConf, hadoopConf) val s1 = hiveClient.getState val s2 = hiveClient.newSession().getState assert(s1 !== s2) } } }
Example 18
Source File: DriverWrapper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.worker import java.io.File import org.apache.commons.lang3.StringUtils import org.apache.spark.{SecurityManager, SparkConf} import org.apache.spark.deploy.{DependencyUtils, SparkHadoopUtil, SparkSubmit} import org.apache.spark.internal.Logging import org.apache.spark.rpc.RpcEnv import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader, Utils} case workerUrl :: userJar :: mainClass :: extraArgs => val conf = new SparkConf() val host: String = Utils.localHostName() val port: Int = sys.props.getOrElse("spark.driver.port", "0").toInt val rpcEnv = RpcEnv.create("Driver", host, port, conf, new SecurityManager(conf)) logInfo(s"Driver address: ${rpcEnv.address}") rpcEnv.setupEndpoint("workerWatcher", new WorkerWatcher(rpcEnv, workerUrl)) val currentLoader = Thread.currentThread.getContextClassLoader val userJarUrl = new File(userJar).toURI().toURL() val loader = if (sys.props.getOrElse("spark.driver.userClassPathFirst", "false").toBoolean) { new ChildFirstURLClassLoader(Array(userJarUrl), currentLoader) } else { new MutableURLClassLoader(Array(userJarUrl), currentLoader) } Thread.currentThread.setContextClassLoader(loader) setupDependencies(loader, userJar) // Delegate to supplied main class val clazz = Utils.classForName(mainClass) val mainMethod = clazz.getMethod("main", classOf[Array[String]]) mainMethod.invoke(null, extraArgs.toArray[String]) rpcEnv.shutdown() case _ => // scalastyle:off println System.err.println("Usage: DriverWrapper <workerUrl> <userJar> <driverMainClass> [options]") // scalastyle:on println System.exit(-1) } } private def setupDependencies(loader: MutableURLClassLoader, userJar: String): Unit = { val sparkConf = new SparkConf() val secMgr = new SecurityManager(sparkConf) val hadoopConf = SparkHadoopUtil.newConfiguration(sparkConf) val Seq(packagesExclusions, packages, repositories, ivyRepoPath, ivySettingsPath) = Seq( "spark.jars.excludes", "spark.jars.packages", "spark.jars.repositories", "spark.jars.ivy", "spark.jars.ivySettings" ).map(sys.props.get(_).orNull) val resolvedMavenCoordinates = DependencyUtils.resolveMavenDependencies(packagesExclusions, packages, repositories, ivyRepoPath, Option(ivySettingsPath)) val jars = { val jarsProp = sys.props.get("spark.jars").orNull if (!StringUtils.isBlank(resolvedMavenCoordinates)) { SparkSubmit.mergeFileLists(jarsProp, resolvedMavenCoordinates) } else { jarsProp } } val localJars = DependencyUtils.resolveAndDownloadJars(jars, userJar, sparkConf, hadoopConf, secMgr) DependencyUtils.addJarsToClassPath(localJars, loader) } }
Example 19
Source File: OrcFileOperator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 20
Source File: OrcHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.orc import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.{Row, SQLConf} import org.apache.spark.sql.sources.HadoopFsRelationTest import org.apache.spark.sql.types._ class OrcHadoopFsRelationSuite extends HadoopFsRelationTest { import testImplicits._ override val dataSourceName: String = classOf[DefaultSource].getCanonicalName // ORC does not play well with NullType and UDT. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield (i, s"val_$i", p1)) .toDF("a", "b", "p1") .write .orc(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.options(Map( "path" -> file.getCanonicalPath, "dataSchema" -> dataSchemaWithPartition.json)).format(dataSourceName).load()) } } test("SPARK-12218: 'Not' is included in ORC filter pushdown") { import testImplicits._ withSQLConf(SQLConf.ORC_FILTER_PUSHDOWN_ENABLED.key -> "true") { withTempPath { dir => val path = s"${dir.getCanonicalPath}/table1" (1 to 5).map(i => (i, (i % 2).toString)).toDF("a", "b").write.orc(path) checkAnswer( sqlContext.read.orc(path).where("not (a = 2) or not(b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) checkAnswer( sqlContext.read.orc(path).where("not (a = 2 and b in ('1'))"), (1 to 5).map(i => Row(i, (i % 2).toString))) } } } }
Example 21
Source File: JsonHadoopFsRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( hiveContext.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = hiveContext.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( hiveContext.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 22
Source File: CommitFailureTestRelationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = sqlContext.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 23
Source File: ExecutorDelegationTokenUpdater.scala From BigDatalog with Apache License 2.0
package org.apache.spark.deploy.yarn import java.util.concurrent.{Executors, TimeUnit} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.{Logging, SparkConf} import org.apache.spark.util.{ThreadUtils, Utils} import scala.util.control.NonFatal private[spark] class ExecutorDelegationTokenUpdater( sparkConf: SparkConf, hadoopConf: Configuration) extends Logging { @volatile private var lastCredentialsFileSuffix = 0 private val credentialsFile = sparkConf.get("spark.yarn.credentials.file") private val freshHadoopConf = SparkHadoopUtil.get.getConfBypassingFSCache( hadoopConf, new Path(credentialsFile).toUri.getScheme) private val delegationTokenRenewer = Executors.newSingleThreadScheduledExecutor( ThreadUtils.namedThreadFactory("Delegation Token Refresh Thread")) // On the executor, this thread wakes up and picks up new tokens from HDFS, if any. private val executorUpdaterRunnable = new Runnable { override def run(): Unit = Utils.logUncaughtExceptions(updateCredentialsIfRequired()) } def updateCredentialsIfRequired(): Unit = { try { val credentialsFilePath = new Path(credentialsFile) val remoteFs = FileSystem.get(freshHadoopConf) SparkHadoopUtil.get.listFilesSorted( remoteFs, credentialsFilePath.getParent, credentialsFilePath.getName, SparkHadoopUtil.SPARK_YARN_CREDS_TEMP_EXTENSION) .lastOption.foreach { credentialsStatus => val suffix = SparkHadoopUtil.get.getSuffixForCredentialsPath(credentialsStatus.getPath) if (suffix > lastCredentialsFileSuffix) { logInfo("Reading new delegation tokens from " + credentialsStatus.getPath) val newCredentials = getCredentialsFromHDFSFile(remoteFs, credentialsStatus.getPath) lastCredentialsFileSuffix = suffix UserGroupInformation.getCurrentUser.addCredentials(newCredentials) logInfo("Tokens updated from credentials file.") } else { // Check every hour to see if new credentials arrived. logInfo("Updated delegation tokens were expected, but the driver has not updated the " + "tokens yet, will check again in an hour.") delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) return } } val timeFromNowToRenewal = SparkHadoopUtil.get.getTimeFromNowToRenewal( sparkConf, 0.8, UserGroupInformation.getCurrentUser.getCredentials) if (timeFromNowToRenewal <= 0) { // We just checked for new credentials but none were there, wait a minute and retry. // This handles the shutdown case where the staging directory may have been removed(see // SPARK-12316 for more details). delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.MINUTES) } else { logInfo(s"Scheduling token refresh from HDFS in $timeFromNowToRenewal millis.") delegationTokenRenewer.schedule( executorUpdaterRunnable, timeFromNowToRenewal, TimeUnit.MILLISECONDS) } } catch { // Since the file may get deleted while we are reading it, catch the Exception and come // back in an hour to try again case NonFatal(e) => logWarning("Error while trying to update credentials, will try again in 1 hour", e) delegationTokenRenewer.schedule(executorUpdaterRunnable, 1, TimeUnit.HOURS) } } private def getCredentialsFromHDFSFile(remoteFs: FileSystem, tokenPath: Path): Credentials = { val stream = remoteFs.open(tokenPath) try { val newCredentials = new Credentials() newCredentials.readTokenStorageStream(stream) newCredentials } finally { stream.close() } } def stop(): Unit = { delegationTokenRenewer.shutdown() } }
Example 24
Source File: EventLogDownloadResource.scala From BigDatalog with Apache License 2.0
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkHadoopUtil @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 25
Source File: SimrSchedulerBackend.scala From BigDatalog with Apache License 2.0
package org.apache.spark.scheduler.cluster import org.apache.hadoop.fs.{Path, FileSystem} import org.apache.spark.rpc.RpcAddress import org.apache.spark.{Logging, SparkContext, SparkEnv} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.scheduler.TaskSchedulerImpl private[spark] class SimrSchedulerBackend( scheduler: TaskSchedulerImpl, sc: SparkContext, driverFilePath: String) extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging { val tmpPath = new Path(driverFilePath + "_tmp") val filePath = new Path(driverFilePath) val maxCores = conf.getInt("spark.simr.executor.cores", 1) override def start() { super.start() val driverUrl = rpcEnv.uriOf(SparkEnv.driverActorSystemName, RpcAddress(sc.conf.get("spark.driver.host"), sc.conf.get("spark.driver.port").toInt), CoarseGrainedSchedulerBackend.ENDPOINT_NAME) val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("") logInfo("Writing to HDFS file: " + driverFilePath) logInfo("Writing Akka address: " + driverUrl) logInfo("Writing Spark UI Address: " + appUIAddress) // Create temporary file to prevent race condition where executors get empty driverUrl file val temp = fs.create(tmpPath, true) temp.writeUTF(driverUrl) temp.writeInt(maxCores) temp.writeUTF(appUIAddress) temp.close() // "Atomic" rename fs.rename(tmpPath, filePath) } override def stop() { val conf = SparkHadoopUtil.get.newConfiguration(sc.conf) val fs = FileSystem.get(conf) if (!fs.delete(new Path(driverFilePath), false)) { logWarning(s"error deleting ${driverFilePath}") } super.stop() } }
Example 26
Source File: WholeTextFileRecordReader.scala From BigDatalog with Apache License 2.0
package org.apache.spark.input import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable} import com.google.common.io.{ByteStreams, Closeables} import org.apache.hadoop.io.Text import org.apache.hadoop.io.compress.CompressionCodecFactory import org.apache.hadoop.mapreduce.InputSplit import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader} import org.apache.hadoop.mapreduce.RecordReader import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.spark.deploy.SparkHadoopUtil private[spark] class ConfigurableCombineFileRecordReader[K, V]( split: InputSplit, context: TaskAttemptContext, recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable]) extends CombineFileRecordReader[K, V]( split.asInstanceOf[CombineFileSplit], context, recordReaderClass ) with Configurable { override def initNextRecordReader(): Boolean = { val r = super.initNextRecordReader() if (r) { this.curReader.asInstanceOf[HConfigurable].setConf(getConf) } r } }
Example 27
Source File: LasOutputWriter.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus.las import org.apache.spark.sql.types._ import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext } import java.io.DataOutputStream import org.apache.spark.sql.sources.OutputWriter import org.apache.spark.deploy.SparkHadoopUtil import org.apache.hadoop.io.{ NullWritable, BytesWritable } import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.fs.Path import java.text.NumberFormat import org.apache.spark.sql.{ Row, SQLContext, sources } import fr.ign.spark.iqmulus.RowOutputStream class LasOutputWriter( name: String, context: TaskAttemptContext, dataSchema: StructType, formatOpt: Option[Byte] = None, version: Version = Version(), offset: Array[Double] = Array(0F, 0F, 0F), scale: Array[Double] = Array(0.01F, 0.01F, 0.01F) ) extends OutputWriter { private val file = { val path = getDefaultWorkFile("/1.pdr") val fs = path.getFileSystem(context.getConfiguration) fs.create(path) } private val pmin = Array.fill[Double](3)(Double.PositiveInfinity) private val pmax = Array.fill[Double](3)(Double.NegativeInfinity) private val countByReturn = Array.fill[Long](15)(0) private def count = countByReturn.sum private val format = formatOpt.getOrElse(LasHeader.formatFromSchema(dataSchema)) // todo, extra bytes private val schema = LasHeader.schema(format) private def header = new LasHeader(name, format, count, pmin, pmax, scale, offset, countByReturn) private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian = true, schema, dataSchema) def getDefaultWorkFile(extension: String): Path = { val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") val taskAttemptId: TaskAttemptID = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(name, f"$split%05d-$uniqueWriteJobId$extension") } override def write(row: Row): Unit = { recordWriter.write(row) // gather statistics for the header val x = offset(0) + scale(0) * row.getAs[Int]("x").toDouble val y = offset(1) + scale(1) * row.getAs[Int]("y").toDouble val z = offset(2) + scale(2) * row.getAs[Int]("z").toDouble val ret = row.getAs[Byte]("flags") & 0x3 countByReturn(ret) += 1 pmin(0) = Math.min(pmin(0), x) pmin(1) = Math.min(pmin(1), y) pmin(2) = Math.min(pmin(2), z) pmax(0) = Math.max(pmax(0), x) pmax(1) = Math.max(pmax(1), y) pmax(2) = Math.max(pmax(2), z) } override def close(): Unit = { recordWriter.close // write header val path = getDefaultWorkFile("/0.header") val fs = path.getFileSystem(context.getConfiguration) val dos = new java.io.DataOutputStream(fs.create(path)) header.write(dos) dos.close // copy header and pdf to a final las file (1 per split) org.apache.hadoop.fs.FileUtil.copyMerge( fs, getDefaultWorkFile("/"), fs, getDefaultWorkFile(".las"), true, context.getConfiguration, "" ) } }
Example 28
Source File: AngelClientFactory.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.context

import com.tencent.angel.client.AngelClient
import com.tencent.angel.client.local.AngelLocalClient
import com.tencent.angel.client.yarn.AngelYarnClient
import org.apache.spark.SparkContext
import org.apache.spark.deploy.SparkHadoopUtil

object AngelClientFactory {
  def get(sc: SparkContext): AngelClient = {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.getConf)

    if (sc.isLocal) {
      new AngelLocalClient(conf)
    } else {
      new AngelYarnClient(conf)
    }
  }
}
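A minimal usage sketch, assuming a SparkContext is already set up; the application name and master URL are placeholders, not part of the example:

import org.apache.spark.{SparkConf, SparkContext}
import com.tencent.angel.sona.context.AngelClientFactory

// Hypothetical caller: the factory returns AngelLocalClient for a local master,
// AngelYarnClient otherwise, both configured via SparkHadoopUtil.
val sc = new SparkContext(new SparkConf().setAppName("angel-demo").setMaster("local[2]"))
val client = AngelClientFactory.get(sc)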
Example 29
Source File: TreeUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.tree

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object TreeUtils {
  def getFileSystem(conf: SparkConf, path: Path): FileSystem = {
    val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf)
    if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) {
      val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) {
        sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml"
      } else {
        sys.env.get("YARN_CONF_DIR").get + "/core-site.xml"
      }
      hadoopConf.addResource(new Path(hdfsConfPath))
    }
    path.getFileSystem(hadoopConf)
  }

  def getPartitionOffsets(upper: Int, numPartitions: Int): (Array[Int], Array[Int]) = {
    val npp = upper / numPartitions
    val nppp = npp + 1
    val residual = upper - npp * numPartitions
    val boundary = residual * nppp
    val startPP = new Array[Int](numPartitions)
    val lcLenPP = new Array[Int](numPartitions)
    var i = 0
    while (i < numPartitions) {
      if (i < residual) {
        startPP(i) = nppp * i
        lcLenPP(i) = nppp
      } else {
        startPP(i) = boundary + (i - residual) * npp
        lcLenPP(i) = npp
      }
      i += 1
    }
    (startPP, lcLenPP)
  }
}
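As a quick sanity check of getPartitionOffsets, a small worked example with values computed from the code above:

// 10 items over 3 partitions: the single leftover item goes to the first partition.
val (starts, lengths) = TreeUtils.getPartitionOffsets(10, 3)
// starts  == Array(0, 4, 7)
// lengths == Array(4, 3, 3)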
Example 30
Source File: SparkUtils.scala From zen with Apache License 2.0 | 5 votes |
package com.github.cloudml.zen.ml.util import breeze.linalg.{Vector => BV, SparseVector => BSV, DenseVector => BDV} import breeze.storage.Zero import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.mllib.linalg.{DenseVector => SDV, Vector => SV, SparseVector => SSV} import scala.language.implicitConversions import scala.reflect.ClassTag private[zen] object SparkUtils { implicit def toBreeze(sv: SV): BV[Double] = { sv match { case SDV(data) => new BDV(data) case SSV(size, indices, values) => new BSV(indices, values, size) } } implicit def fromBreeze(breezeVector: BV[Double]): SV = { breezeVector match { case v: BDV[Double] => if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { new SDV(v.data) } else { new SDV(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[Double] => if (v.index.length == v.used) { new SSV(v.length, v.index, v.data) } else { new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) } case v: BV[_] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } } def toBreezeConv[T: ClassTag](sv: SV)(implicit num: Numeric[T]): BV[T] = { val zero = num.zero implicit val conv: Array[Double] => Array[T] = (data) => { data.map(ele => (zero match { case zero: Double => ele case zero: Float => ele.toFloat case zero: Int => ele.toInt case zero: Long => ele.toLong }).asInstanceOf[T]).array } sv match { case SDV(data) => new BDV[T](data) case SSV(size, indices, values) => new BSV[T](indices, values, size)(Zero[T](zero)) } } def fromBreezeConv[T: ClassTag](breezeVector: BV[T])(implicit num: Numeric[T]): SV = { implicit val conv: Array[T] => Array[Double] = (data) => { data.map(num.toDouble).array } breezeVector match { case v: BDV[T] => if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) { new SDV(v.data) } else { new SDV(v.toArray) // Can't use underlying array directly, so make a new one } case v: BSV[T] => if (v.index.length == v.used) { new SSV(v.length, v.index, v.data) } else { new SSV(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used)) } case v: BV[T] => sys.error("Unsupported Breeze vector type: " + v.getClass.getName) } } def getFileSystem(conf: SparkConf, path: Path): FileSystem = { val hadoopConf = SparkHadoopUtil.get.newConfiguration(conf) if (sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")) { val hdfsConfPath = if (sys.env.get("HADOOP_CONF_DIR").isDefined) { sys.env.get("HADOOP_CONF_DIR").get + "/core-site.xml" } else { sys.env.get("YARN_CONF_DIR").get + "/core-site.xml" } hadoopConf.addResource(new Path(hdfsConfPath)) } path.getFileSystem(hadoopConf) } def deleteChkptDirs(conf: SparkConf, dirs: Array[String]): Unit = { val fs = getFileSystem(conf, new Path(dirs(0))) dirs.foreach(dir => { fs.delete(new Path(dir), true) }) } }
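A short sketch of the toBreeze/fromBreeze conversions above; note the object is private[zen], so this only compiles from code inside that package:

import breeze.linalg.{Vector => BV}
import org.apache.spark.mllib.linalg.Vectors

// Round-trip an MLlib sparse vector through Breeze using the conversions above.
val sv = Vectors.sparse(5, Array(1, 3), Array(2.0, 4.0))
val bv: BV[Double] = SparkUtils.toBreeze(sv)
val back = SparkUtils.fromBreeze(bv)   // an MLlib SparseVector with the same contents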
Example 31
Source File: OrcFileOperator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 32
Source File: JsonHadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 33
Source File: CommitFailureTestRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 34
Source File: SimpleTextHadoopFsRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 35
Source File: IOEncryptionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.io._ import java.nio.charset.StandardCharsets import java.security.PrivilegedExceptionAction import java.util.UUID import org.apache.hadoop.security.{Credentials, UserGroupInformation} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, Matchers} import org.apache.spark._ import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.config._ import org.apache.spark.serializer._ import org.apache.spark.storage._ class IOEncryptionSuite extends SparkFunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach { private[this] val blockId = new TempShuffleBlockId(UUID.randomUUID()) private[this] val conf = new SparkConf() private[this] val ugi = UserGroupInformation.createUserForTesting("testuser", Array("testgroup")) private[this] val serializer = new KryoSerializer(conf) override def beforeAll(): Unit = { System.setProperty("SPARK_YARN_MODE", "true") ugi.doAs(new PrivilegedExceptionAction[Unit]() { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) val creds = new Credentials() SecurityManager.initIOEncryptionKey(conf, creds) SparkHadoopUtil.get.addCurrentUserCredentials(creds) } }) } override def afterAll(): Unit = { SparkEnv.set(null) System.clearProperty("SPARK_YARN_MODE") } override def beforeEach(): Unit = { super.beforeEach() } override def afterEach(): Unit = { super.afterEach() conf.set("spark.shuffle.compress", false.toString) conf.set("spark.shuffle.spill.compress", false.toString) } test("IO encryption read and write") { ugi.doAs(new PrivilegedExceptionAction[Unit] { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) conf.set("spark.shuffle.compress", false.toString) conf.set("spark.shuffle.spill.compress", false.toString) testYarnIOEncryptionWriteRead() } }) } test("IO encryption read and write with shuffle compression enabled") { ugi.doAs(new PrivilegedExceptionAction[Unit] { override def run(): Unit = { conf.set(IO_ENCRYPTION_ENABLED, true) conf.set("spark.shuffle.compress", true.toString) conf.set("spark.shuffle.spill.compress", true.toString) testYarnIOEncryptionWriteRead() } }) } private[this] def testYarnIOEncryptionWriteRead(): Unit = { val plainStr = "hello world" val outputStream = new ByteArrayOutputStream() val serializerManager = new SerializerManager(serializer, conf) val wrappedOutputStream = serializerManager.wrapStream(blockId, outputStream) wrappedOutputStream.write(plainStr.getBytes(StandardCharsets.UTF_8)) wrappedOutputStream.close() val encryptedBytes = outputStream.toByteArray val encryptedStr = new String(encryptedBytes) assert(plainStr !== encryptedStr) val inputStream = new ByteArrayInputStream(encryptedBytes) val wrappedInputStream = serializerManager.wrapStream(blockId, inputStream) val decryptedBytes = new Array[Byte](1024) val len = wrappedInputStream.read(decryptedBytes) val decryptedStr = new String(decryptedBytes, 0, len, StandardCharsets.UTF_8) assert(decryptedStr === plainStr) } }
Example 36
Source File: CryptoStreamUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.security

import java.io.{InputStream, OutputStream}
import java.util.Properties
import javax.crypto.spec.{IvParameterSpec, SecretKeySpec}

import org.apache.commons.crypto.random._
import org.apache.commons.crypto.stream._
import org.apache.hadoop.io.Text

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._

  private[this] def createInitializationVector(properties: Properties): Array[Byte] = {
    val iv = new Array[Byte](IV_LENGTH_IN_BYTES)
    val initialIVStart = System.currentTimeMillis()
    CryptoRandomFactory.getCryptoRandom(properties).nextBytes(iv)
    val initialIVFinish = System.currentTimeMillis()
    val initialIVTime = initialIVFinish - initialIVStart
    if (initialIVTime > 2000) {
      logWarning(s"It costs ${initialIVTime} milliseconds to create the Initialization Vector " +
        s"used by CryptoStream")
    }
    iv
  }
}
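The listing above omits the enclosing object declaration, so only the IV helper is visible. For orientation, a standalone sketch of the same idea, under the assumption that IV_LENGTH_IN_BYTES is the 16-byte AES block size:

import java.util.Properties
import org.apache.commons.crypto.random.CryptoRandomFactory

// Assumption: 16 bytes, matching the AES block size.
val iv = new Array[Byte](16)
val start = System.currentTimeMillis()
CryptoRandomFactory.getCryptoRandom(new Properties()).nextBytes(iv)
val elapsed = System.currentTimeMillis() - start   // a large value here flags a slow entropy source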
Example 37
Source File: EventLogDownloadResource.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 38
Source File: HdfsFileAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil
import org.archive.archivespark.sparkling.io.IOUtil

class HdfsFileAccessor(path: String, decompress: Boolean = true)
  extends CloseableDataAccessor[InputStream] {

  override def get: Option[InputStream] = {
    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    var stream: InputStream = null
    try {
      val raw = fs.open(new Path(path))
      stream = if (decompress) IOUtil.decompress(raw, Some(path)) else raw
      Some(stream)
    } catch {
      case e: Exception =>
        e.printStackTrace()
        if (stream != null) stream.close()
        None
    }
  }
}
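A hedged usage sketch; the HDFS path is a placeholder:

// Open, consume, and close the (optionally decompressed) stream.
val accessor = new HdfsFileAccessor("/data/archive/example.warc.gz")
accessor.get.foreach { in =>
  try {
    val firstByte = in.read()
    // ... process the stream ...
  } finally {
    in.close()
  }
}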
Example 39
Source File: HdfsStreamAccessor.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.dataspecs.access

import java.io.InputStream

import org.apache.commons.io.input.BoundedInputStream
import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

class HdfsStreamAccessor(location: HdfsLocationInfo)
  extends CloseableDataAccessor[InputStream] {

  override def get: Option[InputStream] = {
    if (location.length < 0 || location.offset < 0) None
    else {
      val fs = FileSystem.get(SparkHadoopUtil.get.conf)
      var stream: FSDataInputStream = null
      try {
        stream = fs.open(new Path(location.path))
        stream.seek(location.offset)
        Some(new BoundedInputStream(stream, location.length))
      } catch {
        case e: Exception =>
          e.printStackTrace()
          if (stream != null) stream.close()
          None
      }
    }
  }
}
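A usage sketch under the assumption that HdfsLocationInfo is a simple holder of the path, offset, and length fields referenced above; its actual constructor may differ:

// Hypothetical location value; parameter names are assumed, not confirmed by the example.
val location = HdfsLocationInfo(path = "/data/archive/example.warc.gz", offset = 1024L, length = 4096L)
val accessor = new HdfsStreamAccessor(location)
accessor.get.foreach { in =>
  try {
    // reads are bounded to the 4096-byte window starting at offset 1024
  } finally {
    in.close()
  }
}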
Example 40
Source File: FilePathMap.scala From ArchiveSpark with MIT License | 5 votes |
package org.archive.archivespark.util

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.deploy.SparkHadoopUtil

import scala.util.Try

case class FilePathMap(path: String, patterns: Seq[String] = Seq.empty) {
  val pathMap: Map[String, String] = {
    var map = collection.mutable.Map[String, String]()

    val fs = FileSystem.get(SparkHadoopUtil.get.conf)
    val files = fs.listFiles(new Path(path), true)
    while (files.hasNext) {
      val path = files.next.getPath
      val filename = path.getName
      if (patterns.isEmpty || patterns.exists(filename.matches)) {
        if (map.contains(filename)) throw new RuntimeException("duplicate filename: " + filename)
        map += filename -> path.getParent.toString.intern
      }
    }

    map.toMap
  }

  def pathToFile(file: String): Option[Path] = Try { new Path(file).getName }.toOption match {
    case Some(f) => pathMap.get(f).map(dir => new Path(dir, f))
    case None => None
  }
}
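An illustrative call, with a placeholder directory and pattern:

// Index every *.warc.gz under the base directory, then resolve a bare filename
// back to its full HDFS path (None if it was never indexed).
val index = FilePathMap("/data/archives", Seq(".*\\.warc\\.gz"))
val resolved: Option[org.apache.hadoop.fs.Path] = index.pathToFile("segment-00001.warc.gz")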
Example 41
Source File: OrcFileOperator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.IOException import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[hive] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None, ignoreCorruptFiles: Boolean = false) : Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => val reader = try { Some(OrcFile.createReader(fs, path)) } catch { case e: IOException => if (ignoreCorruptFiles) { logWarning(s"Skipped the footer in the corrupted file: $path", e) None } else { throw new SparkException(s"Could not read footer for file: $path", e) } } path -> reader }.collectFirst { case (path, Some(reader)) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration], ignoreCorruptFiles: Boolean) : Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.toIterator.map(getFileReader(_, conf, ignoreCorruptFiles)).collectFirst { case Some(reader) => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
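This variant adds an ignoreCorruptFiles flag that skips ORC files with unreadable footers instead of failing. A short sketch of the schema read it enables; the warehouse path is a placeholder, and since the object is private[hive] this only compiles from code inside that package:

import org.apache.hadoop.conf.Configuration

// Returns None if no readable, non-empty ORC file is found under the path.
val schema = OrcFileOperator.readSchema(
  Seq("/warehouse/db.db/table"), Some(new Configuration), ignoreCorruptFiles = true)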
Example 42
Source File: HiveUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import java.net.URL import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.QueryTest import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.{ExamplePoint, ExamplePointUDT, SQLTestUtils} import org.apache.spark.util.{ChildFirstURLClassLoader, MutableURLClassLoader} class HiveUtilsSuite extends QueryTest with SQLTestUtils with TestHiveSingleton { test("newTemporaryConfiguration overwrites listener configurations") { Seq(true, false).foreach { useInMemoryDerby => val conf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(conf(ConfVars.METASTORE_PRE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_EVENT_LISTENERS.varname) === "") assert(conf(ConfVars.METASTORE_END_FUNCTION_LISTENERS.varname) === "") } } test("newTemporaryConfiguration respect spark.hadoop.foo=bar in SparkConf") { sys.props.put("spark.hadoop.foo", "bar") Seq(true, false) foreach { useInMemoryDerby => val hiveConf = HiveUtils.newTemporaryConfiguration(useInMemoryDerby) assert(!hiveConf.contains("spark.hadoop.foo")) assert(hiveConf("foo") === "bar") } } test("ChildFirstURLClassLoader's parent is null, get spark classloader instead") { val conf = new SparkConf val contextClassLoader = Thread.currentThread().getContextClassLoader val loader = new ChildFirstURLClassLoader(Array(), contextClassLoader) try { Thread.currentThread().setContextClassLoader(loader) HiveUtils.newClientForMetadata( conf, SparkHadoopUtil.newConfiguration(conf), HiveUtils.newTemporaryConfiguration(useInMemoryDerby = true)) } finally { Thread.currentThread().setContextClassLoader(contextClassLoader) } } test("toHiveString correctly handles UDTs") { val point = new ExamplePoint(50.0, 50.0) val tpe = new ExamplePointUDT() assert(HiveUtils.toHiveString((point, tpe)) === "(50.0, 50.0)") } }
Example 43
Source File: CommitFailureTestRelationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 44
Source File: HiveCliSessionStateSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.hadoop.hive.cli.CliSessionState import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.ql.session.SessionState import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.hive.HiveUtils class HiveCliSessionStateSuite extends SparkFunSuite { def withSessionClear(f: () => Unit): Unit = { try f finally SessionState.detachSession() } test("CliSessionState will be reused") { withSessionClear { () => val hiveConf = new HiveConf(classOf[SessionState]) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hiveConf.set(key, value) } val sessionState: SessionState = new CliSessionState(hiveConf) SessionState.start(sessionState) val s1 = SessionState.get val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) val s2 = HiveUtils.newClientForMetadata(sparkConf, hadoopConf).getState assert(s1 === s2) assert(s2.isInstanceOf[CliSessionState]) } } test("SessionState will not be reused") { withSessionClear { () => val sparkConf = new SparkConf() val hadoopConf = SparkHadoopUtil.get.newConfiguration(sparkConf) HiveUtils.newTemporaryConfiguration(useInMemoryDerby = false).foreach { case (key, value) => hadoopConf.set(key, value) } val hiveClient = HiveUtils.newClientForMetadata(sparkConf, hadoopConf) val s1 = hiveClient.getState val s2 = hiveClient.newSession().getState assert(s1 !== s2) } } }
Example 45
Source File: OrcFileOperator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.Logging import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.hive.HiveMetastoreTypes import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(path: String, conf: Option[Configuration]): StructType = { val reader = getFileReader(path, conf).getOrElse { throw new AnalysisException( s"Failed to discover schema from ORC files stored in $path. " + "Probably there are either no ORC files or only empty ORC files.") } val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $path, got Hive schema string: $schema") HiveMetastoreTypes.toDataType(schema).asInstanceOf[StructType] } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val path = origPath.makeQualified(fs.getUri, fs.getWorkingDirectory) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDir) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) if (paths == null || paths.isEmpty) { throw new IllegalArgumentException( s"orcFileOperator: path $path does not have valid orc files matching the pattern") } paths } }
Example 46
Source File: JsonHadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 47
Source File: CommitFailureTestRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 48
Source File: SimpleTextHadoopFsRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 49
Source File: EventLogDownloadResource.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1 import java.io.OutputStream import java.util.zip.ZipOutputStream import javax.ws.rs.{GET, Produces} import javax.ws.rs.core.{MediaType, Response, StreamingOutput} import scala.util.control.NonFatal import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging @Produces(Array(MediaType.APPLICATION_OCTET_STREAM)) private[v1] class EventLogDownloadResource( val uIRoot: UIRoot, val appId: String, val attemptId: Option[String]) extends Logging { val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf) @GET def getEventLogs(): Response = { try { val fileName = { attemptId match { case Some(id) => s"eventLogs-$appId-$id.zip" case None => s"eventLogs-$appId.zip" } } val stream = new StreamingOutput { override def write(output: OutputStream): Unit = { val zipStream = new ZipOutputStream(output) try { uIRoot.writeEventLogs(appId, attemptId, zipStream) } finally { zipStream.close() } } } Response.ok(stream) .header("Content-Disposition", s"attachment; filename=$fileName") .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM) .build() } catch { case NonFatal(e) => Response.serverError() .entity(s"Event logs are not available for app: $appId.") .status(Response.Status.SERVICE_UNAVAILABLE) .build() } } }
Example 50
Source File: SimrSchedulerBackend.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.cluster

import org.apache.hadoop.fs.{Path, FileSystem}

import org.apache.spark.{Logging, SparkContext, SparkEnv}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.util.AkkaUtils

private[spark] class SimrSchedulerBackend(
    scheduler: TaskSchedulerImpl,
    sc: SparkContext,
    driverFilePath: String)
  extends CoarseGrainedSchedulerBackend(scheduler, sc.env.actorSystem)
  with Logging {

  val tmpPath = new Path(driverFilePath + "_tmp")
  val filePath = new Path(driverFilePath)

  val maxCores = conf.getInt("spark.simr.executor.cores", 1)

  override def start() {
    super.start()

    val driverUrl = AkkaUtils.address(
      AkkaUtils.protocol(actorSystem),
      SparkEnv.driverActorSystemName,
      sc.conf.get("spark.driver.host"),
      sc.conf.get("spark.driver.port"),
      CoarseGrainedSchedulerBackend.ACTOR_NAME)

    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    val appUIAddress = sc.ui.map(_.appUIAddress).getOrElse("")

    logInfo("Writing to HDFS file: " + driverFilePath)
    logInfo("Writing Akka address: " + driverUrl)
    logInfo("Writing Spark UI Address: " + appUIAddress)

    // Create temporary file to prevent race condition where executors get empty driverUrl file
    val temp = fs.create(tmpPath, true)
    temp.writeUTF(driverUrl)
    temp.writeInt(maxCores)
    temp.writeUTF(appUIAddress)
    temp.close()

    // "Atomic" rename
    fs.rename(tmpPath, filePath)
  }

  override def stop() {
    val conf = SparkHadoopUtil.get.newConfiguration(sc.conf)
    val fs = FileSystem.get(conf)
    fs.delete(new Path(driverFilePath), false)
    super.stop()
  }
}
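The temp-file-then-rename idiom used in start() above, isolated into a small hedged helper for clarity; the helper name is my own, not from the example:

import java.io.DataOutputStream
import org.apache.hadoop.fs.{FileSystem, Path}

// Write to "<target>_tmp" first, then rename, so readers never observe a partially written file.
def writeAtomically(fs: FileSystem, target: Path)(body: DataOutputStream => Unit): Unit = {
  val tmp = new Path(target.toString + "_tmp")
  val out = fs.create(tmp, true)
  try body(out) finally out.close()
  fs.rename(tmp, target)
}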
Example 51
Source File: WholeTextFileRecordReader.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.input

import org.apache.hadoop.conf.{Configuration, Configurable => HConfigurable}
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.io.Text
import org.apache.hadoop.io.compress.CompressionCodecFactory
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.lib.input.{CombineFileSplit, CombineFileRecordReader}
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.spark.deploy.SparkHadoopUtil

private[spark] class ConfigurableCombineFileRecordReader[K, V](
    split: InputSplit,
    context: TaskAttemptContext,
    recordReaderClass: Class[_ <: RecordReader[K, V] with HConfigurable])
  extends CombineFileRecordReader[K, V](
    split.asInstanceOf[CombineFileSplit],
    context,
    recordReaderClass
  ) with Configurable {

  override def initNextRecordReader(): Boolean = {
    val r = super.initNextRecordReader()
    if (r) {
      this.curReader.asInstanceOf[HConfigurable].setConf(getConf)
    }
    r
  }
}
Example 52
Source File: OrcFileOperator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.hadoop.hive.ql.io.orc.{OrcFile, Reader} import org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.types.StructType private[orc] object OrcFileOperator extends Logging { def getFileReader(basePath: String, config: Option[Configuration] = None): Option[Reader] = { def isWithNonEmptySchema(path: Path, reader: Reader): Boolean = { reader.getObjectInspector match { case oi: StructObjectInspector if oi.getAllStructFieldRefs.size() == 0 => logInfo( s"ORC file $path has empty schema, it probably contains no rows. " + "Trying to read another ORC file to figure out the schema.") false case _ => true } } val conf = config.getOrElse(new Configuration) val fs = { val hdfsPath = new Path(basePath) hdfsPath.getFileSystem(conf) } listOrcFiles(basePath, conf).iterator.map { path => path -> OrcFile.createReader(fs, path) }.collectFirst { case (path, reader) if isWithNonEmptySchema(path, reader) => reader } } def readSchema(paths: Seq[String], conf: Option[Configuration]): Option[StructType] = { // Take the first file where we can open a valid reader if we can find one. Otherwise just // return None to indicate we can't infer the schema. paths.flatMap(getFileReader(_, conf)).headOption.map { reader => val readerInspector = reader.getObjectInspector.asInstanceOf[StructObjectInspector] val schema = readerInspector.getTypeName logDebug(s"Reading schema from file $paths, got Hive schema string: $schema") CatalystSqlParser.parseDataType(schema).asInstanceOf[StructType] } } def getObjectInspector( path: String, conf: Option[Configuration]): Option[StructObjectInspector] = { getFileReader(path, conf).map(_.getObjectInspector.asInstanceOf[StructObjectInspector]) } def listOrcFiles(pathStr: String, conf: Configuration): Seq[Path] = { // TODO: Check if the paths coming in are already qualified and simplify. val origPath = new Path(pathStr) val fs = origPath.getFileSystem(conf) val paths = SparkHadoopUtil.get.listLeafStatuses(fs, origPath) .filterNot(_.isDirectory) .map(_.getPath) .filterNot(_.getName.startsWith("_")) .filterNot(_.getName.startsWith(".")) paths } }
Example 53
Source File: JsonHadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.math.BigDecimal import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.Row import org.apache.spark.sql.types._ class JsonHadoopFsRelationSuite extends HadoopFsRelationTest { override val dataSourceName: String = "json" // JSON does not write data of NullType and does not play well with BinaryType. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: NullType => false case _: BinaryType => false case _: CalendarIntervalType => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"""{"a":$i,"b":"val_$i"}""") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("SPARK-9894: save complex types to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("array", ArrayType(LongType)) .add("map", MapType(StringType, new StructType().add("innerField", LongType))) val data = Row(Seq(1L, 2L, 3L), Map("m1" -> Row(4L))) :: Row(Seq(5L, 6L, 7L), Map("m2" -> Row(10L))) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } test("SPARK-10196: save decimal type to JSON") { withTempDir { file => file.delete() val schema = new StructType() .add("decimal", DecimalType(7, 2)) val data = Row(new BigDecimal("10.02")) :: Row(new BigDecimal("20000.99")) :: Row(new BigDecimal("10000")) :: Nil val df = spark.createDataFrame(sparkContext.parallelize(data), schema) // Write the data out. df.write.format(dataSourceName).save(file.getCanonicalPath) // Read it back and check the result. checkAnswer( spark.read.format(dataSourceName).schema(schema).load(file.getCanonicalPath), df ) } } }
Example 54
Source File: CommitFailureTestRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.SparkException import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.functions._ import org.apache.spark.sql.hive.test.TestHiveSingleton import org.apache.spark.sql.test.SQLTestUtils class CommitFailureTestRelationSuite extends SQLTestUtils with TestHiveSingleton { // When committing a task, `CommitFailureTestSource` throws an exception for testing purpose. val dataSourceName: String = classOf[CommitFailureTestSource].getCanonicalName test("SPARK-7684: commitTask() failure should fallback to abortTask()") { withTempPath { file => // Here we coalesce partition number to 1 to ensure that only a single task is issued. This // prevents race condition happened when FileOutputCommitter tries to remove the `_temporary` // directory while committing/aborting the job. See SPARK-8513 for more details. val df = spark.range(0, 10).coalesce(1) intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - default") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val divideByZero = udf((x: Int) => { x / (x - 1)}) val df = spark.range(0, 10).coalesce(1).select(divideByZero(col("id"))) SimpleTextRelation.callbackCalled = false intercept[SparkException] { df.write.format(dataSourceName).save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } test("call failure callbacks before close writer - partitioned") { SimpleTextRelation.failCommitter = false withTempPath { file => // fail the job in the middle of writing val df = spark.range(0, 10).coalesce(1).select(col("id").mod(2).as("key"), col("id")) SimpleTextRelation.callbackCalled = false SimpleTextRelation.failWriter = true intercept[SparkException] { df.write.format(dataSourceName).partitionBy("key").save(file.getCanonicalPath) } assert(SimpleTextRelation.callbackCalled, "failure callback should be called") val fs = new Path(file.getCanonicalPath).getFileSystem(SparkHadoopUtil.get.conf) assert(!fs.exists(new Path(file.getCanonicalPath, "_temporary"))) } } }
Example 55
Source File: SimpleTextHadoopFsRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.hadoop.fs.Path import org.apache.spark.deploy.SparkHadoopUtil import org.apache.spark.sql.catalyst.expressions.PredicateHelper import org.apache.spark.sql.types._ class SimpleTextHadoopFsRelationSuite extends HadoopFsRelationTest with PredicateHelper { override val dataSourceName: String = classOf[SimpleTextSource].getCanonicalName // We have a very limited number of supported types at here since it is just for a // test relation and we do very basic testing at here. override protected def supportsDataType(dataType: DataType): Boolean = dataType match { case _: BinaryType => false // We are using random data generator and the generated strings are not really valid string. case _: StringType => false case _: BooleanType => false // see https://issues.apache.org/jira/browse/SPARK-10442 case _: CalendarIntervalType => false case _: DateType => false case _: TimestampType => false case _: ArrayType => false case _: MapType => false case _: StructType => false case _: UserDefinedType[_] => false case _ => true } test("save()/load() - partitioned table - simple queries - partition columns in data") { withTempDir { file => val basePath = new Path(file.getCanonicalPath) val fs = basePath.getFileSystem(SparkHadoopUtil.get.conf) val qualifiedBasePath = fs.makeQualified(basePath) for (p1 <- 1 to 2; p2 <- Seq("foo", "bar")) { val partitionDir = new Path(qualifiedBasePath, s"p1=$p1/p2=$p2") sparkContext .parallelize(for (i <- 1 to 3) yield s"$i,val_$i,$p1") .saveAsTextFile(partitionDir.toString) } val dataSchemaWithPartition = StructType(dataSchema.fields :+ StructField("p1", IntegerType, nullable = true)) checkQueries( spark.read.format(dataSourceName) .option("dataSchema", dataSchemaWithPartition.json) .load(file.getCanonicalPath)) } } test("test hadoop conf option propagation") { withTempPath { file => // Test write side val df = spark.range(10).selectExpr("cast(id as string)") df.write .option("some-random-write-option", "hahah-WRITE") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName).save(file.getAbsolutePath) assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-write-option") == "hahah-WRITE") // Test read side val df1 = spark.read .option("some-random-read-option", "hahah-READ") .option("some-null-value-option", null) // test null robustness .option("dataSchema", df.schema.json) .format(dataSourceName) .load(file.getAbsolutePath) df1.count() assert(SimpleTextRelation.lastHadoopConf.get.get("some-random-read-option") == "hahah-READ") } } }
Example 56
Source File: EventLogDownloadResource.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.status.api.v1

import java.io.OutputStream
import java.util.zip.ZipOutputStream
import javax.ws.rs.{GET, Produces}
import javax.ws.rs.core.{MediaType, Response, StreamingOutput}

import scala.util.control.NonFatal

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.internal.Logging

@Produces(Array(MediaType.APPLICATION_OCTET_STREAM))
private[v1] class EventLogDownloadResource(
    val uIRoot: UIRoot,
    val appId: String,
    val attemptId: Option[String]) extends Logging {
  val conf = SparkHadoopUtil.get.newConfiguration(new SparkConf)

  @GET
  def getEventLogs(): Response = {
    try {
      val fileName = {
        attemptId match {
          case Some(id) => s"eventLogs-$appId-$id.zip"
          case None => s"eventLogs-$appId.zip"
        }
      }

      val stream = new StreamingOutput {
        override def write(output: OutputStream): Unit = {
          val zipStream = new ZipOutputStream(output)
          try {
            uIRoot.writeEventLogs(appId, attemptId, zipStream)
          } finally {
            zipStream.close()
          }
        }
      }

      Response.ok(stream)
        .header("Content-Disposition", s"attachment; filename=$fileName")
        .header("Content-Type", MediaType.APPLICATION_OCTET_STREAM)
        .build()
    } catch {
      case NonFatal(e) =>
        Response.serverError()
          .entity(s"Event logs are not available for app: $appId.")
          .status(Response.Status.SERVICE_UNAVAILABLE)
          .build()
    }
  }
}
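This resource backs the REST endpoint that serves an application's event logs as a zip file (reachable under /api/v1/applications/<app-id>/logs on the history server). A rough client-side sketch, assuming a history server on localhost:18080 and a placeholder application id:

import java.io.{BufferedInputStream, FileOutputStream}
import java.net.URL

object DownloadEventLogs {
  def main(args: Array[String]): Unit = {
    val appId = "app-20180101000000-0000"  // placeholder application id
    val url = new URL(s"http://localhost:18080/api/v1/applications/$appId/logs")
    val in = new BufferedInputStream(url.openStream())
    val out = new FileOutputStream(s"eventLogs-$appId.zip")
    try {
      // Copy the zipped event logs byte by byte to a local file.
      Iterator.continually(in.read()).takeWhile(_ != -1).foreach(out.write)
    } finally {
      in.close()
      out.close()
    }
  }
}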
Example 57
Source File: HDFSConfigHelper.scala From sparklens with Apache License 2.0 | 5 votes |
package com.qubole.sparklens.helper

import org.apache.hadoop.conf.Configuration
import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkHadoopUtil

object HDFSConfigHelper {

  def getHadoopConf(sparkConfOptional: Option[SparkConf]): Configuration = {
    if (sparkConfOptional.isDefined) {
      SparkHadoopUtil.get.newConfiguration(sparkConfOptional.get)
    } else {
      val sparkConf = new SparkConf()
      SparkHadoopUtil.get.newConfiguration(sparkConf)
    }
  }
}
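The helper above simply delegates to SparkHadoopUtil.get.newConfiguration, falling back to a fresh SparkConf when none is supplied; newConfiguration also copies any spark.hadoop.* properties into the resulting Hadoop Configuration with the prefix stripped. A minimal usage sketch, where the fs.defaultFS value and the HDFS path are illustrative assumptions:

import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.SparkConf
import com.qubole.sparklens.helper.HDFSConfigHelper

object HDFSConfigHelperUsage {
  def main(args: Array[String]): Unit = {
    // Build a Hadoop Configuration from an explicit SparkConf;
    // spark.hadoop.* keys are forwarded with the prefix stripped.
    val sparkConf = new SparkConf().set("spark.hadoop.fs.defaultFS", "hdfs://namenode:8020")
    val hadoopConf = HDFSConfigHelper.getHadoopConf(Some(sparkConf))

    // Or let the helper fall back to a default SparkConf.
    val defaultHadoopConf = HDFSConfigHelper.getHadoopConf(None)
    println(defaultHadoopConf.get("fs.defaultFS"))

    // Either Configuration can be used to talk to the file system directly.
    val fs = FileSystem.get(hadoopConf)
    println(fs.exists(new Path("/tmp/sparklens")))  // hypothetical path
  }
}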
Example 58
Source File: CarbonCountStar.scala From carbondata with Apache License 2.0 | 4 votes |
package org.apache.spark.sql

import scala.collection.JavaConverters._

import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, TableIdentifier}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.LeafExecNode
import org.apache.spark.sql.optimizer.CarbonFilters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.metadata.AbsoluteTableIdentifier
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.mutate.CarbonUpdateUtil
import org.apache.carbondata.core.statusmanager.StageInputCollector
import org.apache.carbondata.core.util.{CarbonProperties, ThreadLocalSessionInfo}
import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat}
import org.apache.carbondata.hadoop.util.CarbonInputFormatUtil
import org.apache.carbondata.spark.load.DataLoadProcessBuilderOnSpark

case class CarbonCountStar(
    attributesRaw: Seq[Attribute],
    carbonTable: CarbonTable,
    sparkSession: SparkSession,
    outUnsafeRows: Boolean = true) extends LeafExecNode {

  override def doExecute(): RDD[InternalRow] = {
    ThreadLocalSessionInfo
      .setConfigurationToCurrentThread(sparkSession.sessionState.newHadoopConf())
    val absoluteTableIdentifier = carbonTable.getAbsoluteTableIdentifier
    val (job, tableInputFormat) = createCarbonInputFormat(absoluteTableIdentifier)
    CarbonInputFormat.setQuerySegment(job.getConfiguration, carbonTable)

    // get row count
    var rowCount = CarbonUpdateUtil.getRowCount(
      tableInputFormat.getBlockRowCount(
        job,
        carbonTable,
        CarbonFilters.getPartitions(
          Seq.empty,
          sparkSession,
          TableIdentifier(
            carbonTable.getTableName,
            Some(carbonTable.getDatabaseName))).map(_.asJava).orNull, false),
      carbonTable)

    if (CarbonProperties.isQueryStageInputEnabled) {
      // check for number of row for stage input
      val splits = StageInputCollector.createInputSplits(carbonTable, job.getConfiguration)
      if (!splits.isEmpty) {
        val df = DataLoadProcessBuilderOnSpark.createInputDataFrame(
          sparkSession, carbonTable, splits.asScala)
        rowCount += df.count()
      }
    }

    val valueRaw = attributesRaw.head.dataType match {
      case StringType => Seq(UTF8String.fromString(Long.box(rowCount).toString)).toArray
        .asInstanceOf[Array[Any]]
      case _ => Seq(Long.box(rowCount)).toArray.asInstanceOf[Array[Any]]
    }
    val value = new GenericInternalRow(valueRaw)
    val unsafeProjection = UnsafeProjection.create(output.map(_.dataType).toArray)
    val row = if (outUnsafeRows) unsafeProjection(value) else value
    sparkContext.parallelize(Seq(row))
  }

  override def output: Seq[Attribute] = {
    attributesRaw
  }

  private def createCarbonInputFormat(absoluteTableIdentifier: AbsoluteTableIdentifier
  ): (Job, CarbonTableInputFormat[Array[Object]]) = {
    val carbonInputFormat = new CarbonTableInputFormat[Array[Object]]()
    val jobConf: JobConf = new JobConf(FileFactory.getConfiguration)
    SparkHadoopUtil.get.addCredentials(jobConf)
    CarbonInputFormat.setTableInfo(jobConf, carbonTable.getTableInfo)
    val job = new Job(jobConf)
    FileInputFormat.addInputPath(job, new Path(absoluteTableIdentifier.getTablePath))
    CarbonInputFormat
      .setTransactionalTable(job.getConfiguration, carbonTable.getTableInfo.isTransactionalTable)
    CarbonInputFormatUtil.setIndexJobIfConfigured(job.getConfiguration)
    (job, carbonInputFormat)
  }
}
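The SparkHadoopUtil call of interest in this example is addCredentials, which copies the current user's Hadoop credentials (delegation tokens) into the JobConf before a Hadoop Job is created, so the input format can read from a secured cluster. A stripped-down sketch of that pattern outside CarbonData, with an assumed input path:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.Job
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.deploy.SparkHadoopUtil

object SecureJobSetup {
  def newJob(inputPath: String): Job = {
    val jobConf = new JobConf(new Configuration())
    // Attach the current user's delegation tokens so tasks spawned from this
    // job can access kerberized HDFS.
    SparkHadoopUtil.get.addCredentials(jobConf)
    val job = Job.getInstance(jobConf)
    FileInputFormat.addInputPath(job, new Path(inputPath))
    job
  }

  def main(args: Array[String]): Unit = {
    val job = newJob("/data/carbon/store/default/t1")  // hypothetical table path
    println(job.getConfiguration.get("mapreduce.input.fileinputformat.inputdir"))
  }
}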