org.apache.spark.internal.config.ConfigBuilder Scala Examples
The following examples show how to use org.apache.spark.internal.config.ConfigBuilder.
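All of the examples follow the same pattern: ConfigBuilder(key) starts the builder, a type method (stringConf, intConf, booleanConf, timeConf, bytesConf) fixes the value type, optional calls such as doc, internal, transform, toSequence and checkValue refine the entry, and a create* method produces a ConfigEntry. The minimal sketch below illustrates the pattern; the spark.example.maxRetries key and the ExampleConfig object are made up for illustration and are not taken from any of the projects cited here.

package org.apache.spark.example

import org.apache.spark.SparkConf
import org.apache.spark.internal.config.{ConfigBuilder, ConfigEntry}

// Hypothetical config holder: ConfigBuilder is private[spark], so definitions
// like this must live somewhere under the org.apache.spark package tree.
private[spark] object ExampleConfig {

  val MAX_RETRIES: ConfigEntry[Int] =
    ConfigBuilder("spark.example.maxRetries")
      .doc("How many times to retry the example operation before giving up")
      .intConf
      .checkValue(_ >= 0, "spark.example.maxRetries must not be negative")
      .createWithDefault(3)

  // Resolving the value: SparkConf.get(entry) is also private[spark], so it is
  // only callable from Spark-internal code; external code reads the plain key.
  def maxRetries(conf: SparkConf): Int = conf.get(MAX_RETRIES)
}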
Example 1
Source File: Monitor.scala From XSQL with Apache License 2.0
package org.apache.spark.monitor

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkConf
import org.apache.spark.alarm.{Alarm, AlertMessage}
import org.apache.spark.alarm.AlertType.AlertType
import org.apache.spark.internal.config.ConfigBuilder
import org.apache.spark.monitor.MonitorItem.MonitorItem
import org.apache.spark.scheduler.SparkListenerEvent
import org.apache.spark.status.AppStatusStore
import org.apache.spark.util.kvstore.KVStore

trait Monitor {
  val alertType: Seq[AlertType]
  val item: MonitorItem
  val alarms: ArrayBuffer[Alarm] = ArrayBuffer()
  var kvStore: KVStore = null
  var appStore: AppStatusStore = null
  var conf: SparkConf = null

  def watchOut(event: SparkListenerEvent): Option[AlertMessage]

  def bind(alarm: Alarm): Monitor = {
    alarms.append(alarm)
    this
  }

  def bind(alarms: Seq[Alarm]): Monitor = {
    this.alarms.appendAll(alarms)
    this
  }

  def bind(kvStore: KVStore): Monitor = {
    this.kvStore = kvStore
    this.appStore = new AppStatusStore(kvStore)
    this
  }

  def bind(conf: SparkConf): Monitor = {
    this.conf = conf
    this
  }

  def onEvent(event: SparkListenerEvent): Unit = {
    val message = watchOut(event)
    if (message.isDefined) {
      alarms.foreach(_.alarm(message.get))
    }
  }
}

object Monitor {
  val commonClasses = Seq(
    "org.apache.spark.sql.xsql.shell.SparkXSQLShell",
    "org.apache.spark.repl.Main",
    "org.apache.spark.sql.hive.xitong.shell.SparkHiveShell",
    "org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver")
  val dateFormats = Seq("yyyy-MM-dd", "yyyy/MM/dd", "yyyyMMdd")
  val PREFIX = "spark.monitor"

  private[spark] val MONITOR_ITEMS = ConfigBuilder("spark.monitor.items")
    .internal()
    .doc("choose monitors to open, split with `,`")
    .stringConf
    .transform(_.toUpperCase)
    .toSequence
    .checkValue(
      _.toSet.subsetOf(MonitorItem.values.map(_.toString)),
      s"must be one of ${MonitorItem.values.map(_.toString)}")
    .createWithDefault(Seq.empty)
}

object MonitorItem extends Enumeration {
  type MonitorItem = Value
  val SQL_CHANGE_NOTIFIER = Value
  val APP_FINISH_NOTIFIER, EXECUTOR_NUM_NOTIFIER, DATASKEW_NOTIFIER, EXECUTOR_MEMORY_ADVISER = Value
  val SPARK_APPLICATION_SUMMARY, APP_IDLE_WARNER = Value
}
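A hypothetical consumer of the MONITOR_ITEMS entry could look like the sketch below; the MonitorConfigReader helper is illustrative and not part of the XSQL sources. The transform(_.toUpperCase) and checkValue calls above guarantee that every resolved string maps onto a MonitorItem value.

package org.apache.spark.monitor

import org.apache.spark.SparkConf
import org.apache.spark.monitor.MonitorItem.MonitorItem

// Hypothetical helper, not part of the XSQL sources: resolve which monitors are
// enabled. SparkConf.get(entry) is private[spark], which is why this object sits
// in the same package tree as Monitor itself.
private[spark] object MonitorConfigReader {
  def enabledItems(conf: SparkConf): Set[MonitorItem] =
    conf.get(Monitor.MONITOR_ITEMS).map(MonitorItem.withName).toSet
}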
Example 2
Source File: RemoteShuffleConf.scala From OAP with Apache License 2.0
package org.apache.spark.shuffle.remote

import org.apache.spark.internal.config.{ConfigBuilder, ConfigEntry}

object RemoteShuffleConf {

  val STORAGE_MASTER_URI: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.storageMasterUri")
      .doc("Contact this storage master while persisting shuffle files")
      .stringConf
      .createWithDefault("hdfs://localhost:9001")

  val STORAGE_HDFS_MASTER_UI_PORT: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.hdfs.storageMasterUIPort")
      .doc("Contact this UI port to retrieve HDFS configurations")
      .stringConf
      .createWithDefault("50070")

  val SHUFFLE_FILES_ROOT_DIRECTORY: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.filesRootDirectory")
      .doc("Use this as the root directory for shuffle files")
      .stringConf
      .createWithDefault("/shuffle")

  val DFS_REPLICATION: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.hdfs.replication")
      .doc("The default replication of the remote storage system; overrides dfs.replication" +
        " when HDFS is used as shuffle storage")
      .intConf
      .createWithDefault(3)

  val REMOTE_OPTIMIZED_SHUFFLE_ENABLED: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.optimizedPathEnabled")
      .doc("Enable using unsafe-optimized shuffle writer")
      .internal()
      .booleanConf
      .createWithDefault(true)

  val REMOTE_BYPASS_MERGE_THRESHOLD: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.bypassMergeThreshold")
      .doc("Remote shuffle manager uses this threshold to decide whether to use bypass-merge" +
        " (hash-based) shuffle. A new configuration is introduced (-1 by default) because we" +
        " want to explicitly make disabling the hash-based shuffle writer the default behavior." +
        " When memory is relatively sufficient, the sort-based shuffle writer in remote shuffle" +
        " is often more efficient than the hash-based one, because the bypass-merge shuffle" +
        " writer performs I/O of 3x the total shuffle size: 1x for read I/O and 2x for write" +
        " I/O. This can be an even larger overhead under remote shuffle, since the 3x shuffle" +
        " size goes through the network to the remote storage system.")
      .intConf
      .createWithDefault(-1)

  val REMOTE_INDEX_CACHE_SIZE: ConfigEntry[String] =
    ConfigBuilder("spark.shuffle.remote.index.cache.size")
      .doc("This index file cache resides in each executor. If it's a positive value, the index" +
        " cache will be turned on: instead of reading index files directly from remote storage," +
        " a reducer will fetch the index files over the network from the executors that wrote" +
        " them, and those executors will return the index files kept in cache (reading them" +
        " from storage if needed).")
      .stringConf
      .createWithDefault("0")

  val NUM_TRANSFER_SERVICE_THREADS: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.numIndexReadThreads")
      .doc("The maximum number of server/client threads used in RemoteShuffleTransferService" +
        " for index file transfers")
      .intConf
      .createWithDefault(3)

  val NUM_CONCURRENT_FETCH: ConfigEntry[Int] =
    ConfigBuilder("spark.shuffle.remote.numReadThreads")
      .doc("The maximum number of concurrent reading threads fetching shuffle data blocks")
      .intConf
      .createWithDefault(Runtime.getRuntime.availableProcessors())

  val REUSE_FILE_HANDLE: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.reuseFileHandle")
      .doc("When this feature is switched on, the file handles returned by FileSystem open" +
        " operations will be cached/reused inside an executor (across different rounds of" +
        " reduce tasks), eliminating open overhead. This should improve reduce stage" +
        " performance only when file open operations occupy the majority of the time, e.g." +
        " there is a large number of shuffle blocks, each reading a fairly small block of" +
        " data, and there is no other compute in the reduce stage.")
      .booleanConf
      .createWithDefault(false)

  val DATA_FETCH_EAGER_REQUIREMENT: ConfigEntry[Boolean] =
    ConfigBuilder("spark.shuffle.remote.eagerRequirementDataFetch")
      .doc("With eager requirement = false, a shuffle block is counted ready and served for" +
        " compute only after all content of the block is put in Spark's local memory. With" +
        " eager requirement = true, a shuffle block is served to later compute as soon as the" +
        " required bytes are fetched and put in memory.")
      .booleanConf
      .createWithDefault(false)
}
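Because every entry above is created with a default, none of these keys is mandatory; a job would typically override only a few of them. The sketch below is illustrative only; the RemoteShuffleOverrides object and the values are examples, not tuning recommendations.

import org.apache.spark.SparkConf

object RemoteShuffleOverrides {
  // Illustrative overrides: the keys come from RemoteShuffleConf above,
  // the values are examples only, not tuning recommendations.
  def conf(): SparkConf = new SparkConf()
    .set("spark.shuffle.remote.storageMasterUri", "hdfs://namenode:9001")
    .set("spark.shuffle.remote.bypassMergeThreshold", "200")
    .set("spark.shuffle.remote.reuseFileHandle", "true")
}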
Example 3
Source File: config.scala From sparkoscope with Apache License 2.0
package org.apache.spark.deploy.mesos

import java.util.concurrent.TimeUnit

import org.apache.spark.internal.config.ConfigBuilder

package object config {

  private[spark] val SHUFFLE_CLEANER_INTERVAL_S =
    ConfigBuilder("spark.shuffle.cleaner.interval")
      .timeConf(TimeUnit.SECONDS)
      .createWithDefaultString("30s")

  private[spark] val RECOVERY_MODE =
    ConfigBuilder("spark.deploy.recoveryMode")
      .stringConf
      .createWithDefault("NONE")

  private[spark] val DISPATCHER_WEBUI_URL =
    ConfigBuilder("spark.mesos.dispatcher.webui.url")
      .doc("Set the Spark Mesos dispatcher webui_url for interacting with the " +
        "framework. If unset it will point to Spark's internal web UI.")
      .stringConf
      .createOptional

  private[spark] val ZOOKEEPER_URL =
    ConfigBuilder("spark.deploy.zookeeper.url")
      .doc("When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this " +
        "configuration is used to set the zookeeper URL to connect to.")
      .stringConf
      .createOptional

  private[spark] val HISTORY_SERVER_URL =
    ConfigBuilder("spark.mesos.dispatcher.historyServer.url")
      .doc("Set the URL of the history server. The dispatcher will then " +
        "link each driver to its entry in the history server.")
      .stringConf
      .createOptional
}
Example 4
Source File: config.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.deploy.mesos

import java.util.concurrent.TimeUnit

import org.apache.spark.internal.config.ConfigBuilder

package object config {

  private[spark] val SHUFFLE_CLEANER_INTERVAL_S =
    ConfigBuilder("spark.shuffle.cleaner.interval")
      .timeConf(TimeUnit.SECONDS)
      .createWithDefaultString("30s")

  private[spark] val RECOVERY_MODE =
    ConfigBuilder("spark.deploy.recoveryMode")
      .stringConf
      .createWithDefault("NONE")

  private[spark] val DISPATCHER_WEBUI_URL =
    ConfigBuilder("spark.mesos.dispatcher.webui.url")
      .doc("Set the Spark Mesos dispatcher webui_url for interacting with the " +
        "framework. If unset it will point to Spark's internal web UI.")
      .stringConf
      .createOptional

  private[spark] val ZOOKEEPER_URL =
    ConfigBuilder("spark.deploy.zookeeper.url")
      .doc("When `spark.deploy.recoveryMode` is set to ZOOKEEPER, this " +
        "configuration is used to set the zookeeper URL to connect to.")
      .stringConf
      .createOptional

  private[spark] val HISTORY_SERVER_URL =
    ConfigBuilder("spark.mesos.dispatcher.historyServer.url")
      .doc("Set the URL of the history server. The dispatcher will then " +
        "link each driver to its entry in the history server.")
      .stringConf
      .createOptional
}
Example 5
Source File: package.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.hive

import org.apache.spark.internal.config.ConfigBuilder

package object config {

  private[hive] val PROXY_USERS =
    ConfigBuilder("spark.sql.proxy.users")
      .doc("Comma-separated string of user names for which Spark Thrift Server initializes " +
        "different SparkContexts. These users must have the right to impersonate the real " +
        "user who started the driver-side JVM.")
      .stringConf
      .toSequence
      .createWithDefault(Nil)
}
Example 6
Source File: config.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy.history

import java.util.concurrent.TimeUnit

import org.apache.spark.internal.config.ConfigBuilder
import org.apache.spark.network.util.ByteUnit

private[spark] object config {

  val DEFAULT_LOG_DIR = "file:/tmp/spark-events"

  val EVENT_LOG_DIR = ConfigBuilder("spark.history.fs.logDirectory")
    .stringConf
    .createWithDefault(DEFAULT_LOG_DIR)

  val MAX_LOG_AGE_S = ConfigBuilder("spark.history.fs.cleaner.maxAge")
    .timeConf(TimeUnit.SECONDS)
    .createWithDefaultString("7d")

  val LOCAL_STORE_DIR = ConfigBuilder("spark.history.store.path")
    .doc("Local directory where to cache application history information. By default this is " +
      "not set, meaning all history information will be kept in memory.")
    .stringConf
    .createOptional

  val MAX_LOCAL_DISK_USAGE = ConfigBuilder("spark.history.store.maxDiskUsage")
    .bytesConf(ByteUnit.BYTE)
    .createWithDefaultString("10g")

  val HISTORY_SERVER_UI_PORT = ConfigBuilder("spark.history.ui.port")
    .doc("Web UI port to bind Spark History Server")
    .intConf
    .createWithDefault(18080)
}
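Note that timeConf and bytesConf entries accept unit suffixes when users set them, so overrides such as the following resolve to seconds and bytes respectively. The HistoryServerOverrides object and its values are illustrative only.

import org.apache.spark.SparkConf

object HistoryServerOverrides {
  // Illustrative only: "12h" is parsed by the timeConf entry into seconds,
  // "20g" by the bytesConf entry into bytes.
  def conf(): SparkConf = new SparkConf()
    .set("spark.history.fs.cleaner.maxAge", "12h")
    .set("spark.history.store.maxDiskUsage", "20g")
}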