org.apache.spark.sql.hive.HiveContext Scala Examples
The following examples show how to use org.apache.spark.sql.hive.HiveContext.
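Before the examples, here is a minimal sketch of the usage pattern most of them share on Spark 1.x: wrap a SparkContext in a HiveContext, issue HiveQL through sql(...), and work with the result as a DataFrame. This is an illustrative sketch only; the master setting, table name my_table, and temp-table name are placeholders and do not come from any project listed below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object HiveContextQuickStart {
  def main(args: Array[String]): Unit = {
    // App name and local master are placeholders; configure for your own cluster.
    val conf = new SparkConf().setAppName("HiveContextQuickStart").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // HiveContext extends SQLContext with HiveQL support and, if configured, metastore access.
    val hiveContext = new HiveContext(sc)

    // "my_table" is a hypothetical Hive table used only for illustration.
    val df = hiveContext.sql("SELECT key, value FROM my_table LIMIT 10")
    df.show()

    // A DataFrame can be exposed back to SQL as a temporary table (Spark 1.x API).
    df.registerTempTable("my_table_sample")
    hiveContext.sql("SELECT count(*) FROM my_table_sample").show()

    sc.stop()
  }
}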
Example 1
Source File: SqlUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 6 votes |
package com.cloudera.sa.spark.unittest.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable

class SqlUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var hiveContext: HiveContext = null

  override def beforeAll(): Unit = {
    val envMap = Map[String, String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    hiveContext = new HiveContext(sc)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Test table creation and summing of counts") {
    val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
      Row("tj", 11, "green"),
      Row("andrew", 9, "green")))

    hiveContext.sql("create table person (name string, age int, color string)")

    val emptyDataFrame = hiveContext.sql("select * from person limit 0")

    val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
    personDataFrame.registerTempTable("tempPerson")

    val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")

    val localAgeSum = ageSumDataFrame.take(10)

    assert(localAgeSum(0).get(0) == 62,
      "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
  }
}
Example 2
Source File: SparkSQLEnv.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 3
Source File: SystemArg.scala From mist with Apache License 2.0 | 5 votes |
package mist.api

import mist.api.data.JsMap
import org.apache.spark.{SparkContext, SparkSessionUtils}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaStreamingContext

trait SystemArg[A] extends ArgDef[A] {
  final def validate(params: JsMap): Extraction[Unit] = Extracted(())
}

object SystemArg {

  def apply[A](tags: Seq[String], f: => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = f
    override def describe() = Seq(InternalArgument(tags))
  }

  def apply[A](tags: Seq[String], f: FullFnContext => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = ctx match {
      case c: FullFnContext => f(c)
      case _ =>
        val desc = s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
          s"expected ${FullFnContext.getClass.getSimpleName}"
        Failed.InternalError(desc)
    }
    override def describe() = Seq(InternalArgument(tags))
  }
}

trait SparkArgs {

  val sparkContextArg: ArgDef[SparkContext] = SystemArg(
    Seq.empty,
    c => Extracted(c.sc)
  )

  val streamingContextArg: ArgDef[StreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => {
      val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(ctx.sc, ctx.streamingDuration))
      Extracted(ssc)
    }
  )

  val sqlContextArg: ArgDef[SQLContext] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(SQLContext.getOrCreate).extract(ctx)
  )

  // HiveContext should be cached per jvm
  // see #325
  val hiveContextArg: ArgDef[HiveContext] = new SystemArg[HiveContext] {

    var cache: HiveContext = _

    override def extract(ctx: FnContext): Extraction[HiveContext] = synchronized {
      ctx match {
        case c: FullFnContext =>
          if (cache == null)
            cache = new HiveContext(c.sc)
          Extracted(cache)
        case _ =>
          Failed.InternalError(s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
            s"expected ${FullFnContext.getClass.getSimpleName}")
      }
    }

    override def describe(): Seq[ArgInfo] = Seq(InternalArgument(
      Seq(ArgInfo.HiveContextTag, ArgInfo.SqlContextTag)))
  }

  val javaSparkContextArg: ArgDef[JavaSparkContext] = sparkContextArg.map(sc => new JavaSparkContext(sc))

  val javaStreamingContextArg: ArgDef[JavaStreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => streamingContextArg.map(scc => new JavaStreamingContext(scc)).extract(ctx))

  val sparkSessionArg: ArgDef[SparkSession] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, false)).extract(ctx)
  )

  val sparkSessionWithHiveArg: ArgDef[SparkSession] = SystemArg(
    Seq(ArgInfo.SqlContextTag, ArgInfo.HiveContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, true)).extract(ctx))
}

object SparkArgs extends SparkArgs
Example 4
Source File: SparkSQLOperationManager.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 5
Source File: SparkSQLSessionManager.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.concurrent.Executors

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hive.service.cli.SessionHandle
import org.apache.hive.service.cli.session.SessionManager
import org.apache.hive.service.cli.thrift.TProtocolVersion
import org.apache.hive.service.server.HiveServer2
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager

private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends SessionManager(hiveServer)
  with ReflectedCompositeService {

  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext)

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
    // A blocking queue holds the tasks waiting to be executed.
    // LinkedBlockingQueue is a linked-list-backed blocking queue that orders elements FIFO
    // (first in, first out) and usually has higher throughput than ArrayBlockingQueue.
    // Executors.newFixedThreadPool() uses this queue internally.
    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
    getAncestorField[Log](this, 3, "LOG").info(
      s"HiveServer2: Async execution pool size $backgroundPoolSize")

    setSuperField(this, "operationManager", sparkSqlOperationManager)
    addService(sparkSqlOperationManager)
    initCompositeService(hiveConf)
  }

  override def openSession(
      protocol: TProtocolVersion,
      username: String,
      passwd: String,
      ipAddress: String,
      sessionConf: java.util.Map[String, String],
      withImpersonation: Boolean,
      delegationToken: String): SessionHandle = {
    hiveContext.openSession()
    val sessionHandle = super.openSession(
      protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken)
    val session = super.getSession(sessionHandle)
    HiveThriftServer2.listener.onSessionCreated(
      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
    sessionHandle
  }

  override def closeSession(sessionHandle: SessionHandle) {
    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
    super.closeSession(sessionHandle)
    sparkSqlOperationManager.sessionToActivePool -= sessionHandle

    hiveContext.detachSession()
  }
}
Example 6
Source File: SparkSQLDriver.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, List => JList}

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 7
Source File: SparkSQLCLIService.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.Utils
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.server.HiveServer2
import org.apache.hive.service.{AbstractService, Service, ServiceException}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._

import scala.collection.JavaConversions._

private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends CLIService(hiveServer)
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)
    var sparkServiceUGI: UserGroupInformation = null

    if (UserGroupInformation.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = Utils.getUGI()
        setSuperField(this, "serviceUGI", sparkServiceUGI)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
Example 8
Source File: Main.scala From spark1.52 with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object Main {
  def main(args: Array[String]) {
    // scalastyle:off println
    println("Running regression test for SPARK-8489.")
    val sc = new SparkContext("local", "testing")
    val hc = new HiveContext(sc)
    // This line should not throw scala.reflect.internal.MissingRequirementError.
    // See SPARK-8470 for more detail.
    val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3")))
    df.collect()
    println("Regression test for SPARK-8489 success!")
    // scalastyle:on println
    sc.stop()
  }
}
Example 9
Source File: HiveApp.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package main.scala

import scala.collection.mutable.{ListBuffer, Queue}

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

case class Person(name: String, age: Int)

object SparkSqlExample {

  def main(args: Array[String]) {
    // The difference between System.getenv() and System.getProperties():
    // System.getenv() returns OS environment variables, which can be set, for example, in the
    // ".bashrc" file in the current user's home directory.
    // System.getProperties() returns JVM properties, which are passed on the command line
    // with the "-D" option.
    val conf = sys.env.get("SPARK_AUDIT_MASTER") match {
      case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master)
      case None => new SparkConf().setAppName("Simple Sql App")
    }
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    import hiveContext._
    sql("DROP TABLE IF EXISTS src")
    sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src")
    val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect()
    results.foreach(println)

    def test(f: => Boolean, failureMsg: String) = {
      if (!f) {
        println(failureMsg)
        System.exit(-1)
      }
    }

    test(results.size == 5, "Unexpected number of selected elements: " + results)
    println("Test succeeded")
    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: HiveReader.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils.io

import it.agilelab.bigdata.DataQuality.sources.HiveTableConfig
import it.agilelab.bigdata.DataQuality.utils.Logging
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext

import scala.util.Try

  def loadHiveTable(inputConf: HiveTableConfig)(
      implicit hiveContext: HiveContext): Seq[DataFrame] = {
    // You can specify a template for queries here. Currently it's just an input query as it is
    val full_query = inputConf.query

    Try {
      Seq(hiveContext.sql(full_query))
    }.getOrElse({
      log.warn("Failed to load HIVE table")
      Seq.empty
    })
  }
}
Example 11
Source File: DQMainClass.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils

import java.util.Locale

import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext

trait DQMainClass { this: DQSparkContext with Logging =>

  private def initLogger(): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.OFF)
    Logger.getLogger("io.netty").setLevel(Level.OFF)
    Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF)
    Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF)
  }

  private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = {
    if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration)
    else {
      if (settings.s3Bucket.isDefined) {
        sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get)
        sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      }

      FileSystem.get(sc.hadoopConfiguration)
    }
  }

  protected def body()(implicit fs: FileSystem,
                       sparkContext: SparkContext,
                       sqlContext: SQLContext,
                       sqlWriter: HistoryDBManager,
                       settings: DQSettings): Boolean

  def preMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s" Starting execution of task $task")
    log.warn("************************************************************************")
  }

  def postMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s" Finishing execution of task $task")
    log.warn("************************************************************************")
  }

  def main(args: Array[String]): Unit = {
    // set to avoid casting problems in metric result name generation
    Locale.setDefault(Locale.ENGLISH)
    initLogger()

    DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("", "")) match {
      case Some(commandLineOptions) =>
        // Load our own config values from the default location, application.conf
        val settings = new DQSettings(commandLineOptions)
        val sparkContext = makeSparkContext(settings)
        val fs = makeFileSystem(settings, sparkContext)

        settings.logThis()(log)

        val sqlContext: SQLContext =
          if (settings.hiveDir.isDefined) {
            val hc = new HiveContext(sparkContext)
            hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get)
            hc
          } else makeSqlContext(sparkContext)

        val historyDatabase = new HistoryDBManager(settings)

        // Starting application body
        preMessage(s"{${settings.appName}}")
        val startTime = System.currentTimeMillis()
        body()(fs, sparkContext, sqlContext, historyDatabase, settings)
        postMessage(s"{${settings.appName}}")

        log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)")
        log.info("Closing application...")

        historyDatabase.closeConnection()
        sparkContext.stop()

        log.info("Spark context were terminated. Exiting...")

      case None =>
        log.error("Wrong parameters provided")
        throw new Exception("Wrong parameters provided")
    }
  }
}
Example 12
Source File: HiveManageTable.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.StructType

trait HiveManageTable {
  def toRow(partition: String): Row
}

object HiveManageTable {
  def saveRDDToHive[T <: HiveManageTable](hiveContext: HiveContext,
                                          data: RDD[T],
                                          table: String,
                                          schema: StructType,
                                          mode: SaveMode,
                                          partition: String,
                                          partitionValue: String,
                                          hiveConfig: Map[String, String] = dynamicPartitions): Unit = {
    hiveConfig.foreach {
      case (key, value) =>
        hiveContext.setConf(key, value)
    }

    hiveContext.createDataFrame(data.map(_.toRow(partitionValue)), schema)
      .write
      .mode(mode)
      .partitionBy(partition)
      .insertInto(table)
  }

  lazy val dynamicPartitions = Map(
    "hive.exec.dynamic.partition" -> "true",
    "hive.exec.dynamic.partition.mode" -> "nonstrict"
  )
}
Example 13
Source File: BaseBinaryRegressionTrainer.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.strategy.trainer

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext

import com.airbnb.common.ml.strategy.config.TrainingOptions
import com.airbnb.common.ml.strategy.data.{BaseBinarySample, TrainingData}
import com.airbnb.common.ml.strategy.params.StrategyParams

case class BaseBinaryRegressionTrainer(
    strategyParams: StrategyParams[BaseBinarySample],
    trainingDataType: TrainingData[BaseBinarySample]
) extends BinaryRegressionTrainer[BaseBinarySample] {

  override def getLearningRate(
      r0: Double,
      r1: Double,
      example: BaseBinarySample,
      options: TrainingOptions
  ): Double = {
    val x = example.x
    val learningRate = if (example.label) {
      r1 * x
    } else {
      1 - x
    }
    r0 * learningRate
  }

  override def createDataFrameFromModelOutput(
      models: RDD[(String, StrategyParams[BaseBinarySample])],
      hc: HiveContext
  ): DataFrame = {
    ???
  }
}
Example 14
Source File: ModelOutput.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.strategy.data

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SaveMode}

import com.airbnb.common.ml.strategy.config.TrainingOptions
import com.airbnb.common.ml.strategy.eval.BinaryMetrics
import com.airbnb.common.ml.strategy.params.StrategyParams
import com.airbnb.common.ml.util.HiveManageTable

case class ModelOutput[T](
    id: String,
    params: StrategyParams[T],
    loss: Double,
    evalMetrics: BinaryMetrics,
    holdoutMetrics: BinaryMetrics,
    options: TrainingOptions
) extends HiveManageTable {

  override def toRow(partition: String): Row = {
    Row(
      id.toLong,
      holdoutMetrics.posCount,
      holdoutMetrics.negCount,
      holdoutMetrics.posSugHigher,
      holdoutMetrics.posSugLower,
      holdoutMetrics.negSugHigher,
      holdoutMetrics.negSugLower,
      holdoutMetrics.increasePrecision,
      holdoutMetrics.increaseRecall,
      holdoutMetrics.decreasePrecision,
      holdoutMetrics.decreaseRecall,
      holdoutMetrics.trueRegret,
      holdoutMetrics.trueRegretMedian,
      holdoutMetrics.trueRegret75Percentile,
      holdoutMetrics.falseRegret,
      holdoutMetrics.trueIncreaseMagnitude,
      holdoutMetrics.trueDecreaseMagnitude,
      holdoutMetrics.falseDecreaseMagnitude,
      holdoutMetrics.falseIncreaseMagnitude,
      params.params,
      loss,
      options.toPartialArray,
      partition
    )
  }
}

object ModelOutput {
  lazy val schema = StructType(
    Seq(
      StructField("id", LongType),
      StructField("posCount", IntegerType),
      StructField("negCount", IntegerType),
      StructField("posSugHigher", IntegerType),
      StructField("posSugLower", IntegerType),
      StructField("negSugHigher", IntegerType),
      StructField("negSugLower", IntegerType),
      StructField("increasePrecision", DoubleType),
      StructField("increaseRecall", DoubleType),
      StructField("decreasePrecision", DoubleType),
      StructField("decreaseRecall", DoubleType),
      StructField("trueRegret", DoubleType),
      StructField("trueRegretMedian", DoubleType),
      StructField("trueRegret75Percentile", DoubleType),
      StructField("falseRegret", DoubleType),
      StructField("trueIncreaseMagnitude", DoubleType),
      StructField("trueDecreaseMagnitude", DoubleType),
      StructField("falseDecreaseMagnitude", DoubleType),
      StructField("falseIncreaseMagnitude", DoubleType),
      StructField("params", ArrayType(DoubleType)),
      StructField("loss", DoubleType),
      StructField("options", ArrayType(DoubleType)),
      StructField("model", StringType)
    )
  )

  def save[T](
      hiveContext: HiveContext,
      data: RDD[ModelOutput[T]],
      table: String,
      partition: String
  ): Unit = {
    HiveManageTable.saveRDDToHive(
      hiveContext,
      data,
      table,
      ModelOutput.schema,
      SaveMode.Overwrite,
      "model",
      partition)
  }
}
Example 15
Source File: ModelData.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.xgboost.data

import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}

import com.airbnb.common.ml.util.HiveUtil

object ModelData {

  def parseRowToLabelAndVaue(row: Row, labelPos: Int, dropCount: Int): (Float, Array[Float]) = {
    // for row doesn't contains label, pass in negative labelPos
    val label: Number = if (labelPos >= 0) {
      row.getAs[Number](labelPos)
    } else {
      0
    }

    val seq = row.toSeq.drop(dropCount).map(x => {
      if (x != null) {
        x.asInstanceOf[Number].floatValue()
      } else {
        ModelData.NULL_VALUE
      }
    }).toArray
    (label.floatValue(), seq)
  }

  def parseRowToRawXgboostLabeledPoint(row: Row, labelPos: Int, dropCount: Int): LabeledPoint = {
    val (label, seq) = parseRowToLabelAndVaue(row, labelPos, dropCount)
    LabeledPoint.fromDenseVector(label, seq)
  }

  def getLabeledPoints(
      sc: SparkContext,
      query: String,
      trainingLabeledPoint: TrainingModelData
  ): RDD[(String, Seq[LabeledPoint])] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      trainingLabeledPoint.parseRowToXgboostLabeledPoint)
  }

  def getScoringLabeledPoints(sc: SparkContext,
                              query: String,
                              scoringLabeledPoint: ScoringModelData): RDD[(String, ScoringLabeledPoint)] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrame(
      df,
      // score_query_head of scoring.conf also defined S_node_10k_id same as TRAINING_KEY_INDEX
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      scoringLabeledPoint.parseRowToXgboostLabeledPointAndData)
  }

  def getLabeledPointsAndString(sc: SparkContext,
                                query: String,
                                scoringLabeledPoint: ScoringModelData): RDD[(String, Seq[ScoringLabeledPoint])] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      // score_query_head of scoring.conf also defined S_node_10k_id same as TRAINING_KEY_INDEX
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      scoringLabeledPoint.parseRowToXgboostLabeledPointAndData)
  }

  // parseRowToLabelAndVaue can't return null, so use -1 if input is null
  // this is same with train_with_prob.conf 's hql query.
  val NULL_VALUE: Int = -1
  // refer to query in xgboost/search.conf
  val TRAINING_KEY_INDEX: Int = 1
  // refer to pricing.grid_search
  val PARAMS_KEY_INDEX: Int = 0
  val PARAMS_INDEX: Int = 1

  // default use second field as node key
  def parseKeyFromHiveRow(keyIndex: Int)(row: Row): String = {
    row.getAs[Long](keyIndex).toString
  }

  def getParams(sc: SparkContext, query: String): RDD[(String, Seq[Array[Double]])] = {
    val df = getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      ModelData.parseKeyFromHiveRow(0),
      parseRowToParams)
  }

  def parseRowToParams(row: Row): Array[Double] = {
    row.getAs[scala.collection.mutable.WrappedArray[Double]](ModelData.PARAMS_INDEX).toArray
  }

  def getDataFrame(sc: SparkContext, query: String): DataFrame = {
    val hc = new HiveContext(sc)
    hc.sql(query)
  }
}
Example 16
Source File: XGBoostScoringPipeline.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.xgboost

import com.typesafe.config.Config
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

import com.airbnb.common.ml.util.{PipelineUtil, ScalaLogging}
import com.airbnb.common.ml.xgboost.config.XGBoostScoringConfig
import com.airbnb.common.ml.xgboost.data.{ModelData, ScoringLabeledPoint, ScoringModelData}

object XGBoostScoringPipeline extends ScalaLogging {

  def loadModels(path: String): Map[String, ml.dmlc.xgboost4j.scala.Booster] = {
    val streams = PipelineUtil.getHDFSInputStreams(path)
    streams.map {
      case (id, stream) => {
        (id, XGBoost.loadModel(stream))
      }
    }.toMap
  }

  def scorePartition(
      path: String,
      groupNumber: Int
  )(iter: Iterator[(String, ScoringLabeledPoint)]): Iterator[String] = {
    iter.grouped(groupNumber).flatMap { seq =>
      seq.groupBy(_._1).flatMap {
        case (id, scoringData) => {
          val data = scoringData.map(x => x._2.labeledPoint)
          val model_path = path + id
          val prediction = scoreWithPath(model_path, data)
          prediction.zip(scoringData.map(x => x._2.data)).map {
            case (score, index) => {
              s"$index\t${score(0)}"
            }
          }
        }
      }.iterator
    }
  }

  def baseScore(
      modelData: ScoringModelData,
      sc: SparkContext,
      config: Config
  ): Unit = {
    // read training data, available at xgboost/demo/data
    val conf = XGBoostScoringConfig.loadConfig(sc, config)
    val data = ModelData.getScoringLabeledPoints(
      sc, conf.query, modelData)

    val output: RDD[String] = data.mapPartitions(scorePartition(conf.modelBasePath, conf.groupNumber))

    if (conf.saveHiveTable) {
      val hc = new HiveContext(sc)
      PipelineUtil.saveToHdfsAndUpdateHive(
        hc, conf.outputPath, conf.outputTable, conf.partitionSpec, output, conf.overwrite)
    } else {
      PipelineUtil.saveAndCommitAsTextFile(output, conf.outputPath, conf.overwrite)
    }
  }

  def scoreWithPath(
      path: String,
      data: Seq[LabeledPoint]
  ): Array[Array[Float]] = {
    logger.info(s"model_path: $path")
    val scoring = new DMatrix(data.iterator, null)
    val inputStream = PipelineUtil.getHDFSInputStream(path)
    val model = XGBoost.loadModel(inputStream)
    // output array[[score],[score],...]
    val output = model.predict(scoring)
    scoring.delete()
    model.dispose
    assert(data.length == output.length)
    output
  }
}
Example 17
Source File: HiveUtil.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training.pipeline

import org.apache.spark.sql.hive.HiveContext

object HiveUtil {
  def updateHivePartition(hc: HiveContext,
                          hiveTable: String,
                          partitionSpec: String,
                          hdfsLocation: String): Boolean = {
    if (!hiveTable.contains('.')) {
      throw new RuntimeException(s"Missing namespace for the hive table $hiveTable.")
    }
    val Array(namespace, table) = hiveTable.split('.')
    hc.sql(s"USE $namespace")
    hc.sql(s"ALTER TABLE $table DROP IF EXISTS PARTITION ($partitionSpec)")
    hc.sql(s"ALTER TABLE $table ADD PARTITION ($partitionSpec) location '$hdfsLocation'")
    true
  }
}
Example 18
Source File: UISeleniumSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import scala.util.Random

import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.openqa.selenium.WebDriver
import org.openqa.selenium.htmlunit.HtmlUnitDriver
import org.scalatest.concurrent.Eventually._
import org.scalatest.selenium.WebBrowser
import org.scalatest.time.SpanSugar._
import org.scalatest.{BeforeAndAfterAll, Matchers}

import org.apache.spark.sql.hive.HiveContext

class UISeleniumSuite
  extends HiveThriftJdbcTest
  with WebBrowser with Matchers with BeforeAndAfterAll {

  implicit var webDriver: WebDriver = _
  var server: HiveThriftServer2 = _
  var hc: HiveContext = _
  val uiPort = 20000 + Random.nextInt(10000)

  override def mode: ServerMode.Value = ServerMode.binary

  override def beforeAll(): Unit = {
    webDriver = new HtmlUnitDriver
    super.beforeAll()
  }

  override def afterAll(): Unit = {
    if (webDriver != null) {
      webDriver.quit()
    }
    super.afterAll()
  }

  override protected def serverStartCommand(port: Int) = {
    val portConf = if (mode == ServerMode.binary) {
      ConfVars.HIVE_SERVER2_THRIFT_PORT
    } else {
      ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT
    }

    s"""$startScript
       |  --master local
       |  --hiveconf hive.root.logger=INFO,console
       |  --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri
       |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
       |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost
       |  --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=$mode
       |  --hiveconf $portConf=$port
       |  --driver-class-path ${sys.props("java.class.path")}
       |  --conf spark.ui.enabled=true
       |  --conf spark.ui.port=$uiPort
     """.stripMargin.split("\\s+").toSeq
  }

  ignore("thrift server ui test") {
    withJdbcStatement { statement =>
      val baseURL = s"http://localhost:$uiPort"

      val queries = Seq(
        "CREATE TABLE test_map(key INT, value STRING)",
        s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map")

      queries.foreach(statement.execute)

      eventually(timeout(10 seconds), interval(50 milliseconds)) {
        go to baseURL
        find(cssSelector("""ul li a[href*="sql"]""")) should not be None
      }

      eventually(timeout(10 seconds), interval(50 milliseconds)) {
        go to (baseURL + "/sql")
        find(id("sessionstat")) should not be None
        find(id("sqlstat")) should not be None

        // check whether statements exists
        queries.foreach { line =>
          findAll(cssSelector("""ul table tbody tr td""")).map(_.text).toList should contain (line)
        }
      }
    }
  }
}
Example 19
Source File: MistScContext.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.{Duration => SDuration}
import org.apache.spark.{SparkConf, SparkContext, SparkSessionUtils}

import scala.collection.mutable
import scala.concurrent.duration.Duration

class MistScContext(
  val sc: SparkContext,
  val namespace: String,
  val streamingDuration: SDuration = SDuration(40 * 1000)
) {

  private val jars = mutable.Buffer.empty[String]

  def isK8S: Boolean = sc.getConf.get("spark.master").startsWith("k8s://")

  def addJar(artifact: SparkArtifact): Unit = synchronized {
    val path = if (isK8S) artifact.url else artifact.local.getAbsolutePath
    if (!jars.contains(path)) {
      sc.addJar(path)
      jars += path
    }
  }

  def getUIAddress(): Option[String] = SparkUtils.getSparkUiAddress(sc)

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sc.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sc)

  // python support
  def sqlContext: SQLContext = new SQLContext(sc)

  // python support
  def hiveContext: HiveContext = new HiveContext(sc)

  def sparkSession(enableHive: Boolean): SparkSession = SparkSessionUtils.getOrCreate(sc, enableHive)

  def stop(): Unit = {
    sc.stop()
  }
}

object MistScContext {

  def apply(id: String, streamingDuration: Duration, sparkConf: SparkConf): MistScContext = {
    val upd = sparkConf.clone()
      .setAppName(id)
      .set("spark.streaming.stopSparkContextByDefault", "false")

    val duration = SDuration(streamingDuration.toMillis)
    val sc = new SparkContext(upd)
    new MistScContext(sc, id, duration)
  }

  def apply(id: String, streamingDuration: Duration): MistScContext =
    apply(id, streamingDuration, new SparkConf())
}
Example 20
Source File: HiveApp.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package main.scala

import scala.collection.mutable.{ListBuffer, Queue}

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

case class Person(name: String, age: Int)

object SparkSqlExample {

  def main(args: Array[String]) {
    val conf = sys.env.get("SPARK_AUDIT_MASTER") match {
      case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master)
      case None => new SparkConf().setAppName("Simple Sql App")
    }
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    import hiveContext._
    sql("DROP TABLE IF EXISTS src")
    sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src")
    val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect()
    results.foreach(println)

    def test(f: => Boolean, failureMsg: String) = {
      if (!f) {
        println(failureMsg)
        System.exit(-1)
      }
    }

    test(results.size == 5, "Unexpected number of selected elements: " + results)
    println("Test succeeded")
    sc.stop()
  }
}
// scalastyle:on println
Example 21
Source File: CreateViewAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}

// TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different
// from Hive and may not work for some cases like create view on self join.
private[hive] case class CreateViewAsSelect(
    tableDesc: HiveTable,
    childSchema: Seq[Attribute],
    allowExisting: Boolean,
    orReplace: Boolean) extends RunnableCommand {

  assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length)
  assert(tableDesc.viewText.isDefined)

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]

    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // view already exists, will do nothing, to keep consistent with Hive
      } else if (orReplace) {
        hiveContext.catalog.client.alertView(prepareTable())
      } else {
        throw new AnalysisException(s"View $tableIdentifier already exists. " +
          "If you want to update the view definition, please use ALTER VIEW AS or " +
          "CREATE OR REPLACE VIEW AS")
      }
    } else {
      hiveContext.catalog.client.createView(prepareTable())
    }

    Seq.empty[Row]
  }

  private def prepareTable(): HiveTable = {
    // setup column types according to the schema of child.
    val schema = if (tableDesc.schema == Nil) {
      childSchema.map { attr =>
        HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null)
      }
    } else {
      childSchema.zip(tableDesc.schema).map { case (attr, col) =>
        HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment)
      }
    }

    val columnNames = childSchema.map(f => verbose(f.name))

    // When user specified column names for view, we should create a project to do the renaming.
    // When no column name specified, we still need to create a project to declare the columns
    // we need, to make us more robust to top level `*`s.
    val projectList = if (tableDesc.schema == Nil) {
      columnNames.mkString(", ")
    } else {
      columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map {
        case (name, alias) => s"$name AS $alias"
      }.mkString(", ")
    }

    val viewName = verbose(tableDesc.name)

    val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName"

    tableDesc.copy(schema = schema, viewText = Some(expandedText))
  }

  // escape backtick with double-backtick in column name and wrap it with backtick.
  private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`"
}
Example 22
Source File: CreateTableAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes, MetastoreRelation}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}

private[hive] case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean) extends RunnableCommand {

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def children: Seq[LogicalPlan] = Seq(query)

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat = tableDesc.copy(
        inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat = tableDesc.outputFormat
          .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
      } else {
        withFormat
      }

      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(tableIdentifier, None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, TableName: ${tableDesc.name}, InsertIntoHiveTable]"
  }
}
Example 23
Source File: Main.scala From BigDatalog with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object Main {
  def main(args: Array[String]) {
    // scalastyle:off println
    println("Running regression test for SPARK-8489.")
    val sc = new SparkContext("local", "testing")
    val hc = new HiveContext(sc)
    // This line should not throw scala.reflect.internal.MissingRequirementError.
    // See SPARK-8470 for more detail.
    val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3")))
    df.collect()
    println("Regression test for SPARK-8489 success!")
    // scalastyle:on println
    sc.stop()
  }
}
Example 24
Source File: SparkSQLCLIService.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import scala.collection.JavaConverters._

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.Utils
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.server.HiveServer2
import org.apache.hive.service.{AbstractService, Service, ServiceException}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._

private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends CLIService(hiveServer)
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)
    var sparkServiceUGI: UserGroupInformation = null

    if (UserGroupInformation.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = Utils.getUGI()
        setSuperField(this, "serviceUGI", sparkServiceUGI)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.asScala.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
Example 25
Source File: SparkSQLDriver.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}

import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 26
Source File: SparkSQLSessionManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.concurrent.Executors

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hive.service.cli.SessionHandle
import org.apache.hive.service.cli.session.SessionManager
import org.apache.hive.service.cli.thrift.TProtocolVersion
import org.apache.hive.service.server.HiveServer2
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager

private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends SessionManager(hiveServer)
  with ReflectedCompositeService {

  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager()

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    // Create operation log root directory, if operation logging is enabled
    if (hiveConf.getBoolVar(ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED)) {
      invoke(classOf[SessionManager], this, "initOperationLogRootDir")
    }

    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
    getAncestorField[Log](this, 3, "LOG").info(
      s"HiveServer2: Async execution pool size $backgroundPoolSize")

    setSuperField(this, "operationManager", sparkSqlOperationManager)
    addService(sparkSqlOperationManager)
    initCompositeService(hiveConf)
  }

  override def openSession(
      protocol: TProtocolVersion,
      username: String,
      passwd: String,
      ipAddress: String,
      sessionConf: java.util.Map[String, String],
      withImpersonation: Boolean,
      delegationToken: String): SessionHandle = {
    val sessionHandle = super.openSession(
      protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken)
    val session = super.getSession(sessionHandle)
    HiveThriftServer2.listener.onSessionCreated(
      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
    val ctx = if (hiveContext.hiveThriftServerSingleSession) {
      hiveContext
    } else {
      hiveContext.newSession()
    }
    ctx.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion)
    sparkSqlOperationManager.sessionToContexts += sessionHandle -> ctx
    sessionHandle
  }

  override def closeSession(sessionHandle: SessionHandle) {
    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
    super.closeSession(sessionHandle)
    sparkSqlOperationManager.sessionToActivePool -= sessionHandle
    sparkSqlOperationManager.sessionToContexts.remove(sessionHandle)
  }
}
Example 27
Source File: SparkSQLOperationManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager()
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()
  val sessionToContexts = Map[SessionHandle, HiveContext]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val hiveContext = sessionToContexts(parentSession.getSessionHandle)
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 28
Source File: SparkSQLEnv.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 29
Source File: ScalaSparkSQLBench.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object ScalaSparkSQLBench {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(
        s"Usage: $ScalaSparkSQLBench <workload name> <SQL script file>"
      )
      System.exit(1)
    }
    val workload_name = args(0)
    val sql_file = args(1)
    val sparkConf = new SparkConf().setAppName(workload_name)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(sparkConf)
    val hc = new HiveContext(sc)

    val _sql = scala.io.Source.fromFile(sql_file).mkString
    _sql.split(';').foreach { x =>
      if (x.trim.nonEmpty)
        hc.sql(x)
    }

    sc.stop()
  }
}
Example 30
Source File: KuduAccountMartSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")

    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")
    values.foreach(println)
    println("------------")

    sc.stop()
  }
}
Example 31
Source File: KuduToNestedHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToNestedHDFS {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<kuduTaxiTripTableName> " +
        "<hdfsTaxiNestedTableName> ")
      return
    }
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduTaxiTripTableName = args(2)
    val hdfsTaxiNestedTableName = args(3)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduTaxiTripTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    val kuduDataDf = hiveContext.sql("select * from ny_taxi_trip_tmp")

    val newNestedDf = kuduDataDf.map(r => {
      val pojo = NyTaxiYellowTripBuilder.build(r)
      (pojo.vender_id, pojo)
    }).groupByKey().map(grp => {
      Row(grp._1, grp._2.map(p => {
        Row(p.passenger_count, p.payment_type, p.total_amount, p.fare_amount)
      }))
    })

    hiveContext.sql("create table " + hdfsTaxiNestedTableName + "( " +
      " vender_id string," +
      " trip array<struct< " +
      " passenger_count: INT," +
      " payment_type: STRING, " +
      " total_amount: DOUBLE, " +
      " fare_amount: DOUBLE " +
      " >>" +
      " ) stored as parquet")

    val emptyDf = hiveContext.sql("select * from " + hdfsTaxiNestedTableName + " limit 0")

    hiveContext.createDataFrame(newNestedDf, emptyDf.schema).registerTempTable("tmpNested")

    hiveContext.sql("insert into " + hdfsTaxiNestedTableName + " select * from tmpNested")

    sc.stop()
  }
}
Example 32
Source File: KuduToHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiTripTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiTripTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("kuduTaxiTripTableName") hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " + " STORED AS PARQUET " + " AS SELECT * FROM kuduTaxiTripTableName") sc.stop() } }
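In HiveQL, the storage clause of a CREATE TABLE ... AS SELECT statement goes before the AS SELECT part. A minimal sketch of that pattern, assuming a HiveContext named hiveContext and placeholder table names:

// CTAS into a Parquet-backed Hive table; both table names are placeholders.
hiveContext.sql("CREATE TABLE trips_parquet STORED AS PARQUET AS SELECT * FROM kudu_trips_tmp")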
Example 33
Source File: KuduAppEventSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAppEventSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAppEventTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAppEventTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAppEventTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("app_event_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 34
Source File: DateDataTypeDirectDictionaryWithNoDictTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.io.File import java.sql.Date import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class DateDataTypeDirectDictionaryWithNoDictTestCase extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { sql("drop table if exists directDictionaryTable") CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "true") sql( """ CREATE TABLE IF NOT EXISTS directDictionaryTable (empno String, doj Date, salary Int) STORED AS carbondata """ ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd") val csvFilePath = s"$resourcesPath/datasample.csv" println(csvFilePath) sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE directDictionaryTable OPTIONS" + "('DELIMITER'= ',', 'QUOTECHAR'= '\"')"); } catch { case x: Throwable => x.printStackTrace() CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) } } test("select doj from directDictionaryTable") { sql("select doj from directDictionaryTable") checkAnswer( sql("select doj from directDictionaryTable"), Seq(Row(Date.valueOf("2016-03-14")), Row(Date.valueOf("2016-04-14")), Row(null) ) ) } test("select doj from directDictionaryTable with equals filter") { sql("select doj from directDictionaryTable where doj='2016-03-14 15:00:09'") checkAnswer( sql("select doj from directDictionaryTable where doj='2016-03-14'"), Seq(Row(Date.valueOf("2016-03-14"))) ) } test("select doj from directDictionaryTable with greater than filter") { sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'") checkAnswer( sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'"), Seq(Row(Date.valueOf("2016-04-14"))) ) } override def afterAll { sql("drop table directDictionaryTable") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 35
Source File: MakingNestedTableTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.sql import org.apache.spark.sql.Row import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.hive.HiveContext import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class MakingNestedTableTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { @transient var sc: SparkContext = null @transient var hiveContext: HiveContext = null override def beforeAll(): Unit = { val envMap = Map[String, String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) hiveContext = new HiveContext(sc) } override def afterAll(): Unit = { sc.stop() } test("Test table creation and summing of counts") { val loanRDD = sc.parallelize(Seq(Row("100", "100000000"), Row("101", "100000000"), Row("102", "100000000"))) val partiesRDD = sc.parallelize(Seq(Row("100", "ted", "42"), Row("101", "bob", "42"), Row("101", "cat", "42"), Row("102", "Jen", "42"), Row("102", "Jenny", "42"), Row("102", "Ed", "42"))) //loan hiveContext.sql("create table loan (id string, amount string) stored as parquet") val emptyLoanDF = hiveContext.sql("select * from loan limit 0") val loanDF = hiveContext.createDataFrame(loanRDD, emptyLoanDF.schema) loanDF.registerTempTable("loanTmp") hiveContext.sql("insert into loan select * from loanTmp") //parties hiveContext.sql("create table party (loan_id string, name string, age string) stored as parquet") val emptyPartyDF = hiveContext.sql("select * from party limit 0") val partyDF = hiveContext.createDataFrame(partiesRDD, emptyPartyDF.schema) partyDF.registerTempTable("partyTmp") hiveContext.sql("insert into party select * from partyTmp") val keyValueParty = hiveContext.sql("select * from party").map(r => { //Key Value (r.getString(r.fieldIndex("loan_id")), Seq(r)) }).reduceByKey((a, b) => { a ++ b }) val keyValueLoan = hiveContext.sql("select * from loan").map(r => { //Key Value (r.getString(r.fieldIndex("id")), r.getString(r.fieldIndex("amount"))) }) val nestedRDD = keyValueLoan.join(keyValueParty).map(r => { val loanId = r._1 val loanAmount = r._2._1 val seqOfParties = r._2._2.map(r => { Row(r.getString(r.fieldIndex("name")), r.getString(r.fieldIndex("age"))) }) Row(loanId, loanAmount, seqOfParties) }) hiveContext.sql("create table nested (" + "loan_id string, " + "amount string, " + "party array<struct<" + " name: string," + " age: string>>" + ") stored as parquet") val emptyNestedDF = hiveContext.sql("select * from nested limit 0") val nestedDF = hiveContext.createDataFrame(nestedRDD, emptyNestedDF.schema) nestedDF.registerTempTable("nestedTmp") hiveContext.sql("insert into nested select * from nestedTmp") } }
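Once the nested table is populated, the array of party structs can be flattened back out with a lateral view. A minimal sketch, assuming the nested table created in the test above exists in the current metastore:

// Explode the party array so each loan/party pair becomes one row.
hiveContext.sql(
  "select loan_id, amount, p.name, p.age " +
  "from nested lateral view explode(party) parties as p").show()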
Example 36
Source File: KuduAccountMartSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAccountMartSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAccountMartTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAccountMartTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAccountMartTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("account_mart_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 37
Source File: KuduToNestedHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToNestedHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiNestedTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiNestedTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") val kuduDataDf = hiveContext.sql("select * from ny_taxi_trip_tmp") val newNestedDf = kuduDataDf.map(r => { val pojo = NyTaxiYellowTripBuilder.build(r) (pojo.vender_id, pojo) }).groupByKey().map(grp => { Row(grp._1, grp._2.map(p => { Row(p.passenger_count, p.payment_type, p.total_amount, p.fare_amount) })) }) hiveContext.sql("create table " + hdfsTaxiNestedTableName + "( " + " vender_id string," + " trip array<struct< " + " passenger_count: INT," + " payment_type: STRING, " + " total_amount: DOUBLE, " + " fare_amount: DOUBLE " + " >>" + " ) stored as parquet") val emptyDf = hiveContext.sql("select * from " + hdfsTaxiNestedTableName + " limit 0") hiveContext.createDataFrame(newNestedDf, emptyDf.schema).registerTempTable("tmpNested") hiveContext.sql("insert into " + hdfsTaxiNestedTableName + " select * from tmpNested") sc.stop() } }
Example 38
Source File: KuduToHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiTripTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiTripTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("kuduTaxiTripTableName") hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " + " STORED AS PARQUET " + " AS SELECT * FROM kuduTaxiTripTableName") sc.stop() } }
Example 39
Source File: KuduAppEventSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAppEventSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAppEventTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAppEventTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAppEventTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("app_event_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 40
Source File: WithSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import java.util.Locale import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveContext import org.scalatest.{BeforeAndAfterEach, Suite} trait WithSQLContext extends BeforeAndAfterEach { self: Suite with WithSparkContext => override def beforeEach(): Unit = { try { super.beforeEach() setUpSQLContext() } catch { case ex: Throwable => tearDownSQLContext() throw ex } } override def afterEach(): Unit = { try { super.afterEach() } finally { tearDownSQLContext() } } implicit def sqlContext: SQLContext = _sqlContext def sqlc: SQLContext = sqlContext var _sqlContext: SQLContext = _ protected def setUpSQLContext(): Unit = _sqlContext = SQLContext.getOrCreate(sc).newSession() protected def tearDownSQLContext(): Unit = _sqlContext = null protected def tableName(name: String): String = sqlc match { case _: HiveContext => name.toLowerCase(Locale.ENGLISH) case _ => name } }
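A concrete suite would mix this trait in together with the WithSparkContext trait named in its self-type. A minimal hypothetical sketch; WithSparkContext is assumed to provide the SparkContext sc and is not shown in this excerpt:

import org.apache.spark.sql.hive.HiveContext
import org.scalatest.FunSuite

// Hypothetical test built on the WithSQLContext helper above.
class TableNameSuite extends FunSuite with WithSparkContext with WithSQLContext {
  test("tableName lower-cases names only when a HiveContext is in use") {
    val expected = if (sqlc.isInstanceOf[HiveContext]) "mytable" else "MyTable"
    assert(tableName("MyTable") === expected)
  }
}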
Example 41
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 42
Source File: SapThriftServer.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.sap.thriftserver.SapSQLEnv import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2._ import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab import org.apache.hive.service.server.HiveServerServerOptionsProcessor object SapThriftServer extends Logging { var LOG = LogFactory.getLog(classOf[SapThriftServer]) def main(args: Array[String]) { val optionsProcessor = new HiveServerServerOptionsProcessor("SapThriftServer") if (!optionsProcessor.process(args)) { System.exit(-1) } logInfo("Starting SparkContext") SapSQLEnv.init() org.apache.spark.util.ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() uiTab.foreach(_.detach()) } try { val server = new HiveThriftServer2(SparkSQLEnv.hiveContext) server.init(SparkSQLEnv.hiveContext.hiveconf) server.start() logInfo("SapThriftServer started") listener = new HiveThriftServer2Listener(server, SparkSQLEnv.hiveContext.conf) SparkSQLEnv.sparkContext.addSparkListener(listener) uiTab = if (SparkSQLEnv.sparkContext.getConf.getBoolean("spark.ui.enabled", true)) { Some(new ThriftServerTab(SparkSQLEnv.sparkContext)) } else { None } } catch { case e: Exception => logError("Error starting SapThriftServer", e) System.exit(-1) } } } private[hive] class SapThriftServer(val hiveContext: HiveContext) extends Logging{ def start: Unit = { logInfo("ThriftServer with SapSQLContext") logInfo("Starting SparkContext") HiveThriftServer2.startWithContext(hiveContext) } }
Example 43
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 44
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.desc import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr import org.json4s.DefaultFormats object CdrDataframeExamplesActionsApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) counts.show(5) counts.show() println("head(5): " + counts.head(5)) println("take(5): " + counts.take(5)) println("head(): " + counts.head()) println("first(5): " + counts.first()) println("count(): " + counts.count()) println("collect(): " + counts.collect()) println("collectAsList(): " + counts.collectAsList()) println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) counts.write.format("parquet").save("/tmp/parquent" + rdd.id) counts.write.format("json").save("/tmp/json" + rdd.id) counts.write.parquet("/tmp/parquent2" + rdd.id) counts.write.json("/tmp/json2" + rdd.id) counts.write.saveAsTable("count_table") cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") val prop: java.util.Properties = new java.util.Properties() counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 45
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
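The query above relies on Hive variable substitution: the SET statement stores the quoted format string under DATE_FMT, and Hive expands ${hiveconf:DATE_FMT} before execution, so the format never has to be concatenated into the Scala code. A minimal sketch of the same mechanism, assuming the hiveC HiveContext above and a placeholder table events with an epoch-seconds column ts:

// Hive expands ${hiveconf:FMT}, quotes included, before parsing the query.
hiveC.sql("SET FMT='yyyy-MM-dd|HH'")
hiveC.sql("SELECT from_unixtime(ts, ${hiveconf:FMT}) AS bucket, COUNT(*) AS n " +
  "FROM events GROUP BY from_unixtime(ts, ${hiveconf:FMT})").show()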
Example 46
Source File: BadRecordPathLoadOptionTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.carbondata import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.datastore.filesystem.{CarbonFile, CarbonFileFilter} import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.util.CarbonProperties class BadRecordPathLoadOptionTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { sql("drop table IF EXISTS salestest") } test("data load log file and csv file written at the configured location") { sql( s"""CREATE TABLE IF NOT EXISTS salestest(ID BigInt, date Timestamp, country String, actual_price Double, Quantity int, sold_price Decimal(19,2)) STORED AS carbondata TBLPROPERTIES('BAD_RECORD_PATH'='$warehouse')""") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/badrecords/datasample.csv" sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE salestest OPTIONS" + "('bad_records_logger_enable'='true','bad_records_action'='redirect', 'DELIMITER'=" + " ',', 'QUOTECHAR'= '\"')") val location: Boolean = isFilesWrittenAtBadStoreLocation assert(location) } override def afterAll { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) sql("drop table salestest") } def isFilesWrittenAtBadStoreLocation: Boolean = { val badStorePath = CarbonEnv.getCarbonTable(Some("default"), "salestest")(sqlContext.sparkSession).getTableInfo .getFactTable.getTableProperties.get("bad_record_path") + "/0/0" val carbonFile: CarbonFile = FileFactory.getCarbonFile(badStorePath) var exists: Boolean = carbonFile.exists() if (exists) { val listFiles: Array[CarbonFile] = carbonFile.listFiles(new CarbonFileFilter { override def accept(file: CarbonFile): Boolean = { if (file.getName.endsWith(".log") || file.getName.endsWith(".csv")) { return true; } return false; } }) exists = listFiles.size > 0 } return exists; } }
Example 47
Source File: DateDataTypeNullDataTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.sql.Date import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class DateDataTypeNullDataTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { sql( """CREATE TABLE IF NOT EXISTS timestampTyeNullData (ID Int, dateField date, country String, name String, phonetype String, serialname String, salary Int) STORED AS carbondata""" ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/datasamplenull.csv" sql("LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE timestampTyeNullData").collect(); } catch { case x: Throwable => x.printStackTrace() CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) } } test("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null") { checkAnswer( sql("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null"), Seq(Row(Date.valueOf("2015-07-23")) ) ) } test("SELECT * FROM timestampTyeNullData where dateField is null") { checkAnswer( sql("SELECT dateField FROM timestampTyeNullData where dateField is null"), Seq(Row(null) )) } override def afterAll { sql("drop table timestampTyeNullData") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 48
Source File: TimestampDataTypeNullDataTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.io.File import java.sql.Timestamp import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.keygenerator.directdictionary.timestamp.TimeStampGranularityConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class TimestampDataTypeNullDataTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_CUTOFF_TIMESTAMP, "2000-12-13 02:10.00.0") CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_TIME_GRANULARITY, TimeStampGranularityConstants.TIME_GRAN_SEC.toString ) sql( """CREATE TABLE IF NOT EXISTS timestampTyeNullData (ID Int, dateField Timestamp, country String, name String, phonetype String, serialname String, salary Int) STORED AS carbondata""" ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/datasamplenull.csv" sql("LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE timestampTyeNullData").collect(); } catch { case x: Throwable => CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) } } test("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null") { checkAnswer( sql("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null"), Seq(Row(Timestamp.valueOf("2015-07-23 00:00:00.0")) ) ) } test("SELECT * FROM timestampTyeNullData where dateField is null") { checkAnswer( sql("SELECT dateField FROM timestampTyeNullData where dateField is null"), Seq(Row(null) )) } override def afterAll { sql("drop table timestampTyeNullData") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 49
Source File: TimestampDataTypeDirectDictionaryWithNoDictTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.sql.Timestamp import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.keygenerator.directdictionary.timestamp.TimeStampGranularityConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class TimestampDataTypeDirectDictionaryWithNoDictTestCase extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_CUTOFF_TIMESTAMP, "2000-12-13 02:10.00.0") CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_TIME_GRANULARITY, TimeStampGranularityConstants.TIME_GRAN_SEC.toString ) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "true") sql( """ CREATE TABLE IF NOT EXISTS directDictionaryTable (empno String, doj Timestamp, salary Int) STORED AS carbondata""" ) val csvFilePath = s"$resourcesPath/datasample.csv" sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE directDictionaryTable OPTIONS" + "('DELIMITER'= ',', 'QUOTECHAR'= '\"')") } test("select doj from directDictionaryTable") { checkAnswer( sql("select doj from directDictionaryTable"), Seq(Row(Timestamp.valueOf("2016-03-14 15:00:09.0")), Row(Timestamp.valueOf("2016-04-14 15:00:09.0")), Row(null) ) ) } test("select doj from directDictionaryTable with equals filter") { checkAnswer( sql("select doj from directDictionaryTable where doj='2016-03-14 15:00:09'"), Seq(Row(Timestamp.valueOf("2016-03-14 15:00:09"))) ) } test("select doj from directDictionaryTable with greater than filter") { checkAnswer( sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'"), Seq(Row(Timestamp.valueOf("2016-04-14 15:00:09"))) ) } override def afterAll { sql("drop table directDictionaryTable") CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 50
Source File: ModelDebug.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training.pipeline import com.airbnb.aerosolve.core.ModelRecord import com.airbnb.aerosolve.core.function.{AbstractFunction, MultiDimensionSpline} import com.airbnb.aerosolve.core.util.Util import com.airbnb.aerosolve.training.pipeline.HiveUtil import com.typesafe.config.Config import org.apache.spark.SparkContext import org.apache.spark.sql.hive.HiveContext import org.slf4j.{Logger, LoggerFactory} import scala.util.Try object ModelDebug { val log: Logger = LoggerFactory.getLogger("DebugPipeline") def modelRecordToString(x: ModelRecord) : String = { if (x.weightVector != null && !x.weightVector.isEmpty) { val func = AbstractFunction.buildFunction(x) val tolerance = func.smooth(0, false) val tolerancePercentage = func.smooth(0, true) val nDTreeModelString: String = if (x.ndtreeModel != null) { val w = func.asInstanceOf[MultiDimensionSpline].getWeightsString x.ndtreeModel.toString + w } else { "" } s"%s\u0001%s\u0001%f\u0001%f\u0001%s\u0001%s\u0001%f\u0001%f\u0001%f".format( x.featureFamily, x.featureName, x.minVal, x.maxVal, x.weightVector.toString, nDTreeModelString, 0.0, tolerance, tolerancePercentage) } else { log.info(s" ${x.featureFamily} ${x.featureName} miss weightVector") "" } } def dumpModelForHive(sc: SparkContext, config: Config) = { val cfg = config.getConfig("dump_model_to_hive") dumpModel(sc, cfg, x => x.featureName != null, modelRecordToString) } def dumpModel(sc: SparkContext, config: Config, filterFunction: (ModelRecord) => Boolean, recordToString: (ModelRecord) => String): Unit = { val modelName = config.getString("model_name") val modelDump = config.getString("model_dump") val outputHiveTable = Try(config.getString("output_hive_table")).getOrElse("") val overwrite: Boolean = Try(config.getBoolean("overwrite")).getOrElse(false) val model = sc .textFile(modelName) .map(Util.decodeModel) .filter(filterFunction) .map(recordToString) .filter(_.length > 0) PipelineUtil.saveAndCommitAsTextFile(model, modelDump, overwrite) if (!outputHiveTable.isEmpty) { val hiveContext = new HiveContext(sc) val partitionKey = config.getString("partition_key") val partitionValue = config.getString("partition_value") // assume the value of partition key is string HiveUtil.updateHivePartition( hiveContext, outputHiveTable, s"$partitionKey='$partitionValue'", modelDump ) } } }
Example 51
Source File: DataLoader.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark import com.typesafe.config.ConfigFactory import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} import org.rogach.scallop.ScallopConf import org.apache.kudu.spark.kudu._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DataType, StructField, StructType} object DataLoader { class RunConf(args:Array[String]) extends ScallopConf(args){ val csvFile =opt[String]("csvFile",required = true, descr = "A CSV file to load" ) val tableName =opt[String]("tableName",required = true, descr = "A table to load" ) val storageType = opt[String]("storageType",required = true, descr = "Storage type parquet|orc|kudu|carbon") val dbName =opt[String]("dbName",required = true, descr = "Database name" ) verify() } def main(args: Array[String]): Unit = { val runConf = new RunConf(args) val scConf = new SparkConf() .setAppName("DataLoader") val sc = new SparkContext(scConf) val sqlContext = new HiveContext(sc) if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") { val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .option("inferSchema", "true") // Automatically infer data types .load(runConf.csvFile()) .repartition(10) df.registerTempTable("temp_csv") sqlContext.sql( s""" |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()} |SELECT * FROM temp_csv """.stripMargin) } if(runConf.storageType().toLowerCase() == "kudu"){ val confFile = ConfigFactory.load() val kuduMaster = confFile.getString("kudu.master.server") val kuduContext = new KuduContext(kuduMaster) val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .schema(dfTarget.schema) .load(runConf.csvFile()) .repartition(10) kuduContext.upsertRows(df,runConf.tableName()) } } private def synSchemas(inSchema:StructType, outSchema:StructType) = { val size = inSchema.fields.length val structFields = (0 to size - 1).map{ i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable) } new StructType(structFields.toArray) } }
Example 52
Source File: SamplesGenerator.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark.generation import java.lang import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.{Row, SQLContext} import pl.edu.pw.ii.zsibio.dwh.benchmark.generation.model.GeneratedSample import pl.edu.pw.ii.zsibio.dwh.benchmark.utils.Probability._ import scala.util.Random val population = sqlContext.sql( s""" |SELECT |population, |geo_id as id, |geo_country_name_en as countryName, |geo_region_name_en as region |FROM |${config.countryPopulation} """.stripMargin) val populationDist = population.map { case Row(population: Int, id: Int, countryName: String, region: String) => (region, (population.toLong, id)) }.groupByKey().collect().toMap def selectCountry(region: String): Int = { populationDist(region).selectWithProbability() } sc.parallelize(1 to n) .map((_, selectRegion())) .map(row => GeneratedSample(row._1, selectCountry(row._2.regionName), selectDisease(), row._2.afColumn)) } def selectDisease() = { val r = Math.random() val ret: java.lang.Long = if (r <= 0.05) (Random.nextInt(60) * 100).toLong else new lang.Long(-1) ret } def selectRegion(): RegionConfig = { val percentages = Seq((config.africaConfig.percent, config.africaConfig) , (config.americasConfig.percent, config.americasConfig) , (config.europaConfig.percent, config.europaConfig) , (config.finnishConfig.percent, config.finnishConfig) , (config.southAsianConfig.percent, config.southAsianConfig) , (config.westAsianConfig.percent, config.westAsianConfig) ) percentages.selectWithProbability() } }
Example 53
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService} import com.kakao.mango.text.ThreadSafeDateFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream import java.util.concurrent.{Future => JFuture} import scala.reflect.runtime.universe.TypeTag object SavingStream { val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd") val hh = ThreadSafeDateFormat("HH") val mm = ThreadSafeDateFormat("mm") val m0 = (ms: Long) => mm(ms).charAt(0) + "0" } @transient var executor: RichExecutorService = _ def ex: RichExecutorService = { if (executor == null) { this.synchronized { if (executor == null) { executor = new RichExecutorService(es.get()) } } } executor } def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = { stream.foreachRDD { (rdd, time) => ex.submit { toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*) } } } def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms)) } } def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms)) } } def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms)) } } def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms)) } } } class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) { override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd) } class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) { override def toDF(rdd: RDD[String]) = ctx.read.json(rdd) } class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) { import com.kakao.mango.json._ override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson)) } class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) { override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema) }
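These wrappers are meant to be attached to an existing DStream and pointed at a partitioned external table. A minimal hypothetical sketch, assuming a StreamingContext named ssc and that the implicit HiveContext and ExecutorSupplier required by the constructors are already in scope (both are project-specific and not shown here):

import org.apache.spark.streaming.dstream.DStream

// Hypothetical usage: persist a JSON text stream into a daily-partitioned ORC table.
// The socket address, table name and path are placeholders.
val events: DStream[String] = ssc.socketTextStream("localhost", 9999)
new JsonStream(events).saveAsDailyPartitionedTable(
  table = "events_log", path = "/warehouse/events_log", format = "orc")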
Example 54
Source File: UdtfEnableQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} import org.apache.spark.streaming.dstream.ConstantInputDStream object UdtfEnabledQuery { case class People(name: String, items: Array[String]) def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val hiveContext = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hiveContext) import hiveContext.implicits._ import streamSqlContext.createSchemaDStream val dummyRDD = sc.makeRDD(1 to 3).map(i => People(s"jack$i", Array("book", "gun"))) val dummyStream = new ConstantInputDStream[People](ssc, dummyRDD) streamSqlContext.registerDStreamAsTable(dummyStream, "people") streamSqlContext.sql( """SELECT | name, | item |FROM | people | lateral view explode(items) items AS item""".stripMargin).map(_.copy()).print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 55
Source File: UdafEnableQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import scala.collection.mutable.ListBuffer import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} import org.apache.spark.streaming.dstream.ConstantInputDStream object UdafEnabledQuery { case class Data(name: String, money: Int) def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val hiveContext = new HiveContext(sc) val streamSQlContext = new StreamSQLContext(ssc, hiveContext) val dummyRDD = sc.makeRDD(1 to 10).map(i => Data(s"jack$i", i)) val dummyStream = new ConstantInputDStream[Data](ssc, dummyRDD) val schemaStream = streamSQlContext.createSchemaDStream(dummyStream) streamSQlContext.registerDStreamAsTable(schemaStream, "data") val resultList = ListBuffer[String]() streamSQlContext.sql( """SELECT | percentile(money,0.8), | stddev_pop(money) |FROM data """.stripMargin).map(_.copy()).print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 56
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 57
Source File: HiveApp.scala From iolap with Apache License 2.0 | 5 votes |
package main.scala import scala.collection.mutable.{ListBuffer, Queue} import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext case class Person(name: String, age: Int) object SparkSqlExample { def main(args: Array[String]) { val conf = sys.env.get("SPARK_AUDIT_MASTER") match { case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) case None => new SparkConf().setAppName("Simple Sql App") } val sc = new SparkContext(conf) val hiveContext = new HiveContext(sc) import hiveContext._ sql("DROP TABLE IF EXISTS src") sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() results.foreach(println) def test(f: => Boolean, failureMsg: String) = { if (!f) { println(failureMsg) System.exit(-1) } } test(results.size == 5, "Unexpected number of selected elements: " + results) println("Test succeeded") sc.stop() } }
Example 58
Source File: DescribeHiveTableCommand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 59
Source File: CreateTableAsSelect.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.Experimental import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes} private[hive] case class CreateTableAsSelect( tableDesc: HiveTable, query: LogicalPlan, allowExisting: Boolean) extends RunnableCommand { def database: String = tableDesc.database def tableName: String = tableDesc.name override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withSchema = tableDesc.copy( schema = query.output.map(c => HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)), inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName()))) hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. if (hiveContext.catalog.tableExists(Seq(database, tableName))) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$database.$tableName already exists.") } } else { hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd } Seq.empty[Row] } override def argString: String = { s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString } }
Example 60
Source File: Main.scala From iolap with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.hive.HiveContext object Main { def main(args: Array[String]) { println("Running regression test for SPARK-8489.") val sc = new SparkContext("local", "testing") val hc = new HiveContext(sc) // This line should not throw scala.reflect.internal.MissingRequirementError. // See SPARK-8470 for more detail. val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) df.collect() println("Regression test for SPARK-8489 success!") sc.stop() } }
Example 61
Source File: OrcTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql._ private[sql] trait OrcTest extends SQLTestUtils { protected def hiveContext = sqlContext.asInstanceOf[HiveContext] import sqlContext.sparkContext import sqlContext.implicits._ protected def withOrcTable[T <: Product: ClassTag: TypeTag] (data: Seq[T], tableName: String) (f: => Unit): Unit = { withOrcDataFrame(data) { df => hiveContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } protected def makeOrcFile[T <: Product: ClassTag: TypeTag]( data: Seq[T], path: File): Unit = { data.toDF().write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath) } protected def makeOrcFile[T <: Product: ClassTag: TypeTag]( df: DataFrame, path: File): Unit = { df.write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath) } }
Example 62
Source File: SparkSQLCLIService.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import scala.collection.JavaConversions._

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.ShimLoader
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.{AbstractService, Service, ServiceException}

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.util.Utils

private[hive] class SparkSQLCLIService(hiveContext: HiveContext)
  extends CLIService
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)

    var sparkServiceUGI: UserGroupInformation = null

    if (ShimLoader.getHadoopShims.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf)
        HiveThriftServerShim.setServerUserName(sparkServiceUGI, this)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
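Client-side, the getInfo override above is what answers JDBC metadata calls. A hedged sketch follows, assuming a Thrift server is already running and the Hive JDBC driver is on the classpath; the host, port, and credentials are placeholders.

import java.sql.DriverManager

object ThriftServerInfoSketch {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "user", "")
    try {
      val md = conn.getMetaData
      // Answered by SparkSQLCLIService.getInfo via CLI_DBMS_NAME and CLI_DBMS_VER.
      println(md.getDatabaseProductName)     // expected: "Spark SQL"
      println(md.getDatabaseProductVersion)  // expected: the underlying SparkContext's version
    } finally {
      conn.close()
    }
  }
}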
Example 63
Source File: SparkSQLOperationManager.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession

import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay)(
      hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    operation
  }
}
Example 64
Source File: SparkSQLEnv.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.{HiveShim, HiveContext}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

private[hive] object SparkSQLEnv extends Logging {

  // Shared contexts for the Thrift server; populated elsewhere in this object.
  var hiveContext: HiveContext = _
  var sparkContext: SparkContext = _

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 65
Source File: AbstractSparkSQLDriver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] abstract class AbstractSparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    }
  }

  def runWrapper(command: String): CommandProcessorResponseWrapper = try {
    val result = run(command)
    new CommandProcessorResponseWrapper(result, null)
  } catch {
    case ae: AnalysisException =>
      logDebug(s"Failed in [$command]", ae)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(ae), null), ae)
    case cause: Throwable =>
      logError(s"Failed in [$command]", cause)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(cause), null), cause)
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}

private[hive] case class CommandProcessorResponseWrapper(
    rc: CommandProcessorResponse,
    cause: Throwable)
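A usage sketch of the driver API, assuming the project's concrete SparkSQLDriver subclass and an initialised SparkSQLEnv; both live outside the excerpt above, so treat those names as assumptions rather than part of the example.

package org.apache.spark.sql.hive.thriftserver

// Hypothetical harness: relies on SparkSQLEnv.init() having created the contexts
// and on a concrete SparkSQLDriver that extends AbstractSparkSQLDriver.
object DriverSketch {
  def main(args: Array[String]): Unit = {
    SparkSQLEnv.init()
    val driver = new SparkSQLDriver(SparkSQLEnv.hiveContext)
    // runWrapper never throws: failures come back as a wrapped non-zero response.
    val wrapped = driver.runWrapper("SELECT 1 AS one")
    if (wrapped.rc.getResponseCode == 0) {
      println(driver.getSchema) // column metadata captured by getResultSetSchema
    } else {
      Option(wrapped.cause).foreach(t => println(t.getMessage))
    }
    SparkSQLEnv.stop()
  }
}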