org.apache.spark.sql.hive.HiveContext Scala Examples
The following examples show how to use org.apache.spark.sql.hive.HiveContext.
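Before the examples, here is a minimal sketch of the usage pattern most of them share on Spark 1.x: wrap a SparkContext in a HiveContext, issue HiveQL through sql(...), and work with the result as a DataFrame. This is an illustrative sketch only; the master setting, table name my_table, and temp-table name are placeholders and do not come from any project listed below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object HiveContextQuickStart {
  def main(args: Array[String]): Unit = {
    // App name and local master are placeholders; configure for your own cluster.
    val conf = new SparkConf().setAppName("HiveContextQuickStart").setMaster("local[2]")
    val sc = new SparkContext(conf)

    // HiveContext extends SQLContext with HiveQL support and, if configured, metastore access.
    val hiveContext = new HiveContext(sc)

    // "my_table" is a hypothetical Hive table used only for illustration.
    val df = hiveContext.sql("SELECT key, value FROM my_table LIMIT 10")
    df.show()

    // A DataFrame can be exposed back to SQL as a temporary table (Spark 1.x API).
    df.registerTempTable("my_table_sample")
    hiveContext.sql("SELECT count(*) FROM my_table_sample").show()

    sc.stop()
  }
}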
Example 1
Source File: SqlUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 6 votes |
package com.cloudera.sa.spark.unittest.sql

import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}

import scala.collection.mutable

class SqlUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {

  @transient var sc: SparkContext = null
  @transient var hiveContext: HiveContext = null

  override def beforeAll(): Unit = {
    val envMap = Map[String, String](("Xmx", "512m"))

    val sparkConfig = new SparkConf()
    sparkConfig.set("spark.broadcast.compress", "false")
    sparkConfig.set("spark.shuffle.compress", "false")
    sparkConfig.set("spark.shuffle.spill.compress", "false")
    sparkConfig.set("spark.io.compression.codec", "lzf")
    sc = new SparkContext("local[2]", "unit test", sparkConfig)
    hiveContext = new HiveContext(sc)
  }

  override def afterAll(): Unit = {
    sc.stop()
  }

  test("Test table creation and summing of counts") {
    val personRDD = sc.parallelize(Seq(Row("ted", 42, "blue"),
      Row("tj", 11, "green"),
      Row("andrew", 9, "green")))

    hiveContext.sql("create table person (name string, age int, color string)")

    val emptyDataFrame = hiveContext.sql("select * from person limit 0")

    val personDataFrame = hiveContext.createDataFrame(personRDD, emptyDataFrame.schema)
    personDataFrame.registerTempTable("tempPerson")

    val ageSumDataFrame = hiveContext.sql("select sum(age) from tempPerson")

    val localAgeSum = ageSumDataFrame.take(10)

    assert(localAgeSum(0).get(0) == 62,
      "The sum of age should equal 62 but it equaled " + localAgeSum(0).get(0))
  }
}
Example 2
Source File: SparkSQLEnv.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 3
Source File: SystemArg.scala From mist with Apache License 2.0 | 5 votes |
package mist.api

import mist.api.data.JsMap
import org.apache.spark.{SparkContext, SparkSessionUtils}
import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaStreamingContext

trait SystemArg[A] extends ArgDef[A] {
  final def validate(params: JsMap): Extraction[Unit] = Extracted(())
}

object SystemArg {

  def apply[A](tags: Seq[String], f: => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = f
    override def describe() = Seq(InternalArgument(tags))
  }

  def apply[A](tags: Seq[String], f: FullFnContext => Extraction[A]): ArgDef[A] = new SystemArg[A] {
    override def extract(ctx: FnContext): Extraction[A] = ctx match {
      case c: FullFnContext => f(c)
      case _ =>
        val desc = s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
          s"expected ${FullFnContext.getClass.getSimpleName}"
        Failed.InternalError(desc)
    }
    override def describe() = Seq(InternalArgument(tags))
  }
}

trait SparkArgs {

  val sparkContextArg: ArgDef[SparkContext] = SystemArg(
    Seq.empty,
    c => Extracted(c.sc)
  )

  val streamingContextArg: ArgDef[StreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => {
      val ssc = StreamingContext.getActiveOrCreate(() => new StreamingContext(ctx.sc, ctx.streamingDuration))
      Extracted(ssc)
    }
  )

  val sqlContextArg: ArgDef[SQLContext] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(SQLContext.getOrCreate).extract(ctx)
  )

  // HiveContext should be cached per jvm
  // see #325
  val hiveContextArg: ArgDef[HiveContext] = new SystemArg[HiveContext] {

    var cache: HiveContext = _

    override def extract(ctx: FnContext): Extraction[HiveContext] = synchronized {
      ctx match {
        case c: FullFnContext =>
          if (cache == null)
            cache = new HiveContext(c.sc)
          Extracted(cache)
        case _ =>
          Failed.InternalError(s"Unknown type of job context ${ctx.getClass.getSimpleName} " +
            s"expected ${FullFnContext.getClass.getSimpleName}")
      }
    }

    override def describe(): Seq[ArgInfo] = Seq(InternalArgument(
      Seq(ArgInfo.HiveContextTag, ArgInfo.SqlContextTag)))
  }

  val javaSparkContextArg: ArgDef[JavaSparkContext] = sparkContextArg.map(sc => new JavaSparkContext(sc))

  val javaStreamingContextArg: ArgDef[JavaStreamingContext] = SystemArg(Seq(ArgInfo.StreamingContextTag),
    ctx => streamingContextArg.map(scc => new JavaStreamingContext(scc)).extract(ctx))

  val sparkSessionArg: ArgDef[SparkSession] = SystemArg(Seq(ArgInfo.SqlContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, false)).extract(ctx)
  )

  val sparkSessionWithHiveArg: ArgDef[SparkSession] = SystemArg(
    Seq(ArgInfo.SqlContextTag, ArgInfo.HiveContextTag),
    ctx => sparkContextArg.map(sc => SparkSessionUtils.getOrCreate(sc, true)).extract(ctx))
}

object SparkArgs extends SparkArgs
Example 4
Source File: SparkSQLOperationManager.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 5
Source File: SparkSQLSessionManager.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.concurrent.Executors

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hive.service.cli.SessionHandle
import org.apache.hive.service.cli.session.SessionManager
import org.apache.hive.service.cli.thrift.TProtocolVersion
import org.apache.hive.service.server.HiveServer2
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager

private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends SessionManager(hiveServer)
  with ReflectedCompositeService {

  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager(hiveContext)

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
    // A blocking queue holds the tasks waiting to be executed.
    // LinkedBlockingQueue is a linked-list-backed blocking queue that orders elements FIFO
    // (first in, first out) and usually has higher throughput than ArrayBlockingQueue.
    // Executors.newFixedThreadPool() uses this queue internally.
    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
    getAncestorField[Log](this, 3, "LOG").info(
      s"HiveServer2: Async execution pool size $backgroundPoolSize")

    setSuperField(this, "operationManager", sparkSqlOperationManager)
    addService(sparkSqlOperationManager)
    initCompositeService(hiveConf)
  }

  override def openSession(
      protocol: TProtocolVersion,
      username: String,
      passwd: String,
      ipAddress: String,
      sessionConf: java.util.Map[String, String],
      withImpersonation: Boolean,
      delegationToken: String): SessionHandle = {
    hiveContext.openSession()
    val sessionHandle = super.openSession(
      protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken)
    val session = super.getSession(sessionHandle)
    HiveThriftServer2.listener.onSessionCreated(
      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
    sessionHandle
  }

  override def closeSession(sessionHandle: SessionHandle) {
    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
    super.closeSession(sessionHandle)
    sparkSqlOperationManager.sessionToActivePool -= sessionHandle

    hiveContext.detachSession()
  }
}
Example 6
Source File: SparkSQLDriver.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{ArrayList => JArrayList, List => JList}

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 7
Source File: SparkSQLCLIService.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.Utils
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.server.HiveServer2
import org.apache.hive.service.{AbstractService, Service, ServiceException}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._

import scala.collection.JavaConversions._

private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends CLIService(hiveServer)
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)
    var sparkServiceUGI: UserGroupInformation = null

    if (UserGroupInformation.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = Utils.getUGI()
        setSuperField(this, "serviceUGI", sparkServiceUGI)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
Example 8
Source File: Main.scala From spark1.52 with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object Main {
  def main(args: Array[String]) {
    // scalastyle:off println
    println("Running regression test for SPARK-8489.")
    val sc = new SparkContext("local", "testing")
    val hc = new HiveContext(sc)
    // This line should not throw scala.reflect.internal.MissingRequirementError.
    // See SPARK-8470 for more detail.
    val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3")))
    df.collect()
    println("Regression test for SPARK-8489 success!")
    // scalastyle:on println
    sc.stop()
  }
}
Example 9
Source File: HiveApp.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package main.scala

import scala.collection.mutable.{ListBuffer, Queue}

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

case class Person(name: String, age: Int)

object SparkSqlExample {

  def main(args: Array[String]) {
    // The difference between System.getenv() and System.getProperties():
    // System.getenv() returns OS environment variables, which can be set, for example, in the
    // ".bashrc" file in the current user's home directory.
    // System.getProperties() returns JVM properties, which are passed on the command line
    // with the "-D" option.
    val conf = sys.env.get("SPARK_AUDIT_MASTER") match {
      case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master)
      case None => new SparkConf().setAppName("Simple Sql App")
    }
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    import hiveContext._
    sql("DROP TABLE IF EXISTS src")
    sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src")
    val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect()
    results.foreach(println)

    def test(f: => Boolean, failureMsg: String) = {
      if (!f) {
        println(failureMsg)
        System.exit(-1)
      }
    }

    test(results.size == 5, "Unexpected number of selected elements: " + results)
    println("Test succeeded")
    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: HiveReader.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils.io

import it.agilelab.bigdata.DataQuality.sources.HiveTableConfig
import it.agilelab.bigdata.DataQuality.utils.Logging
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext

import scala.util.Try

  def loadHiveTable(inputConf: HiveTableConfig)(
      implicit hiveContext: HiveContext): Seq[DataFrame] = {
    // You can specify a template for queries here. Currently it's just an input query as it is
    val full_query = inputConf.query

    Try {
      Seq(hiveContext.sql(full_query))
    }.getOrElse({
      log.warn("Failed to load HIVE table")
      Seq.empty
    })
  }
}
Example 11
Source File: DQMainClass.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.utils

import java.util.Locale

import it.agilelab.bigdata.DataQuality.utils.io.HistoryDBManager
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.log4j.{Level, Logger}
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext

trait DQMainClass { this: DQSparkContext with Logging =>

  private def initLogger(): Unit = {
    Logger.getLogger("org").setLevel(Level.WARN)
    Logger.getLogger("org.apache.spark.scheduler.TaskSetManager").setLevel(Level.WARN)
    Logger.getLogger("akka").setLevel(Level.OFF)
    Logger.getLogger("io.netty").setLevel(Level.OFF)
    Logger.getLogger("org.spark-project.jetty").setLevel(Level.OFF)
    Logger.getLogger("org.apache.hadoop.hdfs.KeyProviderCache").setLevel(Level.OFF)
  }

  private def makeFileSystem(settings: DQSettings, sc: SparkContext): FileSystem = {
    if (sc.isLocal) FileSystem.getLocal(sc.hadoopConfiguration)
    else {
      if (settings.s3Bucket.isDefined) {
        sc.hadoopConfiguration.set("fs.defaultFS", settings.s3Bucket.get)
        sc.hadoopConfiguration.set("fs.AbstractFileSystem.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
      }

      FileSystem.get(sc.hadoopConfiguration)
    }
  }

  protected def body()(implicit fs: FileSystem,
                       sparkContext: SparkContext,
                       sqlContext: SQLContext,
                       sqlWriter: HistoryDBManager,
                       settings: DQSettings): Boolean

  def preMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s" Starting execution of task $task")
    log.warn("************************************************************************")
  }

  def postMessage(task: String): Unit = {
    log.warn("************************************************************************")
    log.warn(s" Finishing execution of task $task")
    log.warn("************************************************************************")
  }

  def main(args: Array[String]): Unit = {
    // set to avoid casting problems in metric result name generation
    Locale.setDefault(Locale.ENGLISH)
    initLogger()

    DQCommandLineOptions.parser().parse(args, DQCommandLineOptions("", "")) match {
      case Some(commandLineOptions) =>
        // Load our own config values from the default location, application.conf
        val settings = new DQSettings(commandLineOptions)
        val sparkContext = makeSparkContext(settings)
        val fs = makeFileSystem(settings, sparkContext)

        settings.logThis()(log)

        val sqlContext: SQLContext =
          if (settings.hiveDir.isDefined) {
            val hc = new HiveContext(sparkContext)
            hc.setConf("hive.metastore.warehouse.dir", settings.hiveDir.get)
            hc
          } else makeSqlContext(sparkContext)

        val historyDatabase = new HistoryDBManager(settings)

        // Starting application body
        preMessage(s"{${settings.appName}}")
        val startTime = System.currentTimeMillis()
        body()(fs, sparkContext, sqlContext, historyDatabase, settings)
        postMessage(s"{${settings.appName}}")

        log.info(s"Execution finished in [${(System.currentTimeMillis() - startTime) / 60000}] min(s)")
        log.info("Closing application...")

        historyDatabase.closeConnection()
        sparkContext.stop()

        log.info("Spark context were terminated. Exiting...")

      case None =>
        log.error("Wrong parameters provided")
        throw new Exception("Wrong parameters provided")
    }
  }
}
Example 12
Source File: HiveManageTable.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.util

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.StructType

trait HiveManageTable {
  def toRow(partition: String): Row
}

object HiveManageTable {
  def saveRDDToHive[T <: HiveManageTable](hiveContext: HiveContext,
                                          data: RDD[T],
                                          table: String,
                                          schema: StructType,
                                          mode: SaveMode,
                                          partition: String,
                                          partitionValue: String,
                                          hiveConfig: Map[String, String] = dynamicPartitions): Unit = {
    hiveConfig.foreach {
      case (key, value) =>
        hiveContext.setConf(key, value)
    }

    hiveContext.createDataFrame(data.map(_.toRow(partitionValue)), schema)
      .write
      .mode(mode)
      .partitionBy(partition)
      .insertInto(table)
  }

  lazy val dynamicPartitions = Map(
    "hive.exec.dynamic.partition" -> "true",
    "hive.exec.dynamic.partition.mode" -> "nonstrict"
  )
}
Example 13
Source File: BaseBinaryRegressionTrainer.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.strategy.trainer

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.hive.HiveContext

import com.airbnb.common.ml.strategy.config.TrainingOptions
import com.airbnb.common.ml.strategy.data.{BaseBinarySample, TrainingData}
import com.airbnb.common.ml.strategy.params.StrategyParams

case class BaseBinaryRegressionTrainer(
    strategyParams: StrategyParams[BaseBinarySample],
    trainingDataType: TrainingData[BaseBinarySample]
) extends BinaryRegressionTrainer[BaseBinarySample] {

  override def getLearningRate(
      r0: Double,
      r1: Double,
      example: BaseBinarySample,
      options: TrainingOptions
  ): Double = {
    val x = example.x
    val learningRate = if (example.label) {
      r1 * x
    } else {
      1 - x
    }
    r0 * learningRate
  }

  override def createDataFrameFromModelOutput(
      models: RDD[(String, StrategyParams[BaseBinarySample])],
      hc: HiveContext
  ): DataFrame = {
    ???
  }
}
Example 14
Source File: ModelOutput.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.strategy.data

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SaveMode}

import com.airbnb.common.ml.strategy.config.TrainingOptions
import com.airbnb.common.ml.strategy.eval.BinaryMetrics
import com.airbnb.common.ml.strategy.params.StrategyParams
import com.airbnb.common.ml.util.HiveManageTable

case class ModelOutput[T](
    id: String,
    params: StrategyParams[T],
    loss: Double,
    evalMetrics: BinaryMetrics,
    holdoutMetrics: BinaryMetrics,
    options: TrainingOptions
) extends HiveManageTable {

  override def toRow(partition: String): Row = {
    Row(
      id.toLong,
      holdoutMetrics.posCount,
      holdoutMetrics.negCount,
      holdoutMetrics.posSugHigher,
      holdoutMetrics.posSugLower,
      holdoutMetrics.negSugHigher,
      holdoutMetrics.negSugLower,
      holdoutMetrics.increasePrecision,
      holdoutMetrics.increaseRecall,
      holdoutMetrics.decreasePrecision,
      holdoutMetrics.decreaseRecall,
      holdoutMetrics.trueRegret,
      holdoutMetrics.trueRegretMedian,
      holdoutMetrics.trueRegret75Percentile,
      holdoutMetrics.falseRegret,
      holdoutMetrics.trueIncreaseMagnitude,
      holdoutMetrics.trueDecreaseMagnitude,
      holdoutMetrics.falseDecreaseMagnitude,
      holdoutMetrics.falseIncreaseMagnitude,
      params.params,
      loss,
      options.toPartialArray,
      partition
    )
  }
}

object ModelOutput {
  lazy val schema = StructType(
    Seq(
      StructField("id", LongType),
      StructField("posCount", IntegerType),
      StructField("negCount", IntegerType),
      StructField("posSugHigher", IntegerType),
      StructField("posSugLower", IntegerType),
      StructField("negSugHigher", IntegerType),
      StructField("negSugLower", IntegerType),
      StructField("increasePrecision", DoubleType),
      StructField("increaseRecall", DoubleType),
      StructField("decreasePrecision", DoubleType),
      StructField("decreaseRecall", DoubleType),
      StructField("trueRegret", DoubleType),
      StructField("trueRegretMedian", DoubleType),
      StructField("trueRegret75Percentile", DoubleType),
      StructField("falseRegret", DoubleType),
      StructField("trueIncreaseMagnitude", DoubleType),
      StructField("trueDecreaseMagnitude", DoubleType),
      StructField("falseDecreaseMagnitude", DoubleType),
      StructField("falseIncreaseMagnitude", DoubleType),
      StructField("params", ArrayType(DoubleType)),
      StructField("loss", DoubleType),
      StructField("options", ArrayType(DoubleType)),
      StructField("model", StringType)
    )
  )

  def save[T](
      hiveContext: HiveContext,
      data: RDD[ModelOutput[T]],
      table: String,
      partition: String
  ): Unit = {
    HiveManageTable.saveRDDToHive(
      hiveContext,
      data,
      table,
      ModelOutput.schema,
      SaveMode.Overwrite,
      "model",
      partition)
  }
}
Example 15
Source File: ModelData.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.xgboost.data

import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.{DataFrame, Row}

import com.airbnb.common.ml.util.HiveUtil

object ModelData {

  def parseRowToLabelAndVaue(row: Row, labelPos: Int, dropCount: Int): (Float, Array[Float]) = {
    // for row doesn't contains label, pass in negative labelPos
    val label: Number = if (labelPos >= 0) {
      row.getAs[Number](labelPos)
    } else {
      0
    }

    val seq = row.toSeq.drop(dropCount).map(x => {
      if (x != null) {
        x.asInstanceOf[Number].floatValue()
      } else {
        ModelData.NULL_VALUE
      }
    }).toArray
    (label.floatValue(), seq)
  }

  def parseRowToRawXgboostLabeledPoint(row: Row, labelPos: Int, dropCount: Int): LabeledPoint = {
    val (label, seq) = parseRowToLabelAndVaue(row, labelPos, dropCount)
    LabeledPoint.fromDenseVector(label, seq)
  }

  def getLabeledPoints(
      sc: SparkContext,
      query: String,
      trainingLabeledPoint: TrainingModelData
  ): RDD[(String, Seq[LabeledPoint])] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      trainingLabeledPoint.parseRowToXgboostLabeledPoint)
  }

  def getScoringLabeledPoints(sc: SparkContext,
                              query: String,
                              scoringLabeledPoint: ScoringModelData): RDD[(String, ScoringLabeledPoint)] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrame(
      df,
      // score_query_head of scoring.conf also defined S_node_10k_id same as TRAINING_KEY_INDEX
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      scoringLabeledPoint.parseRowToXgboostLabeledPointAndData)
  }

  def getLabeledPointsAndString(sc: SparkContext,
                                query: String,
                                scoringLabeledPoint: ScoringModelData): RDD[(String, Seq[ScoringLabeledPoint])] = {
    val df = ModelData.getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      // score_query_head of scoring.conf also defined S_node_10k_id same as TRAINING_KEY_INDEX
      ModelData.parseKeyFromHiveRow(ModelData.TRAINING_KEY_INDEX),
      scoringLabeledPoint.parseRowToXgboostLabeledPointAndData)
  }

  // parseRowToLabelAndVaue can't return null, so use -1 if input is null
  // this is same with train_with_prob.conf 's hql query.
  val NULL_VALUE: Int = -1
  // refer to query in xgboost/search.conf
  val TRAINING_KEY_INDEX: Int = 1
  // refer to pricing.grid_search
  val PARAMS_KEY_INDEX: Int = 0
  val PARAMS_INDEX: Int = 1

  // default use second field as node key
  def parseKeyFromHiveRow(keyIndex: Int)(row: Row): String = {
    row.getAs[Long](keyIndex).toString
  }

  def getParams(sc: SparkContext, query: String): RDD[(String, Seq[Array[Double]])] = {
    val df = getDataFrame(sc, query)
    HiveUtil.loadDataFromDataFrameGroupByKey(
      df,
      ModelData.parseKeyFromHiveRow(0),
      parseRowToParams)
  }

  def parseRowToParams(row: Row): Array[Double] = {
    row.getAs[scala.collection.mutable.WrappedArray[Double]](ModelData.PARAMS_INDEX).toArray
  }

  def getDataFrame(sc: SparkContext, query: String): DataFrame = {
    val hc = new HiveContext(sc)
    hc.sql(query)
  }
}
Example 16
Source File: XGBoostScoringPipeline.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.xgboost

import com.typesafe.config.Config
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.{DMatrix, XGBoost}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

import com.airbnb.common.ml.util.{PipelineUtil, ScalaLogging}
import com.airbnb.common.ml.xgboost.config.XGBoostScoringConfig
import com.airbnb.common.ml.xgboost.data.{ModelData, ScoringLabeledPoint, ScoringModelData}

object XGBoostScoringPipeline extends ScalaLogging {

  def loadModels(path: String): Map[String, ml.dmlc.xgboost4j.scala.Booster] = {
    val streams = PipelineUtil.getHDFSInputStreams(path)
    streams.map {
      case (id, stream) => {
        (id, XGBoost.loadModel(stream))
      }
    }.toMap
  }

  def scorePartition(
      path: String,
      groupNumber: Int
  )(iter: Iterator[(String, ScoringLabeledPoint)]): Iterator[String] = {
    iter.grouped(groupNumber).flatMap { seq =>
      seq.groupBy(_._1).flatMap {
        case (id, scoringData) => {
          val data = scoringData.map(x => x._2.labeledPoint)
          val model_path = path + id
          val prediction = scoreWithPath(model_path, data)
          prediction.zip(scoringData.map(x => x._2.data)).map {
            case (score, index) => {
              s"$index\t${score(0)}"
            }
          }
        }
      }.iterator
    }
  }

  def baseScore(
      modelData: ScoringModelData,
      sc: SparkContext,
      config: Config
  ): Unit = {
    // read training data, available at xgboost/demo/data
    val conf = XGBoostScoringConfig.loadConfig(sc, config)
    val data = ModelData.getScoringLabeledPoints(
      sc, conf.query, modelData)

    val output: RDD[String] = data.mapPartitions(scorePartition(conf.modelBasePath, conf.groupNumber))

    if (conf.saveHiveTable) {
      val hc = new HiveContext(sc)
      PipelineUtil.saveToHdfsAndUpdateHive(
        hc, conf.outputPath, conf.outputTable, conf.partitionSpec, output, conf.overwrite)
    } else {
      PipelineUtil.saveAndCommitAsTextFile(output, conf.outputPath, conf.overwrite)
    }
  }

  def scoreWithPath(
      path: String,
      data: Seq[LabeledPoint]
  ): Array[Array[Float]] = {
    logger.info(s"model_path: $path")
    val scoring = new DMatrix(data.iterator, null)
    val inputStream = PipelineUtil.getHDFSInputStream(path)
    val model = XGBoost.loadModel(inputStream)
    // output array[[score],[score],...]
    val output = model.predict(scoring)
    scoring.delete()
    model.dispose
    assert(data.length == output.length)
    output
  }
}
Example 17
Source File: HiveUtil.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training.pipeline

import org.apache.spark.sql.hive.HiveContext

object HiveUtil {
  def updateHivePartition(hc: HiveContext,
                          hiveTable: String,
                          partitionSpec: String,
                          hdfsLocation: String): Boolean = {
    if (!hiveTable.contains('.')) {
      throw new RuntimeException(s"Missing namespace for the hive table $hiveTable.")
    }
    val Array(namespace, table) = hiveTable.split('.')
    hc.sql(s"USE $namespace")
    hc.sql(s"ALTER TABLE $table DROP IF EXISTS PARTITION ($partitionSpec)")
    hc.sql(s"ALTER TABLE $table ADD PARTITION ($partitionSpec) location '$hdfsLocation'")
    true
  }
}
Example 18
Source File: UISeleniumSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import scala.util.Random

import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.openqa.selenium.WebDriver
import org.openqa.selenium.htmlunit.HtmlUnitDriver
import org.scalatest.concurrent.Eventually._
import org.scalatest.selenium.WebBrowser
import org.scalatest.time.SpanSugar._
import org.scalatest.{BeforeAndAfterAll, Matchers}

import org.apache.spark.sql.hive.HiveContext

class UISeleniumSuite
  extends HiveThriftJdbcTest
  with WebBrowser with Matchers with BeforeAndAfterAll {

  implicit var webDriver: WebDriver = _
  var server: HiveThriftServer2 = _
  var hc: HiveContext = _
  val uiPort = 20000 + Random.nextInt(10000)

  override def mode: ServerMode.Value = ServerMode.binary

  override def beforeAll(): Unit = {
    webDriver = new HtmlUnitDriver
    super.beforeAll()
  }

  override def afterAll(): Unit = {
    if (webDriver != null) {
      webDriver.quit()
    }
    super.afterAll()
  }

  override protected def serverStartCommand(port: Int) = {
    val portConf = if (mode == ServerMode.binary) {
      ConfVars.HIVE_SERVER2_THRIFT_PORT
    } else {
      ConfVars.HIVE_SERVER2_THRIFT_HTTP_PORT
    }

    s"""$startScript
       |  --master local
       |  --hiveconf hive.root.logger=INFO,console
       |  --hiveconf ${ConfVars.METASTORECONNECTURLKEY}=$metastoreJdbcUri
       |  --hiveconf ${ConfVars.METASTOREWAREHOUSE}=$warehousePath
       |  --hiveconf ${ConfVars.HIVE_SERVER2_THRIFT_BIND_HOST}=localhost
       |  --hiveconf ${ConfVars.HIVE_SERVER2_TRANSPORT_MODE}=$mode
       |  --hiveconf $portConf=$port
       |  --driver-class-path ${sys.props("java.class.path")}
       |  --conf spark.ui.enabled=true
       |  --conf spark.ui.port=$uiPort
     """.stripMargin.split("\\s+").toSeq
  }

  ignore("thrift server ui test") {
    withJdbcStatement { statement =>
      val baseURL = s"http://localhost:$uiPort"

      val queries = Seq(
        "CREATE TABLE test_map(key INT, value STRING)",
        s"LOAD DATA LOCAL INPATH '${TestData.smallKv}' OVERWRITE INTO TABLE test_map")

      queries.foreach(statement.execute)

      eventually(timeout(10 seconds), interval(50 milliseconds)) {
        go to baseURL
        find(cssSelector("""ul li a[href*="sql"]""")) should not be None
      }

      eventually(timeout(10 seconds), interval(50 milliseconds)) {
        go to (baseURL + "/sql")
        find(id("sessionstat")) should not be None
        find(id("sqlstat")) should not be None

        // check whether statements exists
        queries.foreach { line =>
          findAll(cssSelector("""ul table tbody tr td""")).map(_.text).toList should contain (line)
        }
      }
    }
  }
}
Example 19
Source File: MistScContext.scala From mist with Apache License 2.0 | 5 votes |
package io.hydrosphere.mist.worker

import org.apache.spark.api.java.JavaSparkContext
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.{Duration => SDuration}
import org.apache.spark.{SparkConf, SparkContext, SparkSessionUtils}

import scala.collection.mutable
import scala.concurrent.duration.Duration

class MistScContext(
  val sc: SparkContext,
  val namespace: String,
  val streamingDuration: SDuration = SDuration(40 * 1000)
) {

  private val jars = mutable.Buffer.empty[String]

  def isK8S: Boolean = sc.getConf.get("spark.master").startsWith("k8s://")

  def addJar(artifact: SparkArtifact): Unit = synchronized {
    val path = if (isK8S) artifact.url else artifact.local.getAbsolutePath
    if (!jars.contains(path)) {
      sc.addJar(path)
      jars += path
    }
  }

  def getUIAddress(): Option[String] = SparkUtils.getSparkUiAddress(sc)

  //TODO: can we call that inside python directly using setupConfiguration?
  // python support
  def sparkConf: SparkConf = sc.getConf

  // python support
  def javaContext: JavaSparkContext = new JavaSparkContext(sc)

  // python support
  def sqlContext: SQLContext = new SQLContext(sc)

  // python support
  def hiveContext: HiveContext = new HiveContext(sc)

  def sparkSession(enableHive: Boolean): SparkSession = SparkSessionUtils.getOrCreate(sc, enableHive)

  def stop(): Unit = {
    sc.stop()
  }
}

object MistScContext {

  def apply(id: String, streamingDuration: Duration, sparkConf: SparkConf): MistScContext = {
    val upd = sparkConf.clone()
      .setAppName(id)
      .set("spark.streaming.stopSparkContextByDefault", "false")

    val duration = SDuration(streamingDuration.toMillis)
    val sc = new SparkContext(upd)
    new MistScContext(sc, id, duration)
  }

  def apply(id: String, streamingDuration: Duration): MistScContext =
    apply(id, streamingDuration, new SparkConf())
}
Example 20
Source File: HiveApp.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package main.scala

import scala.collection.mutable.{ListBuffer, Queue}

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.hive.HiveContext

case class Person(name: String, age: Int)

object SparkSqlExample {

  def main(args: Array[String]) {
    val conf = sys.env.get("SPARK_AUDIT_MASTER") match {
      case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master)
      case None => new SparkConf().setAppName("Simple Sql App")
    }
    val sc = new SparkContext(conf)
    val hiveContext = new HiveContext(sc)

    import hiveContext._
    sql("DROP TABLE IF EXISTS src")
    sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)")
    sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src")
    val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect()
    results.foreach(println)

    def test(f: => Boolean, failureMsg: String) = {
      if (!f) {
        println(failureMsg)
        System.exit(-1)
      }
    }

    test(results.size == 5, "Unexpected number of selected elements: " + results)
    println("Test succeeded")
    sc.stop()
  }
}
// scalastyle:on println
Example 21
Source File: CreateViewAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.hive.{HiveMetastoreTypes, HiveContext}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}

// TODO: Note that this class can NOT canonicalize the view SQL string entirely, which is different
// from Hive and may not work for some cases like create view on self join.
private[hive] case class CreateViewAsSelect(
    tableDesc: HiveTable,
    childSchema: Seq[Attribute],
    allowExisting: Boolean,
    orReplace: Boolean) extends RunnableCommand {

  assert(tableDesc.schema == Nil || tableDesc.schema.length == childSchema.length)
  assert(tableDesc.viewText.isDefined)

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]

    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // view already exists, will do nothing, to keep consistent with Hive
      } else if (orReplace) {
        hiveContext.catalog.client.alertView(prepareTable())
      } else {
        throw new AnalysisException(s"View $tableIdentifier already exists. " +
          "If you want to update the view definition, please use ALTER VIEW AS or " +
          "CREATE OR REPLACE VIEW AS")
      }
    } else {
      hiveContext.catalog.client.createView(prepareTable())
    }

    Seq.empty[Row]
  }

  private def prepareTable(): HiveTable = {
    // setup column types according to the schema of child.
    val schema = if (tableDesc.schema == Nil) {
      childSchema.map { attr =>
        HiveColumn(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), null)
      }
    } else {
      childSchema.zip(tableDesc.schema).map { case (attr, col) =>
        HiveColumn(col.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), col.comment)
      }
    }

    val columnNames = childSchema.map(f => verbose(f.name))

    // When user specified column names for view, we should create a project to do the renaming.
    // When no column name specified, we still need to create a project to declare the columns
    // we need, to make us more robust to top level `*`s.
    val projectList = if (tableDesc.schema == Nil) {
      columnNames.mkString(", ")
    } else {
      columnNames.zip(tableDesc.schema.map(f => verbose(f.name))).map {
        case (name, alias) => s"$name AS $alias"
      }.mkString(", ")
    }

    val viewName = verbose(tableDesc.name)

    val expandedText = s"SELECT $projectList FROM (${tableDesc.viewText.get}) $viewName"

    tableDesc.copy(schema = schema, viewText = Some(expandedText))
  }

  // escape backtick with double-backtick in column name and wrap it with backtick.
  private def verbose(name: String) = s"`${name.replaceAll("`", "``")}`"
}
Example 22
Source File: CreateTableAsSelect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.hive.client.{HiveColumn, HiveTable}
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes, MetastoreRelation}
import org.apache.spark.sql.{AnalysisException, Row, SQLContext}

private[hive] case class CreateTableAsSelect(
    tableDesc: HiveTable,
    query: LogicalPlan,
    allowExisting: Boolean) extends RunnableCommand {

  val tableIdentifier = TableIdentifier(tableDesc.name, Some(tableDesc.database))

  override def children: Seq[LogicalPlan] = Seq(query)

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val hiveContext = sqlContext.asInstanceOf[HiveContext]
    lazy val metastoreRelation: MetastoreRelation = {
      import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
      import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe
      import org.apache.hadoop.io.Text
      import org.apache.hadoop.mapred.TextInputFormat

      val withFormat = tableDesc.copy(
        inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)),
        outputFormat = tableDesc.outputFormat
          .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)),
        serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName())))

      val withSchema = if (withFormat.schema.isEmpty) {
        // Hive doesn't support specifying the column list for target table in CTAS
        // However we don't think SparkSQL should follow that.
        tableDesc.copy(schema = query.output.map(c =>
          HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)))
      } else {
        withFormat
      }

      hiveContext.catalog.client.createTable(withSchema)

      // Get the Metastore Relation
      hiveContext.catalog.lookupRelation(tableIdentifier, None) match {
        case r: MetastoreRelation => r
      }
    }
    // TODO ideally, we should get the output data ready first and then
    // add the relation into catalog, just in case of failure occurs while data
    // processing.
    if (hiveContext.catalog.tableExists(tableIdentifier)) {
      if (allowExisting) {
        // table already exists, will do nothing, to keep consistent with Hive
      } else {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
    } else {
      hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}}, TableName: ${tableDesc.name}, InsertIntoHiveTable]"
  }
}
Example 23
Source File: Main.scala From BigDatalog with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object Main {
  def main(args: Array[String]) {
    // scalastyle:off println
    println("Running regression test for SPARK-8489.")
    val sc = new SparkContext("local", "testing")
    val hc = new HiveContext(sc)
    // This line should not throw scala.reflect.internal.MissingRequirementError.
    // See SPARK-8470 for more detail.
    val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3")))
    df.collect()
    println("Regression test for SPARK-8489 success!")
    // scalastyle:on println
    sc.stop()
  }
}
Example 24
Source File: SparkSQLCLIService.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import scala.collection.JavaConverters._

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.Utils
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.server.HiveServer2
import org.apache.hive.service.{AbstractService, Service, ServiceException}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._

private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends CLIService(hiveServer)
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)
    var sparkServiceUGI: UserGroupInformation = null

    if (UserGroupInformation.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = Utils.getUGI()
        setSuperField(this, "serviceUGI", sparkServiceUGI)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.asScala.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
Example 25
Source File: SparkSQLDriver.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.{Arrays, ArrayList => JArrayList, List => JList}

import org.apache.log4j.LogManager
import org.apache.spark.sql.AnalysisException

import scala.collection.JavaConverters._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse
import org.apache.spark.Logging
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] class SparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext)
  extends Driver
  with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.isEmpty) {
      new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas.asJava, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    } catch {
      case ae: AnalysisException =>
        logDebug(s"Failed in [$command]", ae)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae)
      case cause: Throwable =>
        logError(s"Failed in [$command]", cause)
        new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause)
    }
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getResults(res: JList[_]): Boolean = {
    if (hiveResponse == null) {
      false
    } else {
      res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava)
      hiveResponse = null
      true
    }
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}
Example 26
Source File: SparkSQLSessionManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.util.concurrent.Executors

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.conf.HiveConf.ConfVars
import org.apache.hive.service.cli.SessionHandle
import org.apache.hive.service.cli.session.SessionManager
import org.apache.hive.service.cli.thrift.TProtocolVersion
import org.apache.hive.service.server.HiveServer2
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager

private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, hiveContext: HiveContext)
  extends SessionManager(hiveServer)
  with ReflectedCompositeService {

  private lazy val sparkSqlOperationManager = new SparkSQLOperationManager()

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    // Create operation log root directory, if operation logging is enabled
    if (hiveConf.getBoolVar(ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED)) {
      invoke(classOf[SessionManager], this, "initOperationLogRootDir")
    }

    val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS)
    setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize))
    getAncestorField[Log](this, 3, "LOG").info(
      s"HiveServer2: Async execution pool size $backgroundPoolSize")

    setSuperField(this, "operationManager", sparkSqlOperationManager)
    addService(sparkSqlOperationManager)
    initCompositeService(hiveConf)
  }

  override def openSession(
      protocol: TProtocolVersion,
      username: String,
      passwd: String,
      ipAddress: String,
      sessionConf: java.util.Map[String, String],
      withImpersonation: Boolean,
      delegationToken: String): SessionHandle = {
    val sessionHandle = super.openSession(
      protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken)
    val session = super.getSession(sessionHandle)
    HiveThriftServer2.listener.onSessionCreated(
      session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername)
    val ctx = if (hiveContext.hiveThriftServerSingleSession) {
      hiveContext
    } else {
      hiveContext.newSession()
    }
    ctx.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion)
    sparkSqlOperationManager.sessionToContexts += sessionHandle -> ctx
    sessionHandle
  }

  override def closeSession(sessionHandle: SessionHandle) {
    HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString)
    super.closeSession(sessionHandle)
    sparkSqlOperationManager.sessionToActivePool -= sessionHandle
    sparkSqlOperationManager.sessionToContexts.remove(sessionHandle)
  }
}
Example 27
Source File: SparkSQLOperationManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession
import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager()
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()
  val sessionToContexts = Map[SessionHandle, HiveContext]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val hiveContext = sessionToContexts(parentSession.getSessionHandle)
    val runInBackground = async && hiveContext.hiveThriftServerAsync
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay,
      runInBackground)(hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    logDebug(s"Created Operation for $statement with session=$parentSession, " +
      s"runInBackground=$runInBackground")
    operation
  }
}
Example 28
Source File: SparkSQLEnv.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 29
Source File: ScalaSparkSQLBench.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.hive.HiveContext

object ScalaSparkSQLBench {
  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println(
        s"Usage: $ScalaSparkSQLBench <workload name> <SQL script file>"
      )
      System.exit(1)
    }
    val workload_name = args(0)
    val sql_file = args(1)
    val sparkConf = new SparkConf().setAppName(workload_name)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(sparkConf)
    val hc = new HiveContext(sc)

    val _sql = scala.io.Source.fromFile(sql_file).mkString
    _sql.split(';').foreach { x =>
      if (x.trim.nonEmpty)
        hc.sql(x)
    }

    sc.stop()
  }
}
Example 30
Source File: KuduAccountMartSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduAccountMartSimpleSums {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> <kuduMaster> " +
        "<kuduAccountMartTableName> ")
      return
    }
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduAccountMartTableName = args(2)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduAccountMartTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load.
      registerTempTable("account_mart_tmp")

    println("------------")
    val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id").
      take(100)
    println("------------")
    values.foreach(println)
    println("------------")

    sc.stop()
  }
}
Example 31
Source File: KuduToNestedHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu

import com.cloudera.sa.taxi360.model.NyTaxiYellowTripBuilder
import org.apache.spark.sql.Row
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.{SparkConf, SparkContext}

object KuduToNestedHDFS {
  def main(args: Array[String]): Unit = {
    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<kuduTaxiTripTableName> " +
        "<hdfsTaxiNestedTableName> ")
      return
    }
    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val kuduTaxiTripTableName = args(2)
    val hdfsTaxiNestedTableName = args(3)

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val hiveContext = new HiveContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> kuduTaxiTripTableName,
      "kudu.master" -> kuduMaster)

    hiveContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    val kuduDataDf = hiveContext.sql("select * from ny_taxi_trip_tmp")

    val newNestedDf = kuduDataDf.map(r => {
      val pojo = NyTaxiYellowTripBuilder.build(r)
      (pojo.vender_id, pojo)
    }).groupByKey().map(grp => {
      Row(grp._1, grp._2.map(p => {
        Row(p.passenger_count, p.payment_type, p.total_amount, p.fare_amount)
      }))
    })

    hiveContext.sql("create table " + hdfsTaxiNestedTableName + "( " +
      " vender_id string," +
      " trip array<struct< " +
      " passenger_count: INT," +
      " payment_type: STRING, " +
      " total_amount: DOUBLE, " +
      " fare_amount: DOUBLE " +
      " >>" +
      " ) stored as parquet")

    val emptyDf = hiveContext.sql("select * from " + hdfsTaxiNestedTableName + " limit 0")

    hiveContext.createDataFrame(newNestedDf, emptyDf.schema).registerTempTable("tmpNested")

    hiveContext.sql("insert into " + hdfsTaxiNestedTableName + " select * from tmpNested")

    sc.stop()
  }
}
Example 32
Source File: KuduToHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiTripTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiTripTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("kuduTaxiTripTableName") hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " + " STORED AS PARQUET " + " AS SELECT * FROM kuduTaxiTripTableName") sc.stop() } }
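In HiveQL, the storage clause of a CREATE TABLE ... AS SELECT statement goes before the AS SELECT part. A minimal sketch of that pattern, assuming a HiveContext named hiveContext and placeholder table names:

// CTAS into a Parquet-backed Hive table; both table names are placeholders.
hiveContext.sql("CREATE TABLE trips_parquet STORED AS PARQUET AS SELECT * FROM kudu_trips_tmp")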
Example 33
Source File: KuduAppEventSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAppEventSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAppEventTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAppEventTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAppEventTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("app_event_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 34
Source File: DateDataTypeDirectDictionaryWithNoDictTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.io.File import java.sql.Date import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class DateDataTypeDirectDictionaryWithNoDictTestCase extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { sql("drop table if exists directDictionaryTable") CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "true") sql( """ CREATE TABLE IF NOT EXISTS directDictionaryTable (empno String, doj Date, salary Int) STORED AS carbondata """ ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd") val csvFilePath = s"$resourcesPath/datasample.csv" println(csvFilePath) sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE directDictionaryTable OPTIONS" + "('DELIMITER'= ',', 'QUOTECHAR'= '\"')"); } catch { case x: Throwable => x.printStackTrace() CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) } } test("select doj from directDictionaryTable") { sql("select doj from directDictionaryTable") checkAnswer( sql("select doj from directDictionaryTable"), Seq(Row(Date.valueOf("2016-03-14")), Row(Date.valueOf("2016-04-14")), Row(null) ) ) } test("select doj from directDictionaryTable with equals filter") { sql("select doj from directDictionaryTable where doj='2016-03-14 15:00:09'") checkAnswer( sql("select doj from directDictionaryTable where doj='2016-03-14'"), Seq(Row(Date.valueOf("2016-03-14"))) ) } test("select doj from directDictionaryTable with greater than filter") { sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'") checkAnswer( sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'"), Seq(Row(Date.valueOf("2016-04-14"))) ) } override def afterAll { sql("drop table directDictionaryTable") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 35
Source File: MakingNestedTableTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.sql import org.apache.spark.sql.Row import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.hive.HiveContext import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} class MakingNestedTableTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll { @transient var sc: SparkContext = null @transient var hiveContext: HiveContext = null override def beforeAll(): Unit = { val envMap = Map[String, String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) hiveContext = new HiveContext(sc) } override def afterAll(): Unit = { sc.stop() } test("Test table creation and summing of counts") { val loanRDD = sc.parallelize(Seq(Row("100", "100000000"), Row("101", "100000000"), Row("102", "100000000"))) val partiesRDD = sc.parallelize(Seq(Row("100", "ted", "42"), Row("101", "bob", "42"), Row("101", "cat", "42"), Row("102", "Jen", "42"), Row("102", "Jenny", "42"), Row("102", "Ed", "42"))) //loan hiveContext.sql("create table loan (id string, amount string) stored as parquet") val emptyLoanDF = hiveContext.sql("select * from loan limit 0") val loanDF = hiveContext.createDataFrame(loanRDD, emptyLoanDF.schema) loanDF.registerTempTable("loanTmp") hiveContext.sql("insert into loan select * from loanTmp") //parties hiveContext.sql("create table party (loan_id string, name string, age string) stored as parquet") val emptyPartyDF = hiveContext.sql("select * from party limit 0") val partyDF = hiveContext.createDataFrame(partiesRDD, emptyPartyDF.schema) partyDF.registerTempTable("partyTmp") hiveContext.sql("insert into party select * from partyTmp") val keyValueParty = hiveContext.sql("select * from party").map(r => { //Key Value (r.getString(r.fieldIndex("loan_id")), Seq(r)) }).reduceByKey((a, b) => { a ++ b }) val keyValueLoan = hiveContext.sql("select * from loan").map(r => { //Key Value (r.getString(r.fieldIndex("id")), r.getString(r.fieldIndex("amount"))) }) val nestedRDD = keyValueLoan.join(keyValueParty).map(r => { val loanId = r._1 val loanAmount = r._2._1 val seqOfParties = r._2._2.map(r => { Row(r.getString(r.fieldIndex("name")), r.getString(r.fieldIndex("age"))) }) Row(loanId, loanAmount, seqOfParties) }) hiveContext.sql("create table nested (" + "loan_id string, " + "amount string, " + "party array<struct<" + " name: string," + " age: string>>" + ") stored as parquet") val emptyNestedDF = hiveContext.sql("select * from nested limit 0") val nestedDF = hiveContext.createDataFrame(nestedRDD, emptyNestedDF.schema) nestedDF.registerTempTable("nestedTmp") hiveContext.sql("insert into nested select * from nestedTmp") } }
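Once the nested table is populated, the array of party structs can be flattened back out with a lateral view. A minimal sketch, assuming the nested table created in the test above exists in the current metastore:

// Explode the party array so each loan/party pair becomes one row.
hiveContext.sql(
  "select loan_id, amount, p.name, p.age " +
  "from nested lateral view explode(party) parties as p").show()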
Example 36
Source File: KuduAccountMartSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAccountMartSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAccountMartTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAccountMartTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAccountMartTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("account_mart_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(win_count) from account_mart_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 37
Source File: KuduToNestedHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import com.hadooparchitecturebook.taxi360.model.NyTaxiYellowTripBuilder import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToNestedHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiNestedTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiNestedTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") val kuduDataDf = hiveContext.sql("select * from ny_taxi_trip_tmp") val newNestedDf = kuduDataDf.map(r => { val pojo = NyTaxiYellowTripBuilder.build(r) (pojo.vender_id, pojo) }).groupByKey().map(grp => { Row(grp._1, grp._2.map(p => { Row(p.passenger_count, p.payment_type, p.total_amount, p.fare_amount) })) }) hiveContext.sql("create table " + hdfsTaxiNestedTableName + "( " + " vender_id string," + " trip array<struct< " + " passenger_count: INT," + " payment_type: STRING, " + " total_amount: DOUBLE, " + " fare_amount: DOUBLE " + " >>" + " ) stored as parquet") val emptyDf = hiveContext.sql("select * from " + hdfsTaxiNestedTableName + " limit 0") hiveContext.createDataFrame(newNestedDf, emptyDf.schema).registerTempTable("tmpNested") hiveContext.sql("insert into " + hdfsTaxiNestedTableName + " select * from tmpNested") sc.stop() } }
Example 38
Source File: KuduToHDFS.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduToHDFS { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduTaxiTripTableName> " + "<hdfsTaxiTripTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduTaxiTripTableName = args(2) val hdfsTaxiTripTableName = args(3) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduTaxiTripTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("kuduTaxiTripTableName") hiveContext.sql("CREATE TABLE " + hdfsTaxiTripTableName + " " + " STORED AS PARQUET " + " AS SELECT * FROM kuduTaxiTripTableName") sc.stop() } }
Example 39
Source File: KuduAppEventSimpleSums.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.sql.kudu import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} object KuduAppEventSimpleSums { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> <kuduMaster> " + "<kuduAppEventTableName> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val kuduAppEventTableName = args(2) val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val hiveContext = new HiveContext(sc) val kuduOptions = Map( "kudu.table" -> kuduAppEventTableName, "kudu.master" -> kuduMaster) hiveContext.read.options(kuduOptions).format("org.kududb.spark.kudu").load. registerTempTable("app_event_tmp") println("------------") val values = hiveContext.sql("select account_id, sum(purchase) from app_event_tmp group by account_id"). take(100) println("------------") values.foreach(println) println("------------") sc.stop() } }
Example 40
Source File: WithSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark import java.util.Locale import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveContext import org.scalatest.{BeforeAndAfterEach, Suite} trait WithSQLContext extends BeforeAndAfterEach { self: Suite with WithSparkContext => override def beforeEach(): Unit = { try { super.beforeEach() setUpSQLContext() } catch { case ex: Throwable => tearDownSQLContext() throw ex } } override def afterEach(): Unit = { try { super.afterEach() } finally { tearDownSQLContext() } } implicit def sqlContext: SQLContext = _sqlContext def sqlc: SQLContext = sqlContext var _sqlContext: SQLContext = _ protected def setUpSQLContext(): Unit = _sqlContext = SQLContext.getOrCreate(sc).newSession() protected def tearDownSQLContext(): Unit = _sqlContext = null protected def tableName(name: String): String = sqlc match { case _: HiveContext => name.toLowerCase(Locale.ENGLISH) case _ => name } }
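A concrete suite would mix this trait in together with the WithSparkContext trait named in its self-type. A minimal hypothetical sketch; WithSparkContext is assumed to provide the SparkContext sc and is not shown in this excerpt:

import org.apache.spark.sql.hive.HiveContext
import org.scalatest.FunSuite

// Hypothetical test built on the WithSQLContext helper above.
class TableNameSuite extends FunSuite with WithSparkContext with WithSQLContext {
  test("tableName lower-cases names only when a HiveContext is in use") {
    val expected = if (sqlc.isInstanceOf[HiveContext]) "mytable" else "MyTable"
    assert(tableName("MyTable") === expected)
  }
}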
Example 41
Source File: SapSQLEnv.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.sap.thriftserver import java.io.PrintStream import org.apache.spark.scheduler.StatsReportListener import org.apache.spark.sql.hive.{HiveContext, SapHiveContext} import org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver import org.apache.spark.sql.hive.thriftserver.SparkSQLEnv._ import org.apache.spark.util.Utils import org.apache.spark.{Logging, SparkConf, SparkContext} import scala.collection.JavaConversions._ object SapSQLEnv extends Logging { def init() { logDebug("Initializing SapSQLEnv") if (hiveContext == null) { logInfo("Creating SapSQLContext") val sparkConf = new SparkConf(loadDefaults = true) val maybeSerializer = sparkConf.getOption("spark.serializer") val maybeKryoReferenceTracking = sparkConf.getOption("spark.kryo.referenceTracking") // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] instead of // the default appName [SparkSQLCLIDriver] in cli or beeline. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SparkSQLCLIDriver].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.serializer", maybeSerializer.getOrElse("org.apache.spark.serializer.KryoSerializer")) .set("spark.kryo.referenceTracking", maybeKryoReferenceTracking.getOrElse("false")) sparkContext = new SparkContext(sparkConf) sparkContext.addSparkListener(new StatsReportListener()) hiveContext = new SapHiveContext(sparkContext) hiveContext.metadataHive.setOut(new PrintStream(System.out, true, "UTF-8")) hiveContext.metadataHive.setInfo(new PrintStream(System.err, true, "UTF-8")) hiveContext.metadataHive.setError(new PrintStream(System.err, true, "UTF-8")) hiveContext.setConf("spark.sql.hive.version", HiveContext.hiveExecutionVersion) if (log.isDebugEnabled) { hiveContext.hiveconf.getAllProperties.toSeq.sorted.foreach { case (k, v) => logDebug(s"HiveConf var: $k=$v") } } } } }
Example 42
Source File: SapThriftServer.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import org.apache.commons.logging.LogFactory import org.apache.spark.Logging import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.sap.thriftserver.SapSQLEnv import org.apache.spark.sql.hive.thriftserver.HiveThriftServer2._ import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab import org.apache.hive.service.server.HiveServerServerOptionsProcessor object SapThriftServer extends Logging { var LOG = LogFactory.getLog(classOf[SapThriftServer]) def main(args: Array[String]) { val optionsProcessor = new HiveServerServerOptionsProcessor("SapThriftServer") if (!optionsProcessor.process(args)) { System.exit(-1) } logInfo("Starting SparkContext") SapSQLEnv.init() org.apache.spark.util.ShutdownHookManager.addShutdownHook { () => SparkSQLEnv.stop() uiTab.foreach(_.detach()) } try { val server = new HiveThriftServer2(SparkSQLEnv.hiveContext) server.init(SparkSQLEnv.hiveContext.hiveconf) server.start() logInfo("SapThriftServer started") listener = new HiveThriftServer2Listener(server, SparkSQLEnv.hiveContext.conf) SparkSQLEnv.sparkContext.addSparkListener(listener) uiTab = if (SparkSQLEnv.sparkContext.getConf.getBoolean("spark.ui.enabled", true)) { Some(new ThriftServerTab(SparkSQLEnv.sparkContext)) } else { None } } catch { case e: Exception => logError("Error starting SapThriftServer", e) System.exit(-1) } } } private[hive] class SapThriftServer(val hiveContext: HiveContext) extends Logging{ def start: Unit = { logInfo("ThriftServer with SapSQLContext") logInfo("Starting SparkContext") HiveThriftServer2.startWithContext(hiveContext) } }
Example 43
Source File: L8-38SparkR.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import java.nio.file.Paths import org.apache.spark.SparkFiles object CdrStreamingSparkRApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 7) { System.err.println( "Usage: CdrStreamingSparkRApp <appname> <batchInterval> <hostname> <port> <tableName> <RScriptPath> <RScriptLogsPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, tableName, rScriptPath, logsPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ ssc.sparkContext.addFile(rScriptPath) val rScriptName = SparkFiles.get(Paths.get(rScriptPath).getFileName.toString) val master = hiveC.sparkContext.getConf.get("spark.master") val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD((rdd, time) => { val iTableName = tableName + time.milliseconds seqToCdr(rdd).toDF().write.saveAsTable(iTableName) hiveC.sparkContext.parallelize(Array(iTableName)).pipe("%s %s".format(rScriptName, master)).saveAsTextFile(Paths.get(logsPath, iTableName).toString) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 44
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.desc import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr import org.json4s.DefaultFormats object CdrDataframeExamplesActionsApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) counts.show(5) counts.show() println("head(5): " + counts.head(5)) println("take(5): " + counts.take(5)) println("head(): " + counts.head()) println("first(5): " + counts.first()) println("count(): " + counts.count()) println("collect(): " + counts.collect()) println("collectAsList(): " + counts.collectAsList()) println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) counts.write.format("parquet").save("/tmp/parquent" + rdd.id) counts.write.format("json").save("/tmp/json" + rdd.id) counts.write.parquet("/tmp/parquent2" + rdd.id) counts.write.json("/tmp/json2" + rdd.id) counts.write.saveAsTable("count_table") cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") val prop: java.util.Properties = new java.util.Properties() counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 45
Source File: L8-13HiveQL.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrHiveqlApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrHiveqlApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { seqToCdr(rdd).toDF().registerTempTable("cdrs") hiveC.sql("SET DATE_FMT='yy-MM-dd|HH'") hiveC.sql("SELECT from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) AS TS, SUM(smsInActivity + smsOutActivity + callInActivity + callOutActivity + internetTrafficActivity) AS Activity FROM cdrs GROUP BY from_unixtime(timeInterval, ${hiveconf:DATE_FMT}) ORDER BY Activity DESC").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
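The query above relies on Hive variable substitution: the SET statement stores the quoted format string under DATE_FMT, and Hive expands ${hiveconf:DATE_FMT} before execution, so the format never has to be concatenated into the Scala code. A minimal sketch of the same mechanism, assuming the hiveC HiveContext above and a placeholder table events with an epoch-seconds column ts:

// Hive expands ${hiveconf:FMT}, quotes included, before parsing the query.
hiveC.sql("SET FMT='yyyy-MM-dd|HH'")
hiveC.sql("SELECT from_unixtime(ts, ${hiveconf:FMT}) AS bucket, COUNT(*) AS n " +
  "FROM events GROUP BY from_unixtime(ts, ${hiveconf:FMT})").show()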
Example 46
Source File: BadRecordPathLoadOptionTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.carbondata import org.apache.spark.sql.CarbonEnv import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.test.util.QueryTest import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.datastore.filesystem.{CarbonFile, CarbonFileFilter} import org.apache.carbondata.core.datastore.impl.FileFactory import org.apache.carbondata.core.util.CarbonProperties class BadRecordPathLoadOptionTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { sql("drop table IF EXISTS salestest") } test("data load log file and csv file written at the configured location") { sql( s"""CREATE TABLE IF NOT EXISTS salestest(ID BigInt, date Timestamp, country String, actual_price Double, Quantity int, sold_price Decimal(19,2)) STORED AS carbondata TBLPROPERTIES('BAD_RECORD_PATH'='$warehouse')""") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/badrecords/datasample.csv" sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE salestest OPTIONS" + "('bad_records_logger_enable'='true','bad_records_action'='redirect', 'DELIMITER'=" + " ',', 'QUOTECHAR'= '\"')") val location: Boolean = isFilesWrittenAtBadStoreLocation assert(location) } override def afterAll { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) sql("drop table salestest") } def isFilesWrittenAtBadStoreLocation: Boolean = { val badStorePath = CarbonEnv.getCarbonTable(Some("default"), "salestest")(sqlContext.sparkSession).getTableInfo .getFactTable.getTableProperties.get("bad_record_path") + "/0/0" val carbonFile: CarbonFile = FileFactory.getCarbonFile(badStorePath) var exists: Boolean = carbonFile.exists() if (exists) { val listFiles: Array[CarbonFile] = carbonFile.listFiles(new CarbonFileFilter { override def accept(file: CarbonFile): Boolean = { if (file.getName.endsWith(".log") || file.getName.endsWith(".csv")) { return true; } return false; } }) exists = listFiles.size > 0 } return exists; } }
Example 47
Source File: DateDataTypeNullDataTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.sql.Date import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class DateDataTypeNullDataTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { sql( """CREATE TABLE IF NOT EXISTS timestampTyeNullData (ID Int, dateField date, country String, name String, phonetype String, serialname String, salary Int) STORED AS carbondata""" ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/datasamplenull.csv" sql("LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE timestampTyeNullData").collect(); } catch { case x: Throwable => x.printStackTrace() CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) } } test("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null") { checkAnswer( sql("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null"), Seq(Row(Date.valueOf("2015-07-23")) ) ) } test("SELECT * FROM timestampTyeNullData where dateField is null") { checkAnswer( sql("SELECT dateField FROM timestampTyeNullData where dateField is null"), Seq(Row(null) )) } override def afterAll { sql("drop table timestampTyeNullData") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, CarbonCommonConstants.CARBON_DATE_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 48
Source File: TimestampDataTypeNullDataTest.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.io.File import java.sql.Timestamp import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.keygenerator.directdictionary.timestamp.TimeStampGranularityConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class TimestampDataTypeNullDataTest extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { try { CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_CUTOFF_TIMESTAMP, "2000-12-13 02:10.00.0") CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_TIME_GRANULARITY, TimeStampGranularityConstants.TIME_GRAN_SEC.toString ) sql( """CREATE TABLE IF NOT EXISTS timestampTyeNullData (ID Int, dateField Timestamp, country String, name String, phonetype String, serialname String, salary Int) STORED AS carbondata""" ) CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, "yyyy/MM/dd") val csvFilePath = s"$resourcesPath/datasamplenull.csv" sql("LOAD DATA LOCAL INPATH '" + csvFilePath + "' INTO TABLE timestampTyeNullData").collect(); } catch { case x: Throwable => CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) } } test("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null") { checkAnswer( sql("SELECT max(dateField) FROM timestampTyeNullData where dateField is not null"), Seq(Row(Timestamp.valueOf("2015-07-23 00:00:00.0")) ) ) } test("SELECT * FROM timestampTyeNullData where dateField is null") { checkAnswer( sql("SELECT dateField FROM timestampTyeNullData where dateField is null"), Seq(Row(null) )) } override def afterAll { sql("drop table timestampTyeNullData") CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT, CarbonCommonConstants.CARBON_TIMESTAMP_DEFAULT_FORMAT) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 49
Source File: TimestampDataTypeDirectDictionaryWithNoDictTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.directdictionary import java.sql.Timestamp import org.apache.spark.sql.Row import org.apache.spark.sql.hive.HiveContext import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.keygenerator.directdictionary.timestamp.TimeStampGranularityConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class TimestampDataTypeDirectDictionaryWithNoDictTestCase extends QueryTest with BeforeAndAfterAll { var hiveContext: HiveContext = _ override def beforeAll { CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_CUTOFF_TIMESTAMP, "2000-12-13 02:10.00.0") CarbonProperties.getInstance() .addProperty(TimeStampGranularityConstants.CARBON_TIME_GRANULARITY, TimeStampGranularityConstants.TIME_GRAN_SEC.toString ) CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "true") sql( """ CREATE TABLE IF NOT EXISTS directDictionaryTable (empno String, doj Timestamp, salary Int) STORED AS carbondata""" ) val csvFilePath = s"$resourcesPath/datasample.csv" sql("LOAD DATA local inpath '" + csvFilePath + "' INTO TABLE directDictionaryTable OPTIONS" + "('DELIMITER'= ',', 'QUOTECHAR'= '\"')") } test("select doj from directDictionaryTable") { checkAnswer( sql("select doj from directDictionaryTable"), Seq(Row(Timestamp.valueOf("2016-03-14 15:00:09.0")), Row(Timestamp.valueOf("2016-04-14 15:00:09.0")), Row(null) ) ) } test("select doj from directDictionaryTable with equals filter") { checkAnswer( sql("select doj from directDictionaryTable where doj='2016-03-14 15:00:09'"), Seq(Row(Timestamp.valueOf("2016-03-14 15:00:09"))) ) } test("select doj from directDictionaryTable with greater than filter") { checkAnswer( sql("select doj from directDictionaryTable where doj>'2016-03-14 15:00:09'"), Seq(Row(Timestamp.valueOf("2016-04-14 15:00:09"))) ) } override def afterAll { sql("drop table directDictionaryTable") CarbonProperties.getInstance().addProperty("carbon.direct.dictionary", "false") } }
Example 50
Source File: ModelDebug.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training.pipeline import com.airbnb.aerosolve.core.ModelRecord import com.airbnb.aerosolve.core.function.{AbstractFunction, MultiDimensionSpline} import com.airbnb.aerosolve.core.util.Util import com.airbnb.aerosolve.training.pipeline.HiveUtil import com.typesafe.config.Config import org.apache.spark.SparkContext import org.apache.spark.sql.hive.HiveContext import org.slf4j.{Logger, LoggerFactory} import scala.util.Try object ModelDebug { val log: Logger = LoggerFactory.getLogger("DebugPipeline") def modelRecordToString(x: ModelRecord) : String = { if (x.weightVector != null && !x.weightVector.isEmpty) { val func = AbstractFunction.buildFunction(x) val tolerance = func.smooth(0, false) val tolerancePercentage = func.smooth(0, true) val nDTreeModelString: String = if (x.ndtreeModel != null) { val w = func.asInstanceOf[MultiDimensionSpline].getWeightsString x.ndtreeModel.toString + w } else { "" } s"%s\u0001%s\u0001%f\u0001%f\u0001%s\u0001%s\u0001%f\u0001%f\u0001%f".format( x.featureFamily, x.featureName, x.minVal, x.maxVal, x.weightVector.toString, nDTreeModelString, 0.0, tolerance, tolerancePercentage) } else { log.info(s" ${x.featureFamily} ${x.featureName} miss weightVector") "" } } def dumpModelForHive(sc: SparkContext, config: Config) = { val cfg = config.getConfig("dump_model_to_hive") dumpModel(sc, cfg, x => x.featureName != null, modelRecordToString) } def dumpModel(sc: SparkContext, config: Config, filterFunction: (ModelRecord) => Boolean, recordToString: (ModelRecord) => String): Unit = { val modelName = config.getString("model_name") val modelDump = config.getString("model_dump") val outputHiveTable = Try(config.getString("output_hive_table")).getOrElse("") val overwrite: Boolean = Try(config.getBoolean("overwrite")).getOrElse(false) val model = sc .textFile(modelName) .map(Util.decodeModel) .filter(filterFunction) .map(recordToString) .filter(_.length > 0) PipelineUtil.saveAndCommitAsTextFile(model, modelDump, overwrite) if (!outputHiveTable.isEmpty) { val hiveContext = new HiveContext(sc) val partitionKey = config.getString("partition_key") val partitionValue = config.getString("partition_value") // assume the value of partition key is string HiveUtil.updateHivePartition( hiveContext, outputHiveTable, s"$partitionKey='$partitionValue'", modelDump ) } } }
Example 51
Source File: DataLoader.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark import com.typesafe.config.ConfigFactory import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} import org.rogach.scallop.ScallopConf import org.apache.kudu.spark.kudu._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DataType, StructField, StructType} object DataLoader { class RunConf(args:Array[String]) extends ScallopConf(args){ val csvFile =opt[String]("csvFile",required = true, descr = "A CSV file to load" ) val tableName =opt[String]("tableName",required = true, descr = "A table to load" ) val storageType = opt[String]("storageType",required = true, descr = "Storage type parquet|orc|kudu|carbon") val dbName =opt[String]("dbName",required = true, descr = "Database name" ) verify() } def main(args: Array[String]): Unit = { val runConf = new RunConf(args) val scConf = new SparkConf() .setAppName("DataLoader") val sc = new SparkContext(scConf) val sqlContext = new HiveContext(sc) if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") { val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .option("inferSchema", "true") // Automatically infer data types .load(runConf.csvFile()) .repartition(10) df.registerTempTable("temp_csv") sqlContext.sql( s""" |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()} |SELECT * FROM temp_csv """.stripMargin) } if(runConf.storageType().toLowerCase() == "kudu"){ val confFile = ConfigFactory.load() val kuduMaster = confFile.getString("kudu.master.server") val kuduContext = new KuduContext(kuduMaster) val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .schema(dfTarget.schema) .load(runConf.csvFile()) .repartition(10) kuduContext.upsertRows(df,runConf.tableName()) } } private def synSchemas(inSchema:StructType, outSchema:StructType) = { val size = inSchema.fields.length val structFields = (0 to size - 1).map{ i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable) } new StructType(structFields.toArray) } }
Example 52
Source File: SamplesGenerator.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark.generation import java.lang import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.{Row, SQLContext} import pl.edu.pw.ii.zsibio.dwh.benchmark.generation.model.GeneratedSample import pl.edu.pw.ii.zsibio.dwh.benchmark.utils.Probability._ import scala.util.Random val population = sqlContext.sql( s""" |SELECT |population, |geo_id as id, |geo_country_name_en as countryName, |geo_region_name_en as region |FROM |${config.countryPopulation} """.stripMargin) val populationDist = population.map { case Row(population: Int, id: Int, countryName: String, region: String) => (region, (population.toLong, id)) }.groupByKey().collect().toMap def selectCountry(region: String): Int = { populationDist(region).selectWithProbability() } sc.parallelize(1 to n) .map((_, selectRegion())) .map(row => GeneratedSample(row._1, selectCountry(row._2.regionName), selectDisease(), row._2.afColumn)) } def selectDisease() = { val r = Math.random() val ret: java.lang.Long = if (r <= 0.05) (Random.nextInt(60) * 100).toLong else new lang.Long(-1) ret } def selectRegion(): RegionConfig = { val percentages = Seq((config.africaConfig.percent, config.africaConfig) , (config.americasConfig.percent, config.americasConfig) , (config.europaConfig.percent, config.europaConfig) , (config.finnishConfig.percent, config.finnishConfig) , (config.southAsianConfig.percent, config.southAsianConfig) , (config.westAsianConfig.percent, config.westAsianConfig) ) percentages.selectWithProbability() } }
Example 53
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService} import com.kakao.mango.text.ThreadSafeDateFormat import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Time import org.apache.spark.streaming.dstream.DStream import java.util.concurrent.{Future => JFuture} import scala.reflect.runtime.universe.TypeTag object SavingStream { val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd") val hh = ThreadSafeDateFormat("HH") val mm = ThreadSafeDateFormat("mm") val m0 = (ms: Long) => mm(ms).charAt(0) + "0" } @transient var executor: RichExecutorService = _ def ex: RichExecutorService = { if (executor == null) { this.synchronized { if (executor == null) { executor = new RichExecutorService(es.get()) } } } executor } def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = { stream.foreachRDD { (rdd, time) => ex.submit { toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*) } } } def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms)) } } def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms)) } } def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms)) } } def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = { saveAsPartitionedTable(table, path, format) { time => val ms = time.milliseconds Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms)) } } } class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) { override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd) } class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) { override def toDF(rdd: RDD[String]) = ctx.read.json(rdd) } class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) { import com.kakao.mango.json._ override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson)) } class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) { override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema) }
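These wrappers are meant to be attached to an existing DStream and pointed at a partitioned external table. A minimal hypothetical sketch, assuming a StreamingContext named ssc and that the implicit HiveContext and ExecutorSupplier required by the constructors are already in scope (both are project-specific and not shown here):

import org.apache.spark.streaming.dstream.DStream

// Hypothetical usage: persist a JSON text stream into a daily-partitioned ORC table.
// The socket address, table name and path are placeholders.
val events: DStream[String] = ssc.socketTextStream("localhost", 9999)
new JsonStream(events).saveAsDailyPartitionedTable(
  table = "events_log", path = "/warehouse/events_log", format = "orc")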
Example 54
Source File: UdtfEnableQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} import org.apache.spark.streaming.dstream.ConstantInputDStream object UdtfEnabledQuery { case class People(name: String, items: Array[String]) def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val hiveContext = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hiveContext) import hiveContext.implicits._ import streamSqlContext.createSchemaDStream val dummyRDD = sc.makeRDD(1 to 3).map(i => People(s"jack$i", Array("book", "gun"))) val dummyStream = new ConstantInputDStream[People](ssc, dummyRDD) streamSqlContext.registerDStreamAsTable(dummyStream, "people") streamSqlContext.sql( """SELECT | name, | item |FROM | people | lateral view explode(items) items AS item""".stripMargin).map(_.copy()).print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 55
Source File: UdafEnableQuery.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.examples import scala.collection.mutable.ListBuffer import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.streaming.{Duration, StreamingContext} import org.apache.spark.streaming.dstream.ConstantInputDStream object UdafEnabledQuery { case class Data(name: String, money: Int) def main(args: Array[String]): Unit = { val ssc = new StreamingContext("local[10]", "test", Duration(3000)) val sc = ssc.sparkContext val hiveContext = new HiveContext(sc) val streamSQlContext = new StreamSQLContext(ssc, hiveContext) val dummyRDD = sc.makeRDD(1 to 10).map(i => Data(s"jack$i", i)) val dummyStream = new ConstantInputDStream[Data](ssc, dummyRDD) val schemaStream = streamSQlContext.createSchemaDStream(dummyStream) streamSQlContext.registerDStreamAsTable(schemaStream, "data") val resultList = ListBuffer[String]() streamSQlContext.sql( """SELECT | percentile(money,0.8), | stddev_pop(money) |FROM data """.stripMargin).map(_.copy()).print() ssc.start() ssc.awaitTerminationOrTimeout(30 * 1000) ssc.stop() } }
Example 56
Source File: StreamHQL.scala From spark-cep with Apache License 2.0 | 5 votes |
import java.util.Properties import kafka.consumer.ConsumerConfig import org.I0Itec.zkclient.ZkClient import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.streaming.StreamSQLContext import org.apache.spark.sql.streaming.sources.MessageDelimiter import org.apache.spark.streaming.dstream.ConstantInputDStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} import redis.RedisManager import scala.util.parsing.json.JSON class TabDelimiter extends MessageDelimiter { override val delimiter = "\t" } object StreamDDL { def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val query = args(0) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val streamSqlContext = new StreamSQLContext(ssc, new HiveContext(sc)) streamSqlContext.command(query) new ConstantInputDStream[Int](ssc, sc.parallelize(Seq(1))).print ssc.start() ssc.awaitTerminationOrTimeout(100) ssc.stop() } } object StreamHQL { object Redis { var initialized = false var manager: RedisManager = _ def init(confMap: Map[String, String]) { if (initialized == false) { manager = new RedisManager( confMap("redis.shards"), confMap("redis.sentinels"), confMap("redis.database").toInt) manager.init initialized = true } } } def removeConsumerGroup(zkQuorum: String, groupId: String) { val properties = new Properties() properties.put("zookeeper.connect", zkQuorum) properties.put("group.id", groupId) val conf = new ConsumerConfig(properties) val zkClient = new ZkClient(conf.zkConnect) zkClient.deleteRecursive(s"/consumers/${conf.groupId}") zkClient.close() } def main(args: Array[String]): Unit = { Logger.getRootLogger.setLevel(Level.WARN) val confMap = JSON.parseFull(args(0)).get.asInstanceOf[Map[String, String]] val qid = args(1) val query = args(2) val sc = new SparkContext(new SparkConf()) val ssc = new StreamingContext(sc, Seconds(1)) val hc = new HiveContext(sc) val streamSqlContext = new StreamSQLContext(ssc, hc) val redisExpireSec = confMap("redis.expire.sec").toInt ssc.checkpoint(s"checkpoint/$qid") hc.setConf("spark.streaming.query.id", qid) hc.setConf("spark.sql.shuffle.partitions", confMap("spark.sql.shuffle.partitions")) removeConsumerGroup(confMap("kafka.zookeeper.quorum"), qid) val result = streamSqlContext.sql(query) val schema = result.schema result.foreachRDD((rdd, time) => { rdd.foreachPartition(partition => { Redis.init(confMap) val jedis = Redis.manager.getResource val pipe = jedis.pipelined partition.foreach(record => { val seq = record.toSeq(schema) val ts = time.milliseconds / 1000 val hkey = seq.take(seq.size - 1).mkString(".") pipe.hset(qid + "." + ts, hkey, seq(seq.size - 1).toString) pipe.expire(qid + "." + ts, redisExpireSec) }) pipe.sync Redis.manager.returnResource(jedis) }) }) ssc.start() ssc.awaitTermination() ssc.stop() } }
Example 57
Source File: HiveApp.scala From iolap with Apache License 2.0 | 5 votes |
package main.scala import scala.collection.mutable.{ListBuffer, Queue} import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext case class Person(name: String, age: Int) object SparkSqlExample { def main(args: Array[String]) { val conf = sys.env.get("SPARK_AUDIT_MASTER") match { case Some(master) => new SparkConf().setAppName("Simple Sql App").setMaster(master) case None => new SparkConf().setAppName("Simple Sql App") } val sc = new SparkContext(conf) val hiveContext = new HiveContext(sc) import hiveContext._ sql("DROP TABLE IF EXISTS src") sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING)") sql("LOAD DATA LOCAL INPATH 'data.txt' INTO TABLE src") val results = sql("FROM src SELECT key, value WHERE key >= 0 AND KEY < 5").collect() results.foreach(println) def test(f: => Boolean, failureMsg: String) = { if (!f) { println(failureMsg) System.exit(-1) } } test(results.size == 5, "Unexpected number of selected elements: " + results) println("Test succeeded") sc.stop() } }
Example 58
Source File: DescribeHiveTableCommand.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.collection.JavaConversions._ import org.apache.hadoop.hive.metastore.api.FieldSchema import org.apache.spark.annotation.DeveloperApi import org.apache.spark.sql.catalyst.expressions.{Attribute, Row} import org.apache.spark.sql.execution.{SparkPlan, RunnableCommand} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation} import org.apache.spark.sql.hive.HiveShim import org.apache.spark.sql.SQLContext private[hive] case class DescribeHiveTableCommand( table: MetastoreRelation, override val output: Seq[Attribute], isExtended: Boolean) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { // Trying to mimic the format of Hive's output. But not exactly the same. var results: Seq[(String, String, String)] = Nil val columns: Seq[FieldSchema] = table.hiveQlTable.getCols val partitionColumns: Seq[FieldSchema] = table.hiveQlTable.getPartCols results ++= columns.map(field => (field.getName, field.getType, field.getComment)) if (partitionColumns.nonEmpty) { val partColumnInfo = partitionColumns.map(field => (field.getName, field.getType, field.getComment)) results ++= partColumnInfo ++ Seq(("# Partition Information", "", "")) ++ Seq((s"# ${output.get(0).name}", output.get(1).name, output.get(2).name)) ++ partColumnInfo } if (isExtended) { results ++= Seq(("Detailed Table Information", table.hiveQlTable.getTTable.toString, "")) } results.map { case (name, dataType, comment) => Row(name, dataType, comment) } } }
Example 59
Source File: CreateTableAsSelect.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import org.apache.spark.annotation.Experimental import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.expressions.Row import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.hive.client.{HiveTable, HiveColumn} import org.apache.spark.sql.hive.{HiveContext, MetastoreRelation, HiveMetastoreTypes} private[hive] case class CreateTableAsSelect( tableDesc: HiveTable, query: LogicalPlan, allowExisting: Boolean) extends RunnableCommand { def database: String = tableDesc.database def tableName: String = tableDesc.name override def run(sqlContext: SQLContext): Seq[Row] = { val hiveContext = sqlContext.asInstanceOf[HiveContext] lazy val metastoreRelation: MetastoreRelation = { import org.apache.hadoop.hive.serde2.`lazy`.LazySimpleSerDe import org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat import org.apache.hadoop.io.Text import org.apache.hadoop.mapred.TextInputFormat val withSchema = tableDesc.copy( schema = query.output.map(c => HiveColumn(c.name, HiveMetastoreTypes.toMetastoreType(c.dataType), null)), inputFormat = tableDesc.inputFormat.orElse(Some(classOf[TextInputFormat].getName)), outputFormat = tableDesc.outputFormat .orElse(Some(classOf[HiveIgnoreKeyTextOutputFormat[Text, Text]].getName)), serde = tableDesc.serde.orElse(Some(classOf[LazySimpleSerDe].getName()))) hiveContext.catalog.client.createTable(withSchema) // Get the Metastore Relation hiveContext.catalog.lookupRelation(Seq(database, tableName), None) match { case r: MetastoreRelation => r } } // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. if (hiveContext.catalog.tableExists(Seq(database, tableName))) { if (allowExisting) { // table already exists, will do nothing, to keep consistent with Hive } else { throw new AnalysisException(s"$database.$tableName already exists.") } } else { hiveContext.executePlan(InsertIntoTable(metastoreRelation, Map(), query, true, false)).toRdd } Seq.empty[Row] } override def argString: String = { s"[Database:$database, TableName: $tableName, InsertIntoHiveTable]\n" + query.toString } }
Example 60
Source File: Main.scala From iolap with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.hive.HiveContext object Main { def main(args: Array[String]) { println("Running regression test for SPARK-8489.") val sc = new SparkContext("local", "testing") val hc = new HiveContext(sc) // This line should not throw scala.reflect.internal.MissingRequirementError. // See SPARK-8470 for more detail. val df = hc.createDataFrame(Seq(MyCoolClass("1", "2", "3"))) df.collect() println("Regression test for SPARK-8489 success!") sc.stop() } }
Example 61
Source File: OrcTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.hive.test.TestHive import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql._ private[sql] trait OrcTest extends SQLTestUtils { protected def hiveContext = sqlContext.asInstanceOf[HiveContext] import sqlContext.sparkContext import sqlContext.implicits._ protected def withOrcTable[T <: Product: ClassTag: TypeTag] (data: Seq[T], tableName: String) (f: => Unit): Unit = { withOrcDataFrame(data) { df => hiveContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } protected def makeOrcFile[T <: Product: ClassTag: TypeTag]( data: Seq[T], path: File): Unit = { data.toDF().write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath) } protected def makeOrcFile[T <: Product: ClassTag: TypeTag]( df: DataFrame, path: File): Unit = { df.write.format("orc").mode(SaveMode.Overwrite).save(path.getCanonicalPath) } }
Example 62
Source File: SparkSQLCLIService.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.IOException
import java.util.{List => JList}
import javax.security.auth.login.LoginException

import scala.collection.JavaConversions._

import org.apache.commons.logging.Log
import org.apache.hadoop.hive.conf.HiveConf
import org.apache.hadoop.hive.shims.ShimLoader
import org.apache.hadoop.security.UserGroupInformation
import org.apache.hive.service.Service.STATE
import org.apache.hive.service.auth.HiveAuthFactory
import org.apache.hive.service.cli._
import org.apache.hive.service.{AbstractService, Service, ServiceException}

import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._
import org.apache.spark.util.Utils

private[hive] class SparkSQLCLIService(hiveContext: HiveContext)
  extends CLIService
  with ReflectedCompositeService {

  override def init(hiveConf: HiveConf) {
    setSuperField(this, "hiveConf", hiveConf)

    val sparkSqlSessionManager = new SparkSQLSessionManager(hiveContext)
    setSuperField(this, "sessionManager", sparkSqlSessionManager)
    addService(sparkSqlSessionManager)

    var sparkServiceUGI: UserGroupInformation = null

    if (ShimLoader.getHadoopShims.isSecurityEnabled) {
      try {
        HiveAuthFactory.loginFromKeytab(hiveConf)
        sparkServiceUGI = ShimLoader.getHadoopShims.getUGIForConf(hiveConf)
        HiveThriftServerShim.setServerUserName(sparkServiceUGI, this)
      } catch {
        case e @ (_: IOException | _: LoginException) =>
          throw new ServiceException("Unable to login to kerberos with given principal/keytab", e)
      }
    }

    initCompositeService(hiveConf)
  }

  override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = {
    getInfoType match {
      case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL")
      case GetInfoType.CLI_DBMS_VER => new GetInfoValue(hiveContext.sparkContext.version)
      case _ => super.getInfo(sessionHandle, getInfoType)
    }
  }
}

private[thriftserver] trait ReflectedCompositeService { this: AbstractService =>
  def initCompositeService(hiveConf: HiveConf) {
    // Emulating `CompositeService.init(hiveConf)`
    val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList")
    serviceList.foreach(_.init(hiveConf))

    // Emulating `AbstractService.init(hiveConf)`
    invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED)
    setAncestorField(this, 3, "hiveConf", hiveConf)
    invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED)
    getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.")
  }
}
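Client-side, the getInfo override above is what answers JDBC metadata calls. A hedged sketch follows, assuming a Thrift server is already running and the Hive JDBC driver is on the classpath; the host, port, and credentials are placeholders.

import java.sql.DriverManager

object ThriftServerInfoSketch {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection("jdbc:hive2://localhost:10000/default", "user", "")
    try {
      val md = conn.getMetaData
      // Answered by SparkSQLCLIService.getInfo via CLI_DBMS_NAME and CLI_DBMS_VER.
      println(md.getDatabaseProductName)     // expected: "Spark SQL"
      println(md.getDatabaseProductVersion)  // expected: the underlying SparkContext's version
    } finally {
      conn.close()
    }
  }
}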
Example 63
Source File: SparkSQLOperationManager.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server

import java.util.{Map => JMap}

import scala.collection.mutable.Map

import org.apache.hive.service.cli._
import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager}
import org.apache.hive.service.cli.session.HiveSession

import org.apache.spark.Logging
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.thriftserver.{SparkExecuteStatementOperation, ReflectionUtils}

private[thriftserver] class SparkSQLOperationManager(hiveContext: HiveContext)
  extends OperationManager with Logging {

  val handleToOperation = ReflectionUtils
    .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation")

  val sessionToActivePool = Map[SessionHandle, String]()

  override def newExecuteStatementOperation(
      parentSession: HiveSession,
      statement: String,
      confOverlay: JMap[String, String],
      async: Boolean): ExecuteStatementOperation = synchronized {
    val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay)(
      hiveContext, sessionToActivePool)
    handleToOperation.put(operation.getHandle, operation)
    operation
  }
}
Example 64
Source File: SparkSQLEnv.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConversions._

import org.apache.spark.scheduler.StatsReportListener
import org.apache.spark.sql.hive.{HiveShim, HiveContext}
import org.apache.spark.{Logging, SparkConf, SparkContext}
import org.apache.spark.util.Utils

private[hive] object SparkSQLEnv extends Logging {

  // Shared contexts for the Thrift server; populated elsewhere in this object.
  var hiveContext: HiveContext = _
  var sparkContext: SparkContext = _

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      hiveContext = null
    }
  }
}
Example 65
Source File: AbstractSparkSQLDriver.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver

import scala.collection.JavaConversions._

import org.apache.commons.lang3.exception.ExceptionUtils
import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema}
import org.apache.hadoop.hive.ql.Driver
import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse

import org.apache.spark.Logging
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.hive.{HiveContext, HiveMetastoreTypes}

private[hive] abstract class AbstractSparkSQLDriver(
    val context: HiveContext = SparkSQLEnv.hiveContext) extends Driver with Logging {

  private[hive] var tableSchema: Schema = _
  private[hive] var hiveResponse: Seq[String] = _

  override def init(): Unit = {
  }

  private def getResultSetSchema(query: context.QueryExecution): Schema = {
    val analyzed = query.analyzed
    logDebug(s"Result Schema: ${analyzed.output}")
    if (analyzed.output.size == 0) {
      new Schema(new FieldSchema("Response code", "string", "") :: Nil, null)
    } else {
      val fieldSchemas = analyzed.output.map { attr =>
        new FieldSchema(attr.name, HiveMetastoreTypes.toMetastoreType(attr.dataType), "")
      }
      new Schema(fieldSchemas, null)
    }
  }

  override def run(command: String): CommandProcessorResponse = {
    // TODO unify the error code
    try {
      context.sparkContext.setJobDescription(command)
      val execution = context.executePlan(context.sql(command).logicalPlan)
      hiveResponse = execution.stringResult()
      tableSchema = getResultSetSchema(execution)
      new CommandProcessorResponse(0)
    }
  }

  def runWrapper(command: String): CommandProcessorResponseWrapper = try {
    val result = run(command)
    new CommandProcessorResponseWrapper(result, null)
  } catch {
    case ae: AnalysisException =>
      logDebug(s"Failed in [$command]", ae)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(ae), null), ae)
    case cause: Throwable =>
      logError(s"Failed in [$command]", cause)
      new CommandProcessorResponseWrapper(new CommandProcessorResponse(1,
        ExceptionUtils.getStackTrace(cause), null), cause)
  }

  override def close(): Int = {
    hiveResponse = null
    tableSchema = null
    0
  }

  override def getSchema: Schema = tableSchema

  override def destroy() {
    super.destroy()
    hiveResponse = null
    tableSchema = null
  }
}

private[hive] case class CommandProcessorResponseWrapper(
    rc: CommandProcessorResponse,
    cause: Throwable)
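A usage sketch of the driver API, assuming the project's concrete SparkSQLDriver subclass and an initialised SparkSQLEnv; both live outside the excerpt above, so treat those names as assumptions rather than part of the example.

package org.apache.spark.sql.hive.thriftserver

// Hypothetical harness: relies on SparkSQLEnv.init() having created the contexts
// and on a concrete SparkSQLDriver that extends AbstractSparkSQLDriver.
object DriverSketch {
  def main(args: Array[String]): Unit = {
    SparkSQLEnv.init()
    val driver = new SparkSQLDriver(SparkSQLEnv.hiveContext)
    // runWrapper never throws: failures come back as a wrapped non-zero response.
    val wrapped = driver.runWrapper("SELECT 1 AS one")
    if (wrapped.rc.getResponseCode == 0) {
      println(driver.getSchema) // column metadata captured by getResultSetSchema
    } else {
      Option(wrapped.cause).foreach(t => println(t.getMessage))
    }
    SparkSQLEnv.stop()
  }
}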