org.apache.spark.sql.SQLContext Scala Examples
The following examples show how to use org.apache.spark.sql.SQLContext.
Each example is taken from an open-source project; the source file, originating project, and license are noted above it.
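Before the project-specific examples, here is a minimal, self-contained sketch of the typical lifecycle: obtain an SQLContext from a SparkContext, turn local data into a DataFrame, and run a SQL query. It assumes a Spark 2.x build, where SQLContext is retained for backwards compatibility as a thin wrapper around SparkSession; the data and table name are invented for illustration.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object SQLContextQuickStart {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("SQLContextQuickStart")
    val sc = new SparkContext(conf)

    // getOrCreate reuses an existing SQLContext for this SparkContext if one exists
    val sqlContext = SQLContext.getOrCreate(sc)
    import sqlContext.implicits._

    // Build a small DataFrame and query it with SQL
    val df = sc.parallelize(Seq((1, "a"), (2, "b"), (3, "c"))).toDF("id", "value")
    df.registerTempTable("letters")
    sqlContext.sql("SELECT id, value FROM letters WHERE id > 1").show()

    sc.stop()
  }
}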
Example 1
Source File: DefaultSource.scala From spark-snowflake with Apache License 2.0
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory

  override def createRelation(sqlContext: SQLContext,
                              saveMode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    val params = Parameters.mergeParameters(parameters)
    // check spark version for push down
    if (params.autoPushdown) {
      SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
        sqlContext.sparkSession
      )
    }
    // pass parameters to pushdown functions
    pushdowns.setGlobalParameter(params)

    val table = params.table.getOrElse {
      throw new IllegalArgumentException(
        "For save operations you must specify a Snowflake table name with the 'dbtable' parameter"
      )
    }

    def tableExists: Boolean = {
      val conn = jdbcWrapper.getConnector(params)
      try {
        jdbcWrapper.tableExists(conn, table.toString)
      } finally {
        conn.close()
      }
    }

    val (doSave, dropExisting) = saveMode match {
      case SaveMode.Append => (true, false)
      case SaveMode.Overwrite => (true, true)
      case SaveMode.ErrorIfExists =>
        if (tableExists) {
          sys.error(
            s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
          )
        } else {
          (true, false)
        }
      case SaveMode.Ignore =>
        if (tableExists) {
          log.info(s"Table $table already exists -- ignoring save request.")
          (false, false)
        } else {
          (true, false)
        }
    }

    if (doSave) {
      val updatedParams = parameters.updated("overwrite", dropExisting.toString)
      new SnowflakeWriter(jdbcWrapper)
        .save(
          sqlContext,
          data,
          saveMode,
          Parameters.mergeParameters(updatedParams)
        )
    }

    createRelation(sqlContext, parameters)
  }

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
}
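A caller reaches this save-path of createRelation through the standard DataFrame writer API. The sketch below is an illustration only: the sfOptions keys and values are placeholders for whatever connection parameters the spark-snowflake connector expects, while SNOWFLAKE_SOURCE_SHORT_NAME and the 'dbtable' option come directly from the example above.

import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.{SaveMode, SparkSession}

object SnowflakeSaveSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SnowflakeSaveSketch").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "alpha"), (2, "beta")).toDF("ID", "NAME")

    // Hypothetical connection options -- replace with your account's real settings
    val sfOptions = Map(
      "sfURL" -> "<account>.snowflakecomputing.com",
      "sfUser" -> "<user>",
      "sfPassword" -> "<password>",
      "sfDatabase" -> "<database>",
      "sfSchema" -> "<schema>"
    )

    df.write
      .format(SNOWFLAKE_SOURCE_SHORT_NAME)  // the connector's registered short name
      .options(sfOptions)
      .option("dbtable", "MY_TABLE")        // 'dbtable' is required for save operations (see above)
      .mode(SaveMode.Overwrite)             // routed to the SaveMode.Overwrite branch above
      .save()

    spark.stop()
  }
}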
Example 2
Source File: JdbcRelationProvider.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
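This relation provider is what backs the built-in "jdbc" format. A minimal usage sketch follows; the URL, table names, and credentials are placeholders, and the option keys are Spark's documented JDBC options, which map onto the JDBCOptions fields read above.

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("JdbcSourceSketch").getOrCreate()

    // Partitioned read: partitionColumn/lowerBound/upperBound/numPartitions become JDBCPartitioningInfo
    val people = spark.read
      .format("jdbc")
      .option("url", "jdbc:postgresql://db-host:5432/mydb")  // placeholder URL
      .option("dbtable", "public.people")
      .option("user", "spark")
      .option("password", "secret")
      .option("partitionColumn", "id")
      .option("lowerBound", "0")
      .option("upperBound", "100000")
      .option("numPartitions", "8")
      .load()

    // Write-back: SaveMode.Append goes straight to saveTable() in the provider above
    people.filter("id < 1000").write
      .format("jdbc")
      .option("url", "jdbc:postgresql://db-host:5432/mydb")
      .option("dbtable", "public.people_sample")
      .option("user", "spark")
      .option("password", "secret")
      .mode(SaveMode.Append)
      .save()

    spark.stop()
  }
}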
Example 3
Source File: MLlibTestSparkContext.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.util

import java.io.File

import org.scalatest.Suite

import org.apache.spark.SparkContext
import org.apache.spark.ml.util.TempDirectory
import org.apache.spark.sql.{SparkSession, SQLContext, SQLImplicits}
import org.apache.spark.util.Utils

trait MLlibTestSparkContext extends TempDirectory { self: Suite =>
  @transient var spark: SparkSession = _
  @transient var sc: SparkContext = _
  @transient var checkpointDir: String = _

  override def beforeAll() {
    super.beforeAll()
    spark = SparkSession.builder
      .master("local[2]")
      .appName("MLlibUnitTest")
      .getOrCreate()
    sc = spark.sparkContext

    checkpointDir = Utils.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString
    sc.setCheckpointDir(checkpointDir)
  }

  override def afterAll() {
    try {
      Utils.deleteRecursively(new File(checkpointDir))
      SparkSession.clearActiveSession()
      if (spark != null) {
        spark.stop()
      }
      spark = null
    } finally {
      super.afterAll()
    }
  }

  protected object testImplicits extends SQLImplicits {
    protected override def _sqlContext: SQLContext = self.spark.sqlContext
  }
}
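To see how this trait is consumed, the sketch below shows a minimal suite that mixes it in. The suite name and the toy DataFrame are invented for illustration, and the sketch assumes it lives in the same test module so the trait (and its TempDirectory parent) is on the classpath.

import org.scalatest.FunSuite

import org.apache.spark.mllib.util.MLlibTestSparkContext

class SQLContextUsageSuite extends FunSuite with MLlibTestSparkContext {

  test("testImplicits routes toDF through the shared SQLContext") {
    import testImplicits._  // backed by self.spark.sqlContext, as defined in the trait

    val df = Seq((1.0, "a"), (2.0, "b")).toDF("value", "label")
    assert(df.count() === 2)
    assert(df.columns.toSeq === Seq("value", "label"))
  }
}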
Example 4
Source File: SparkSQLCLIService.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import java.io.IOException import java.util.{List => JList} import javax.security.auth.login.LoginException import scala.collection.JavaConverters._ import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.shims.Utils import org.apache.hadoop.security.UserGroupInformation import org.apache.hive.service.{AbstractService, Service, ServiceException} import org.apache.hive.service.Service.STATE import org.apache.hive.service.auth.HiveAuthFactory import org.apache.hive.service.cli._ import org.apache.hive.service.server.HiveServer2 import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLContext) extends CLIService(hiveServer) with ReflectedCompositeService { override def init(hiveConf: HiveConf) { setSuperField(this, "hiveConf", hiveConf) val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, sqlContext) setSuperField(this, "sessionManager", sparkSqlSessionManager) addService(sparkSqlSessionManager) var sparkServiceUGI: UserGroupInformation = null if (UserGroupInformation.isSecurityEnabled) { try { HiveAuthFactory.loginFromKeytab(hiveConf) sparkServiceUGI = Utils.getUGI() setSuperField(this, "serviceUGI", sparkServiceUGI) } catch { case e @ (_: IOException | _: LoginException) => throw new ServiceException("Unable to login to kerberos with given principal/keytab", e) } } initCompositeService(hiveConf) } override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = { getInfoType match { case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_VER => new GetInfoValue(sqlContext.sparkContext.version) case _ => super.getInfo(sessionHandle, getInfoType) } } } private[thriftserver] trait ReflectedCompositeService { this: AbstractService => def initCompositeService(hiveConf: HiveConf) { // Emulating `CompositeService.init(hiveConf)` val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList") serviceList.asScala.foreach(_.init(hiveConf)) // Emulating `AbstractService.init(hiveConf)` invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED) setAncestorField(this, 3, "hiveConf", hiveConf) invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED) getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.") } }
Example 5
Source File: SparkSQLDriver.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.QueryExecution private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = execution.hiveResultString() tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 6
Source File: SparkSQLSessionManager.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver import java.util.concurrent.Executors import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.SessionHandle import org.apache.hive.service.cli.session.SessionManager import org.apache.hive.service.cli.thrift.TProtocolVersion import org.apache.hive.service.server.HiveServer2 import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.{HiveSessionState, HiveUtils} import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: SQLContext) extends SessionManager(hiveServer) with ReflectedCompositeService { private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() override def init(hiveConf: HiveConf) { setSuperField(this, "hiveConf", hiveConf) // Create operation log root directory, if operation logging is enabled if (hiveConf.getBoolVar(ConfVars.HIVE_SERVER2_LOGGING_OPERATION_ENABLED)) { invoke(classOf[SessionManager], this, "initOperationLogRootDir") } val backgroundPoolSize = hiveConf.getIntVar(ConfVars.HIVE_SERVER2_ASYNC_EXEC_THREADS) setSuperField(this, "backgroundOperationPool", Executors.newFixedThreadPool(backgroundPoolSize)) getAncestorField[Log](this, 3, "LOG").info( s"HiveServer2: Async execution pool size $backgroundPoolSize") setSuperField(this, "operationManager", sparkSqlOperationManager) addService(sparkSqlOperationManager) initCompositeService(hiveConf) } override def openSession( protocol: TProtocolVersion, username: String, passwd: String, ipAddress: String, sessionConf: java.util.Map[String, String], withImpersonation: Boolean, delegationToken: String): SessionHandle = { val sessionHandle = super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken) val session = super.getSession(sessionHandle) HiveThriftServer2.listener.onSessionCreated( session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) val sessionState = sqlContext.sessionState.asInstanceOf[HiveSessionState] val ctx = if (sessionState.hiveThriftServerSingleSession) { sqlContext } else { sqlContext.newSession() } ctx.setConf("spark.sql.hive.version", HiveUtils.hiveExecutionVersion) if (sessionConf != null && sessionConf.containsKey("use:database")) { ctx.sql(s"use ${sessionConf.get("use:database")}") } sparkSqlOperationManager.sessionToContexts.put(sessionHandle, ctx) sessionHandle } override def closeSession(sessionHandle: SessionHandle) { HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString) super.closeSession(sessionHandle) sparkSqlOperationManager.sessionToActivePool.remove(sessionHandle) sparkSqlOperationManager.sessionToContexts.remove(sessionHandle) } }
Example 7
Source File: SparkSQLOperationManager.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver.server import java.util.{Map => JMap} import java.util.concurrent.ConcurrentHashMap import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveSessionState import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation} private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = new ConcurrentHashMap[SessionHandle, String]() val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { val sqlContext = sessionToContexts.get(parentSession.getSessionHandle) require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val sessionState = sqlContext.sessionState.asInstanceOf[HiveSessionState] val runInBackground = async && sessionState.hiveThriftServerAsync val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") operation } }
Example 8
Source File: SparkSQLEnv.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.thriftserver

import java.io.PrintStream

import scala.collection.JavaConverters._

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{SparkSession, SQLContext}
import org.apache.spark.sql.hive.{HiveSessionState, HiveUtils}
import org.apache.spark.util.Utils

  def stop() {
    logDebug("Shutting down Spark SQL Environment")
    // Stop the SparkContext
    if (SparkSQLEnv.sparkContext != null) {
      sparkContext.stop()
      sparkContext = null
      sqlContext = null
    }
  }
}
Example 9
Source File: HadoopFsRelation.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} import org.apache.spark.sql.types.StructType case class HadoopFsRelation( location: FileCatalog, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String])(val sparkSession: SparkSession) extends BaseRelation with FileRelation { override def sqlContext: SQLContext = sparkSession.sqlContext val schema: StructType = { val dataSchemaColumnNames = dataSchema.map(_.name.toLowerCase).toSet StructType(dataSchema ++ partitionSchema.filterNot { column => dataSchemaColumnNames.contains(column.name.toLowerCase) }) } def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() case _ => "HadoopFiles" } } override def sizeInBytes: Long = location.sizeInBytes override def inputFiles: Array[String] = location.inputFiles }
Example 10
Source File: console.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming import org.apache.spark.internal.Logging import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class ConsoleSink(options: Map[String, String]) extends Sink with Logging { // Number of rows to display, by default 20 rows private val numRowsToShow = options.get("numRows").map(_.toInt).getOrElse(20) // Truncate the displayed data if it is too long, by default it is true private val isTruncated = options.get("truncate").map(_.toBoolean).getOrElse(true) // Track the batch id private var lastBatchId = -1L override def addBatch(batchId: Long, data: DataFrame): Unit = synchronized { val batchIdStr = if (batchId <= lastBatchId) { s"Rerun batch: $batchId" } else { lastBatchId = batchId s"Batch: $batchId" } // scalastyle:off println println("-------------------------------------------") println(batchIdStr) println("-------------------------------------------") // scalastyle:off println data.sparkSession.createDataFrame( data.sparkSession.sparkContext.parallelize(data.collect()), data.schema) .show(numRowsToShow, isTruncated) } } class ConsoleSinkProvider extends StreamSinkProvider with DataSourceRegister { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new ConsoleSink(parameters) } def shortName(): String = "console" }
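The numRows and truncate options read by ConsoleSink map directly onto writer options. Below is a minimal streaming sketch; it assumes a Spark build where the "rate" test source is available (any other streaming source would do), and the run duration is arbitrary.

import org.apache.spark.sql.SparkSession

object ConsoleSinkSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("ConsoleSinkSketch")
      .getOrCreate()

    // A trivial source that emits (timestamp, value) rows
    val stream = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

    val query = stream.writeStream
      .format("console")            // resolved through ConsoleSinkProvider's shortName
      .option("numRows", "50")      // overrides the default of 20 rows per batch
      .option("truncate", "false")  // print full cell contents
      .start()

    query.awaitTermination(30000)   // run for ~30 seconds, then shut down
    spark.stop()
  }
}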
Example 11
Source File: package.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType package object state { implicit class StateStoreOps[T: ClassTag](dataRDD: RDD[T]) { private[streaming] def mapPartitionsWithStateStore[U: ClassTag]( checkpointLocation: String, operatorId: Long, storeVersion: Long, keySchema: StructType, valueSchema: StructType, sessionState: SessionState, storeCoordinator: Option[StateStoreCoordinatorRef])( storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U]): StateStoreRDD[T, U] = { val cleanedF = dataRDD.sparkContext.clean(storeUpdateFunction) new StateStoreRDD( dataRDD, cleanedF, checkpointLocation, operatorId, storeVersion, keySchema, valueSchema, sessionState, storeCoordinator) } } }
Example 12
Source File: DDLSourceLoadSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.sources import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{StringType, StructField, StructType} // please note that the META-INF/services had to be modified for the test directory for this to work class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { test("data sources with the same name") { intercept[RuntimeException] { spark.read.format("Fluet da Bomb").load() } } test("load data source from format alias") { spark.read.format("gathering quorum").load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("specify full classname with duplicate formats") { spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false))) } test("should fail to load ORC without Hive Support") { val e = intercept[AnalysisException] { spark.read.format("orc").load() } assert(e.message.contains("The ORC data source must be used with Hive support enabled")) } } class FakeSourceOne extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceTwo extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceThree extends RelationProvider with DataSourceRegister { def shortName(): String = "gathering quorum" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } }
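The reason spark.read.format("Fluet da Bomb") resolves at all is the DataSourceRegister service registration mentioned in the comment above. As a sketch, registering your own source by short name takes two pieces: the provider class and a META-INF services file. The class name, package, and file contents below are illustrative, not part of the suite above.

// src/main/scala/com/example/datasource/ConstantSource.scala
package com.example.datasource

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, RelationProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class ConstantSource extends RelationProvider with DataSourceRegister {
  // This is the name users pass to spark.read.format(...)
  override def shortName(): String = "constant"

  override def createRelation(ctx: SQLContext, parameters: Map[String, String]): BaseRelation =
    new BaseRelation {
      override def sqlContext: SQLContext = ctx
      override def schema: StructType =
        StructType(Seq(StructField("stringType", StringType, nullable = false)))
    }
}

// src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister
// (one fully qualified provider class name per line)
//   com.example.datasource.ConstantSource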
Example 13
Source File: SharedSQLContext.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.test

import org.scalatest.BeforeAndAfterEach

import org.apache.spark.{DebugFilesystem, SparkConf}
import org.apache.spark.sql.{SparkSession, SQLContext}

  protected override def afterAll(): Unit = {
    try {
      if (_spark != null) {
        _spark.stop()
        _spark = null
      }
    } finally {
      super.afterAll()
    }
  }

  protected override def beforeEach(): Unit = {
    super.beforeEach()
    DebugFilesystem.clearOpenStreams()
  }

  protected override def afterEach(): Unit = {
    super.afterEach()
    DebugFilesystem.assertNoOpenStreams()
  }
}
Example 14
Source File: DLClassifierLeNet.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl._ import com.intel.analytics.bigdl.dataset.image.{BytesToGreyImg, GreyImgNormalizer, GreyImgToBatch} import com.intel.analytics.bigdl.dataset.{DataSet, DistributedDataSet, MiniBatch, _} import com.intel.analytics.bigdl.dlframes.DLClassifier import com.intel.analytics.bigdl.models.lenet.LeNet5 import com.intel.analytics.bigdl.models.lenet.Utils._ import com.intel.analytics.bigdl.nn.ClassNLLCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext object DLClassifierLeNet { LoggerFilter.redirectSparkInfoLogs() def main(args: Array[String]): Unit = { val inputs = Array[String]("Feature data", "Label data") trainParser.parse(args, new TrainParams()).foreach(param => { val conf = Engine.createSparkConf() .setAppName("MLPipeline Example") .set("spark.task.maxFailures", "1") val sc = new SparkContext(conf) val sqLContext = SQLContext.getOrCreate(sc) Engine.init val trainData = param.folder + "/train-images-idx3-ubyte" val trainLabel = param.folder + "/train-labels-idx1-ubyte" val validationData = param.folder + "/t10k-images-idx3-ubyte" val validationLabel = param.folder + "/t10k-labels-idx1-ubyte" val trainSet = DataSet.array(load(trainData, trainLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(trainMean, trainStd) -> GreyImgToBatch(1) val trainingRDD : RDD[Data[Float]] = trainSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map(batch => { val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) }) val trainingDF = sqLContext.createDataFrame(trainingRDD).toDF(inputs: _*) val model = LeNet5(classNum = 10) val criterion = ClassNLLCriterion[Float]() val featureSize = Array(28, 28) val estimator = new DLClassifier[Float](model, criterion, featureSize) .setFeaturesCol(inputs(0)) .setLabelCol(inputs(1)) .setBatchSize(param.batchSize) .setMaxEpoch(param.maxEpoch) val transformer = estimator.fit(trainingDF) val validationSet = DataSet.array(load(validationData, validationLabel), sc) -> BytesToGreyImg(28, 28) -> GreyImgNormalizer(testMean, testStd) -> GreyImgToBatch(1) val validationRDD: RDD[Data[Float]] = validationSet. asInstanceOf[DistributedDataSet[MiniBatch[Float]]].data(false).map{batch => val feature = batch.getInput().asInstanceOf[Tensor[Float]] val label = batch.getTarget().asInstanceOf[Tensor[Float]] Data[Float](feature.storage().array(), label.storage().array()) } val validationDF = sqLContext.createDataFrame(validationRDD).toDF(inputs: _*) val transformed = transformer.transform(validationDF) transformed.show() sc.stop() }) } } private case class Data[T](featureData : Array[T], labelData : Array[T])
Example 15
Source File: DLEstimatorMultiLabelLR.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl.dlframes.DLEstimator import com.intel.analytics.bigdl.nn._ import com.intel.analytics.bigdl.optim.LBFGS import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericDouble import com.intel.analytics.bigdl.utils.Engine import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object DLEstimatorMultiLabelLR { def main(args: Array[String]): Unit = { val conf = Engine.createSparkConf() .setAppName("DLEstimatorMultiLabelLR") .setMaster("local[1]") val sc = new SparkContext(conf) val sqlContext = SQLContext.getOrCreate(sc) Engine.init val model = Sequential().add(Linear(2, 2)) val criterion = MSECriterion() val estimator = new DLEstimator(model, criterion, Array(2), Array(2)) .setOptimMethod(new LBFGS[Double]()) .setLearningRate(1.0) .setBatchSize(4) .setMaxEpoch(10) val data = sc.parallelize(Seq( (Array(2.0, 1.0), Array(1.0, 2.0)), (Array(1.0, 2.0), Array(2.0, 1.0)), (Array(2.0, 1.0), Array(1.0, 2.0)), (Array(1.0, 2.0), Array(2.0, 1.0)))) val df = sqlContext.createDataFrame(data).toDF("features", "label") val dlModel = estimator.fit(df) dlModel.transform(df).show(false) } }
Example 16
Source File: DLClassifierLogisticRegression.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.example.MLPipeline import com.intel.analytics.bigdl.dlframes.DLClassifier import com.intel.analytics.bigdl.nn.{ClassNLLCriterion, Linear, LogSoftMax, Sequential} import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.utils.Engine import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object DLClassifierLogisticRegression { def main(args: Array[String]): Unit = { val conf = Engine.createSparkConf() .setAppName("DLClassifierLogisticRegression") .setMaster("local[1]") val sc = new SparkContext(conf) val sqlContext = SQLContext.getOrCreate(sc) Engine.init val model = Sequential().add(Linear(2, 2)).add(LogSoftMax()) val criterion = ClassNLLCriterion() val estimator = new DLClassifier(model, criterion, Array(2)) .setBatchSize(4) .setMaxEpoch(10) val data = sc.parallelize(Seq( (Array(0.0, 1.0), 1.0), (Array(1.0, 0.0), 2.0), (Array(0.0, 1.0), 1.0), (Array(1.0, 0.0), 2.0))) val df = sqlContext.createDataFrame(data).toDF("features", "label") val dlModel = estimator.fit(df) dlModel.transform(df).show(false) } }
Example 17
Source File: ImagePredictor.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.example.imageclassification import java.nio.file.Paths import com.intel.analytics.bigdl.dataset.image._ import com.intel.analytics.bigdl.dlframes.DLClassifierModel import com.intel.analytics.bigdl.example.imageclassification.MlUtils._ import com.intel.analytics.bigdl.numeric.NumericFloat import com.intel.analytics.bigdl.utils.{Engine, LoggerFilter} import org.apache.log4j.{Level, Logger} import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext object ImagePredictor { LoggerFilter.redirectSparkInfoLogs() Logger.getLogger("com.intel.analytics.bigdl.example").setLevel(Level.INFO) def main(args: Array[String]): Unit = { predictParser.parse(args, new PredictParams()).map(param => { val conf = Engine.createSparkConf() conf.setAppName("Predict with trained model") val sc = new SparkContext(conf) Engine.init val sqlContext = new SQLContext(sc) val partitionNum = Engine.nodeNumber() * Engine.coreNumber() val model = loadModel(param) val valTrans = new DLClassifierModel(model, Array(3, imageSize, imageSize)) .setBatchSize(param.batchSize) .setFeaturesCol("features") .setPredictionCol("predict") val valRDD = if (param.isHdfs) { // load image set from hdfs imagesLoadSeq(param.folder, sc, param.classNum).coalesce(partitionNum, true) } else { // load image set from local val paths = LocalImageFiles.readPaths(Paths.get(param.folder), hasLabel = false) sc.parallelize(imagesLoad(paths, 256), partitionNum) } val transf = RowToByteRecords() -> BytesToBGRImg() -> BGRImgCropper(imageSize, imageSize) -> BGRImgNormalizer(testMean, testStd) -> BGRImgToImageVector() val valDF = transformDF(sqlContext.createDataFrame(valRDD), transf) valTrans.transform(valDF) .select("imageName", "predict") .collect() .take(param.showNum) .foreach(println) sc.stop() }) } }
Example 18
Source File: ImageInference.scala From BigDL with Apache License 2.0
package com.intel.analytics.bigdl.example.dlframes.imageInference import com.intel.analytics.bigdl.dlframes.{DLClassifierModel, DLModel} import org.apache.spark.sql.DataFrame import scopt.OptionParser import com.intel.analytics.bigdl.dataset.Sample import com.intel.analytics.bigdl.nn.Module import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric.NumericFloat import com.intel.analytics.bigdl.transform.vision.image.augmentation._ import com.intel.analytics.bigdl.transform.vision.image._ import com.intel.analytics.bigdl.utils.Engine import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext object ImageInference { def main(args: Array[String]): Unit = { val defaultParams = Utils.LocalParams() Utils.parser.parse(args, defaultParams).map { params => val conf = Engine.createSparkConf().setAppName("ModelInference") val sc = SparkContext.getOrCreate(conf) val sqlContext = new SQLContext(sc) Engine.init val imagesDF = Utils.loadImages(params.folder, params.batchSize, sqlContext).cache() imagesDF.show(10) imagesDF.printSchema() val model = Module.loadCaffeModel[Float](params.caffeDefPath, params.modelPath) val dlmodel: DLModel[Float] = new DLClassifierModel[Float]( model, Array(3, 224, 224)) .setBatchSize(params.batchSize) .setFeaturesCol("features") .setPredictionCol("prediction") val count = imagesDF.count().toInt val tranDF = dlmodel.transform(imagesDF.limit(count)) tranDF.select("imageName", "prediction").show(100, false) } } } object Utils { case class LocalParams(caffeDefPath: String = " ", modelPath: String = " ", folder: String = " ", batchSize: Int = 16, nEpochs: Int = 10 ) val defaultParams = LocalParams() val parser = new OptionParser[LocalParams]("BigDL Example") { opt[String]("caffeDefPath") .text(s"caffeDefPath") .action((x, c) => c.copy(caffeDefPath = x)) opt[String]("modelPath") .text(s"modelPath") .action((x, c) => c.copy(modelPath = x)) opt[String]("folder") .text(s"folder") .action((x, c) => c.copy(folder = x)) opt[Int]('b', "batchSize") .text(s"batchSize") .action((x, c) => c.copy(batchSize = x.toInt)) opt[Int]('e', "nEpochs") .text("epoch numbers") .action((x, c) => c.copy(nEpochs = x)) } def loadImages(path: String, partitionNum: Int, sqlContext: SQLContext): DataFrame = { val imageFrame: ImageFrame = ImageFrame.read(path, sqlContext.sparkContext) val transformer = Resize(256, 256) -> CenterCrop(224, 224) -> ChannelNormalize(123, 117, 104, 1, 1, 1) -> MatToTensor() -> ImageFrameToSample() val transformed: ImageFrame = transformer(imageFrame) val imageRDD = transformed.toDistributed().rdd.map { im => (im.uri, im[Sample[Float]](ImageFeature.sample).getData()) } val imageDF = sqlContext.createDataFrame(imageRDD) .withColumnRenamed("_1", "imageName") .withColumnRenamed("_2", "features") imageDF } }
Example 19
Source File: MLlibTestSparkContext.scala From spark-lp with Apache License 2.0
package org.apache.spark.mllib.util

import org.scalatest.{BeforeAndAfterAll, Suite}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite =>
  @transient var sc: SparkContext = _
  @transient var sqlContext: SQLContext = _

  override def beforeAll() {
    super.beforeAll()
    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("MLlibUnitTest")
    sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    sqlContext = new SQLContext(sc)
  }

  override def afterAll() {
    sqlContext = null
    if (sc != null) {
      sc.stop()
    }
    sc = null
    super.afterAll()
  }
}
Example 20
Source File: TestSFObjectWriter.scala From spark-salesforce with Apache License 2.0
package com.springml.spark.salesforce import org.mockito.Mockito._ import org.mockito.Matchers._ import org.scalatest.mock.MockitoSugar import org.scalatest.{ FunSuite, BeforeAndAfterEach} import com.springml.salesforce.wave.api.BulkAPI import org.apache.spark.{ SparkConf, SparkContext} import com.springml.salesforce.wave.model.{ JobInfo, BatchInfo} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ Row, DataFrame, SQLContext} import org.apache.spark.sql.types.{ StructType, StringType, StructField} class TestSFObjectWriter extends FunSuite with MockitoSugar with BeforeAndAfterEach { val contact = "Contact"; val jobId = "750B0000000WlhtIAC"; val batchId = "751B0000000scSHIAY"; val data = "Id,Description\n003B00000067Rnx,123456\n003B00000067Rnw,7890"; val bulkAPI = mock[BulkAPI](withSettings().serializable()) val writer = mock[SFObjectWriter] var sparkConf: SparkConf = _ var sc: SparkContext = _ override def beforeEach() { val jobInfo = new JobInfo jobInfo.setId(jobId) when(bulkAPI.createJob(contact)).thenReturn(jobInfo) val batchInfo = new BatchInfo batchInfo.setId(batchId) batchInfo.setJobId(jobId) when(bulkAPI.addBatch(jobId, data)).thenReturn(batchInfo) when(bulkAPI.closeJob(jobId)).thenReturn(jobInfo) when(bulkAPI.isCompleted(jobId)).thenReturn(true) sparkConf = new SparkConf().setMaster("local").setAppName("Test SF Object Update") sc = new SparkContext(sparkConf) } private def sampleDF() : DataFrame = { val rowArray = new Array[Row](2) val fieldArray = new Array[String](2) fieldArray(0) = "003B00000067Rnx" fieldArray(1) = "Desc1" rowArray(0) = Row.fromSeq(fieldArray) val fieldArray1 = new Array[String](2) fieldArray1(0) = "001B00000067Rnx" fieldArray1(1) = "Desc2" rowArray(1) = Row.fromSeq(fieldArray1) val rdd = sc.parallelize(rowArray) val schema = StructType( StructField("id", StringType, true) :: StructField("desc", StringType, true) :: Nil) val sqlContext = new SQLContext(sc) sqlContext.createDataFrame(rdd, schema) } test ("Write Object to Salesforce") { val df = sampleDF(); val csvHeader = Utils.csvHeadder(df.schema) writer.writeData(df.rdd) sc.stop() } }
Example 21
Source File: SharedSparkSessionBase.scala From spark-alchemy with Apache License 2.0
package org.apache.spark.sql.test import org.apache.spark.sql.internal.StaticSQLConf import org.apache.spark.sql.{SQLContext, SparkSession} import org.apache.spark.{DebugFilesystem, SparkConf} import org.scalatest.Suite import org.scalatest.concurrent.Eventually import scala.concurrent.duration._ protected override def afterAll(): Unit = { try { super.afterAll() } finally { try { if (_spark != null) { try { _spark.sessionState.catalog.reset() } finally { try { waitForTasksToFinish() } finally { _spark.stop() _spark = null } } } } finally { SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } } protected override def beforeEach(): Unit = { super.beforeEach() DebugFilesystem.clearOpenStreams() } protected override def afterEach(): Unit = { super.afterEach() // Clear all persistent datasets after each test spark.sharedState.cacheManager.clearCache() // files can be closed from other threads, so wait a bit // normally this doesn't take more than 1s eventually(timeout(30.seconds), interval(2.seconds)) { DebugFilesystem.assertNoOpenStreams() } } }
Example 22
Source File: SparkSessionSpec.scala From spark-alchemy with Apache License 2.0
package com.swoop.test_utils import org.apache.logging.log4j.Level import org.apache.spark.sql.SQLContext import org.apache.spark.sql.test.{SharedSparkSessionBase, TestSparkSession} import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.TestSuite import scala.util.Try trait SparkSessionSpec extends SharedSparkSessionBase { this: TestSuite => override protected def createSparkSession: TestSparkSession = { val spark = super.createSparkSession configureLoggers(sparkLogLevel) spark } def sparkSession = spark def sqlc: SQLContext = sparkSession.sqlContext def sc: SparkContext = sparkSession.sparkContext protected def sparkLogLevel = Try(sys.env("SPARK_LOG_LEVEL")).getOrElse("WARN").toUpperCase match { case "DEBUG" => Level.DEBUG case "INFO" => Level.INFO case "WARN" => Level.WARN case _ => Level.ERROR } protected def configureLoggers(): Unit = configureLoggers(sparkLogLevel) protected def configureLoggers(logLevel: Level): Unit = { // Set logging through log4j v1 APIs also as v2 APIs are too tricky to manage org.apache.log4j.Logger.getRootLogger.setLevel(logLevel match { case Level.DEBUG => org.apache.log4j.Level.DEBUG case Level.INFO => org.apache.log4j.Level.INFO case Level.WARN => org.apache.log4j.Level.WARN case Level.ERROR => org.apache.log4j.Level.ERROR }) } override protected def sparkConf: SparkConf = super.sparkConf .set("spark.driver.bindAddress", "127.0.0.1") }
Example 23
Source File: S2SinkProvider.scala From incubator-s2graph with Apache License 2.0
package org.apache.s2graph.spark.sql.streaming

import com.typesafe.config.{Config, ConfigFactory, ConfigRenderOptions}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider}
import org.apache.spark.sql.streaming.OutputMode

import scala.collection.JavaConversions._

class S2SinkProvider extends StreamSinkProvider with DataSourceRegister with Logger {
  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink = {

    logger.info(s"S2SinkProvider options : ${parameters}")
    val jobConf: Config = ConfigFactory.parseMap(parameters).withFallback(ConfigFactory.load())
    logger.info(s"S2SinkProvider Configuration : ${jobConf.root().render(ConfigRenderOptions.concise())}")

    new S2SparkSqlStreamingSink(sqlContext.sparkSession, jobConf)
  }

  override def shortName(): String = "s2graph"
}
Example 24
Source File: JsonUtil.scala From piflow with BSD 2-Clause "Simplified" License
package cn.piflow.bundle.util

import org.apache.spark.sql.functions.explode
import org.apache.spark.sql.{Column, DataFrame, SQLContext, SparkSession}

import scala.collection.mutable.ArrayBuffer

object JsonUtil extends Serializable {

  // The tags to parse. To expand an array field, write the tag as MasterField_ChildField
  // (for example, links_name expands the 'links' array and keeps its 'name' field).
  def ParserJsonDF(df: DataFrame, tag: String): DataFrame = {

    var openArrField: String = ""
    var ArrSchame: String = ""

    var tagARR: Array[String] = tag.split(",")
    var tagNew: String = ""

    for (tt <- tagARR) {
      if (tt.indexOf("_") > -1) {
        // the tag contains "_", so it names an array field to expand
        val openField: Array[String] = tt.split("_")
        openArrField = openField(0)

        ArrSchame += (openField(1) + ",")
      } else {
        tagNew += (tt + ",")
      }
    }
    tagNew += openArrField
    ArrSchame = ArrSchame.substring(0, ArrSchame.length - 1)

    tagARR = tagNew.split(",")
    var FinalDF: DataFrame = df

    // if the user specified which fields to return
    var strings: Seq[Column] = tagNew.split(",").toSeq.map(p => new Column(p))

    if (tag.length > 0) {
      val df00 = FinalDF.select(strings: _*)
      FinalDF = df00
    }

    // if the user chose an array field to expand and supplied its schema
    if (openArrField.length > 0 && ArrSchame.length > 0) {

      val schames: Array[String] = ArrSchame.split(",")

      var selARR: ArrayBuffer[String] = ArrayBuffer() // names of the fields after expansion
      // iterate over the tags and wrap each one into a Column
      var coARR: ArrayBuffer[Column] = ArrayBuffer() // used by select() when expanding the field

      val sss = tagNew.split(",") // used by toDF() after the field is expanded
      var co: Column = null

      for (each <- tagARR) {
        if (each == openArrField) {
          co = explode(FinalDF(openArrField))
          for (x <- schames) {
            selARR += (openArrField + "." + x)
          }
        } else {
          selARR += each
          co = FinalDF(each)
        }
        coARR += co
      }
      println("###################")
      selARR.foreach(println(_))

      var selSEQ: Seq[Column] = selARR.toSeq.map(q => new Column(q))

      var df01: DataFrame = FinalDF.select(coARR: _*).toDF(sss: _*)
      FinalDF = df01.select(selSEQ: _*)

    }
    FinalDF
  }

}
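A hedged usage sketch for ParserJsonDF: given a JSON document with a nested 'links' array, the tag string "name,links_url" keeps the top-level 'name' column and expands 'links', exposing its 'url' field, following the tag convention described in the comment above. The file path and JSON layout are made up for illustration.

import org.apache.spark.sql.SparkSession

import cn.piflow.bundle.util.JsonUtil

object JsonUtilSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("JsonUtilSketch").getOrCreate()

    // Hypothetical input, e.g. {"name":"n1","links":[{"url":"u1"},{"url":"u2"}]}
    val df = spark.read.json("/tmp/sample.json")

    // Keep 'name', expand the 'links' array and pull out its 'url' field
    val flattened = JsonUtil.ParserJsonDF(df, "name,links_url")
    flattened.show()

    spark.stop()
  }
}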
Example 25
Source File: ScalaRiakParquetExample.scala From spark-riak-connector with Apache License 2.0
package com.basho.riak.spark.examples.parquet import org.apache.spark.sql.{SaveMode, SQLContext} import org.apache.spark.{SparkContext, SparkConf} object ScalaRiakParquetExample { case class TSData(site: String, species: String, measurementDate: Long, latitude: Double, longitude: Double, value: Double ) val startDate = System.currentTimeMillis() val endDate = startDate + 100 val tableName = "parquet_demo" val parquetFileName = "riak-ts-data.parquet" val testData = Seq( TSData("MY7", "PM10", startDate, 51.52254, -0.15459, 41.4), TSData("MY7", "PM10", startDate + 10, 51.52254, -0.15459, 41.2), TSData("MY7", "PM10", startDate + 20, 51.52254, -0.15459, 39.1), TSData("MY7", "PM10", startDate + 30, 51.52254, -0.15459, 39.5), TSData("MY7", "PM10", startDate + 40, 51.52254, -0.15459, 29.9), TSData("MY7", "PM10", startDate + 50, 51.52254, -0.15459, 34.2), TSData("MY7", "PM10", startDate + 60, 51.52254, -0.15459, 28.5), TSData("MY7", "PM10", startDate + 70, 51.52254, -0.15459, 39.6), TSData("MY7", "PM10", startDate + 80, 51.52254, -0.15459, 29.2), TSData("MY7", "PM10", startDate + 90, 51.52254, -0.15459, 31.3) ) def main(args: Array[String]): Unit = { val sparkConf = new SparkConf() .setAppName("Simple Scala Riak TS Demo") setSparkOpt(sparkConf, "spark.master", "local") setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087") println(s"Test data start time: $startDate") val sc = new SparkContext(sparkConf) val sqlCtx = SQLContext.getOrCreate(sc) import sqlCtx.implicits._ val rdd = sc.parallelize(testData) rdd.toDF().write.format("org.apache.spark.sql.riak") .mode(SaveMode.Append).save(tableName) val df = sqlCtx.read.format("org.apache.spark.sql.riak") .load(tableName).registerTempTable(tableName) val from = (startDate / 1000).toInt val query = s"select * from $tableName where measurementDate >= CAST($from AS TIMESTAMP) " + s"AND measurementDate <= CAST(${from + 1} AS TIMESTAMP) AND site = 'MY7' AND species = 'PM10'" println(s"Query: $query") val rows = sqlCtx.sql(query) rows.show() val schema = rows.schema rows.write.mode("overwrite").parquet(parquetFileName) println(s"Data was successfully saved to Parquet file: $parquetFileName") val parquetFile = sqlCtx.read.parquet(parquetFileName) parquetFile.registerTempTable("parquetFile") val data = sqlCtx.sql("SELECT MAX(value) max_value FROM parquetFile ") println("Maximum value retrieved from Parquet file:") data.show() } private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = { val optval = sparkConf.getOption(option).getOrElse(defaultOptVal) sparkConf.set(option, optval) } }
Example 26
Source File: SparkDataframesTest.scala From spark-riak-connector with Apache License 2.0
package com.basho.riak.spark.rdd import scala.reflect.runtime.universe import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.junit.Assert._ import org.junit.{ Before, Test } import com.basho.riak.spark.toSparkContextFunctions import org.junit.experimental.categories.Category case class TestData(id: String, name: String, age: Int, category: String) @Category(Array(classOf[RiakTSTests])) class SparkDataframesTest extends AbstractRiakSparkTest { private val indexName = "creationNo" protected override val jsonData = Some( """[ | {key: 'key1', value: {id: 'u1', name: 'Ben', age: 20, category: 'CategoryA'}}, | {key: 'key2', value: {id: 'u2', name: 'Clair', age: 30, category: 'CategoryB'}}, | {key: 'key3', value: {id: 'u3', name: 'John', age: 70}}, | {key: 'key4', value: {id: 'u4', name: 'Chris', age: 10, category: 'CategoryC'}}, | {key: 'key5', value: {id: 'u5', name: 'Mary', age: 40, category: 'CategoryB'}}, | {key: 'key6', value: {id: 'u6', name: 'George', age: 50, category: 'CategoryC'}} |]""".stripMargin) protected override def initSparkConf() = super.initSparkConf().setAppName("Dataframes Test") var sqlContextHolder: SQLContext = _ var df: DataFrame = _ @Before def initializeDF(): Unit = { val sqlContext = new org.apache.spark.sql.SQLContext(sc) import sqlContext.implicits._ sqlContextHolder = sqlContext df = sc.riakBucket[TestData](DEFAULT_NAMESPACE.getBucketNameAsString) .queryAll().toDF df.registerTempTable("test") } @Test def schemaTest(): Unit = { df.printSchema() val schema = df.schema.map(_.name).toList val fields = universe.typeOf[TestData].members.withFilter(!_.isMethod).map(_.name.toString.trim).toList assertEquals(schema.sorted, fields.sorted) } @Test def sqlQueryTest(): Unit = { val sqlResult = sqlContextHolder.sql("select * from test where category >= 'CategoryC'").toJSON.collect val expected = """ [ | {id:'u4',name:'Chris',age:10,category:'CategoryC'}, | {id:'u6',name:'George',age:50,category:'CategoryC'} | ]""".stripMargin assertEqualsUsingJSONIgnoreOrder(expected, stringify(sqlResult)) } @Test def udfTest(): Unit = { sqlContextHolder.udf.register("stringLength", (s: String) => s.length) val udf = sqlContextHolder.sql("select name, stringLength(name) strLgth from test order by strLgth, name").toJSON.collect val expected = """ [ | {name:'Ben',strLgth:3}, | {name:'John',strLgth:4}, | {name:'Mary',strLgth:4}, | {name:'Chris',strLgth:5}, | {name:'Clair',strLgth:5}, | {name:'George',strLgth:6} | ]""".stripMargin assertEqualsUsingJSON(expected, stringify(udf)) } @Test def grouppingTest(): Unit = { val groupped = df.groupBy("category").count.toJSON.collect val expected = """ [ | {category:'CategoryA',count:1}, | {category:'CategoryB',count:2}, | {category:'CategoryC',count:2}, | {count:1} | ]""".stripMargin assertEqualsUsingJSONIgnoreOrder(expected, stringify(groupped)) } @Test def sqlVsFilterTest(): Unit = { val sql = sqlContextHolder.sql("select id, name from test where age >= 50").toJSON.collect val filtered = df.where(df("age") >= 50).select("id", "name").toJSON.collect assertEqualsUsingJSONIgnoreOrder(stringify(sql), stringify(filtered)) } }
Example 27
Source File: DefaultSource.scala From spark-solr with Apache License 2.0
package solr import com.lucidworks.spark.{SolrRelation, SolrStreamWriter} import com.lucidworks.spark.util.Constants import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import org.apache.spark.sql.sources._ import org.apache.spark.sql.streaming.OutputMode class DefaultSource extends RelationProvider with CreatableRelationProvider with StreamSinkProvider with DataSourceRegister { override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { try { new SolrRelation(parameters, sqlContext.sparkSession) } catch { case re: RuntimeException => throw re case e: Exception => throw new RuntimeException(e) } } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { try { // TODO: What to do with the saveMode? val solrRelation: SolrRelation = new SolrRelation(parameters, Some(df), sqlContext.sparkSession) solrRelation.insert(df, overwrite = true) solrRelation } catch { case re: RuntimeException => throw re case e: Exception => throw new RuntimeException(e) } } override def shortName(): String = Constants.SOLR_FORMAT override def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new SolrStreamWriter(sqlContext.sparkSession, parameters, partitionColumns, outputMode) } }
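A usage sketch for this provider, with heavy caveats: the format name "solr" assumes Constants.SOLR_FORMAT resolves to that string, and the zkhost/collection option keys and the field names in the sample data are assumptions, not taken from the example above; consult the spark-solr documentation for the authoritative option list.

import org.apache.spark.sql.{SaveMode, SparkSession}

object SolrSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SolrSourceSketch").getOrCreate()
    import spark.implicits._

    // Assumed option keys -- placeholders for the connector's real configuration
    val solrOpts = Map(
      "zkhost" -> "zk1:2181,zk2:2181/solr",  // placeholder ZooKeeper connect string
      "collection" -> "my_collection"        // placeholder collection name
    )

    // Batch read through DefaultSource.createRelation(sqlContext, parameters)
    val docs = spark.read.format("solr").options(solrOpts).load()
    docs.printSchema()

    // Batch write through the CreatableRelationProvider path shown above
    val updates = Seq(("doc-1", "hello"), ("doc-2", "world")).toDF("id", "text_s")
    updates.write.format("solr").options(solrOpts).mode(SaveMode.Overwrite).save()

    spark.stop()
  }
}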
Example 28
Source File: MLlibTestSparkContext.scala From bisecting-kmeans with Apache License 2.0
package org.apache.spark.mllib.util import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, Suite} trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("MLlibUnitTest") sc = new SparkContext(conf) sqlContext = new SQLContext(sc) } override def afterAll() { sqlContext = null if (sc != null) { sc.stop() } sc = null super.afterAll() } }
Example 29
Source File: MLlibTestSparkContext.scala From spark-tfocs with Apache License 2.0
package org.apache.spark.mllib.util import org.scalatest.{ BeforeAndAfterAll, Suite } import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.sql.SQLContext trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("MLlibUnitTest") sc = new SparkContext(conf) sc.setLogLevel("WARN") sqlContext = new SQLContext(sc) } override def afterAll() { sqlContext = null if (sc != null) { sc.stop() } sc = null super.afterAll() } }
Example 30
Source File: SparkSqlExtension.scala From Linkis with Apache License 2.0
package com.webank.wedatasphere.linkis.engine.extension import java.util.concurrent._ import com.webank.wedatasphere.linkis.common.conf.CommonVars import com.webank.wedatasphere.linkis.common.utils.{Logging, Utils} import org.apache.spark.sql.execution.QueryExecution import org.apache.spark.sql.{DataFrame, SQLContext} import scala.collection.mutable.ArrayBuffer import scala.concurrent.duration._ abstract class SparkSqlExtension extends Logging{ private val maxPoolSize = CommonVars("wds.linkis.dws.ujes.spark.extension.max.pool",5).getValue private val executor = new ThreadPoolExecutor(2, maxPoolSize, 2, TimeUnit.SECONDS, new LinkedBlockingQueue[Runnable](), new ThreadFactory { override def newThread(r: Runnable): Thread = { val thread = new Thread(r) thread.setDaemon(true) thread } }) final def afterExecutingSQL(sqlContext: SQLContext,command: String,dataFrame: DataFrame,timeout:Long,sqlStartTime:Long):Unit = { try { val thread = new Runnable { override def run(): Unit = extensionRule(sqlContext,command,dataFrame.queryExecution,sqlStartTime) } val future = executor.submit(thread) Utils.waitUntil(future.isDone,timeout milliseconds) } catch { case e: Throwable => info("Failed to execute SparkSqlExtension: ", e) } } protected def extensionRule(sqlContext: SQLContext,command: String,queryExecution: QueryExecution,sqlStartTime:Long):Unit } object SparkSqlExtension extends Logging { private val extensions = ArrayBuffer[SparkSqlExtension]() def register(sqlExtension: SparkSqlExtension):Unit = { info("Get a sqlExtension register") extensions.append(sqlExtension) } def getSparkSqlExtensions():Array[SparkSqlExtension] = { extensions.toArray } }
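To plug into this hook, an engine-side module subclasses SparkSqlExtension and registers an instance; afterExecutingSQL then runs the rule asynchronously under the configured timeout. A minimal sketch, with an invented class name and an illustrative logging body:

import com.webank.wedatasphere.linkis.engine.extension.SparkSqlExtension
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.QueryExecution

// A do-almost-nothing extension that records how long each SQL statement took
class TimingSqlExtension extends SparkSqlExtension {
  override protected def extensionRule(sqlContext: SQLContext,
                                       command: String,
                                       queryExecution: QueryExecution,
                                       sqlStartTime: Long): Unit = {
    val elapsedMs = System.currentTimeMillis() - sqlStartTime
    info(s"SQL took ${elapsedMs}ms: ${command.take(100)}")
  }
}

// Registration is a one-liner; registered extensions are later retrieved via getSparkSqlExtensions()
object TimingSqlExtensionInstaller {
  def install(): Unit = SparkSqlExtension.register(new TimingSqlExtension)
}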
Example 31
Source File: SparkSqlExecutor.scala From Linkis with Apache License 2.0
package com.webank.wedatasphere.linkis.engine.executors import java.lang.reflect.InvocationTargetException import java.util.concurrent.atomic.AtomicLong import com.webank.wedatasphere.linkis.common.conf.CommonVars import com.webank.wedatasphere.linkis.common.utils.{Logging, Utils} import com.webank.wedatasphere.linkis.engine.configuration.SparkConfiguration import com.webank.wedatasphere.linkis.engine.execute.EngineExecutorContext import com.webank.wedatasphere.linkis.engine.extension.SparkSqlExtension import com.webank.wedatasphere.linkis.engine.spark.common.{Kind, SparkSQL} import com.webank.wedatasphere.linkis.engine.spark.utils.EngineUtils import com.webank.wedatasphere.linkis.scheduler.executer.{ErrorExecuteResponse, ExecuteResponse, SuccessExecuteResponse} import org.apache.commons.lang.exception.ExceptionUtils import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SQLContext} SQLSession.showDF(sc, jobGroup, rdd, null, SparkConfiguration.SHOW_DF_MAX_RES.getValue,engineExecutorContext) SuccessExecuteResponse() } catch { case e: InvocationTargetException => var cause = ExceptionUtils.getCause(e) if(cause == null) cause = e error("execute sparkSQL failed!", cause) ErrorExecuteResponse(ExceptionUtils.getRootCauseMessage(e), cause) case ite: Exception => error("execute sparkSQL failed!", ite) ErrorExecuteResponse(ExceptionUtils.getRootCauseMessage(ite), ite) } finally sc.clearJobGroup() } override def kind: Kind = SparkSQL() override def open: Unit = {} override def close: Unit = {} }
Example 32
Source File: DefaultSource.scala From Linkis with Apache License 2.0
package com.webank.wedatasphere.spark.excel import org.apache.hadoop.fs.Path import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType ): ExcelRelation = { ExcelRelation( location = checkParameter(parameters, "path"), sheetName = parameters.get("sheetName"), useHeader = checkParameter(parameters, "useHeader").toBoolean, treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(true)(_.toBoolean), userSchema = Option(schema), inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean), addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean), startColumn = parameters.get("startColumn").fold(0)(_.toInt), endColumn = parameters.get("endColumn").fold(Int.MaxValue)(_.toInt), timestampFormat = parameters.get("timestampFormat"), maxRowsInMemory = parameters.get("maxRowsInMemory").map(_.toInt), excerptSize = parameters.get("excerptSize").fold(10)(_.toInt), parameters = parameters, dateFormat = parameters.get("dateFormats").getOrElse("yyyy-MM-dd").split(";").toList )(sqlContext) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame ): BaseRelation = { val path = checkParameter(parameters, "path") val sheetName = parameters.getOrElse("sheetName", "Sheet1") val useHeader = checkParameter(parameters, "useHeader").toBoolean val dateFormat = parameters.getOrElse("dateFormat", ExcelFileSaver.DEFAULT_DATE_FORMAT) val timestampFormat = parameters.getOrElse("timestampFormat", ExcelFileSaver.DEFAULT_TIMESTAMP_FORMAT) val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) fs.setWriteChecksum(false) val doSave = if (fs.exists(filesystemPath)) { mode match { case SaveMode.Append => sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}") case SaveMode.Overwrite => fs.delete(filesystemPath, true) true case SaveMode.ErrorIfExists => sys.error(s"path $path already exists.") case SaveMode.Ignore => false } } else { true } if (doSave) { // Only save data when the save mode is not ignore. (new ExcelFileSaver(fs)).save( filesystemPath, data, sheetName = sheetName, useHeader = useHeader, dateFormat = dateFormat, timestampFormat = timestampFormat ) } createRelation(sqlContext, parameters, data.schema) } // Forces a Parameter to exist, otherwise an exception is thrown. private def checkParameter(map: Map[String, String], param: String): String = { if (!map.contains(param)) { throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") } else { map.apply(param) } } // Gets the Parameter if it exists, otherwise returns the default argument private def parameterOrDefault(map: Map[String, String], param: String, default: String) = map.getOrElse(param, default) }
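The parameter names checked above (path, useHeader, sheetName, inferSchema, ...) are what a caller passes as reader/writer options. Since the example does not show a DataSourceRegister short name, the sketch below addresses the source by its package name, relying on Spark's standard fallback of appending ".DefaultSource" during provider lookup; the file paths are placeholders.

import org.apache.spark.sql.{SaveMode, SparkSession}

object ExcelSourceSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("ExcelSourceSketch").getOrCreate()

    // Read: these option keys mirror the parameters parsed in createRelation above
    val sheet = spark.read
      .format("com.webank.wedatasphere.spark.excel")
      .option("path", "/tmp/input.xlsx")   // placeholder path
      .option("useHeader", "true")         // required parameter
      .option("sheetName", "Sheet1")
      .option("inferSchema", "true")
      .load()

    // Write: Overwrite deletes an existing file first; Append is rejected by the provider
    sheet.write
      .format("com.webank.wedatasphere.spark.excel")
      .option("path", "/tmp/output.xlsx")
      .option("useHeader", "true")
      .mode(SaveMode.Overwrite)
      .save()

    spark.stop()
  }
}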
Example 33
Source File: SelectJSONSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectJSONSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for JSON data.")) } override def shortName(): String = "minioSelectJSON" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectJSONRelation = { val path = checkPath(params) SelectJSONRelation(Some(path), params, schema)(sqlContext) } }
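A usage sketch for the source above; the same pattern applies to the CSV and Parquet variants in the next two examples. Because only SchemaRelationProvider is implemented, a schema must be supplied explicitly; the bucket path and field names are placeholders.

import org.apache.spark.sql.types._

val peopleSchema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)))

val people = sqlContext.read
  .format("minioSelectJSON")          // shortName registered above
  .schema(peopleSchema)
  .load("s3://my-bucket/people.json") // supplies the required "path" parameter
people.show()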
Example 34
Source File: SelectCSVSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectCSVSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for CSV data.")) } override def shortName(): String = "minioSelectCSV" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectCSVRelation = { val path = checkPath(params) SelectCSVRelation(Some(path), params, schema)(sqlContext) } }
Example 35
Source File: SelectParquetSource.scala From spark-select with Apache License 2.0 | 5 votes |
package io.minio.spark.select // Java standard libraries import java.io.File // Spark internal libraries import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.sources.DataSourceRegister class SelectParquetSource extends SchemaRelationProvider with DataSourceRegister { private def checkPath(parameters: Map[String, String]): String = { parameters.getOrElse("path", sys.error("'path' must be specified for Parquet data.")) } override def shortName(): String = "minioSelectParquet" override def createRelation(sqlContext: SQLContext, params: Map[String, String], schema: StructType): SelectParquetRelation = { val path = checkPath(params) SelectParquetRelation(Some(path), params, schema)(sqlContext) } }
Example 36
Source File: SparkCassBulkWriterSpec.scala From Spark2Cassandra with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.cassandra import com.datastax.driver.core.querybuilder.QueryBuilder import com.datastax.spark.connector.AllColumns import com.datastax.spark.connector.writer.{ RowWriterFactory, SqlRowWriter } import com.github.jparkie.spark.cassandra.client.SparkCassSSTableLoaderClientManager import com.github.jparkie.spark.cassandra.conf.{ SparkCassServerConf, SparkCassWriteConf } import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.sql.{ Row, SQLContext } import org.scalatest.{ MustMatchers, WordSpec } import scala.collection.JavaConverters._ class SparkCassBulkWriterSpec extends WordSpec with MustMatchers with CassandraServerSpecLike with SharedSparkContext { val testKeyspace = "test_keyspace" val testTable = "test_table" override def beforeAll(): Unit = { super.beforeAll() getCassandraConnector.withSessionDo { currentSession => createKeyspace(currentSession, testKeyspace) currentSession.execute( s"""CREATE TABLE $testKeyspace.$testTable ( | test_key BIGINT PRIMARY KEY, | test_value VARCHAR |); """.stripMargin ) } } "SparkCassBulkWriter" must { "write() successfully" in { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ implicit val testRowWriterFactory: RowWriterFactory[Row] = SqlRowWriter.Factory val testCassandraConnector = getCassandraConnector val testSparkCassWriteConf = SparkCassWriteConf() val testSparkCassServerConf = SparkCassServerConf( // See https://github.com/jsevellec/cassandra-unit/blob/master/cassandra-unit/src/main/resources/cu-cassandra.yaml storagePort = 7010 ) val testSparkCassBulkWriter = SparkCassBulkWriter( testCassandraConnector, testKeyspace, testTable, AllColumns, testSparkCassWriteConf, testSparkCassServerConf ) val testRDD = sc.parallelize(1 to 25) .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!")) val testDataFrame = testRDD.toDF("test_key", "test_value") sc.runJob(testDataFrame.rdd, testSparkCassBulkWriter.write _) getCassandraConnector.withSessionDo { currentSession => val queryStatement = QueryBuilder.select("test_key", "test_value") .from(testKeyspace, testTable) .limit(25) val resultSet = currentSession.execute(queryStatement) val outputSet = resultSet.all.asScala .map(currentRow => (currentRow.getLong("test_key"), currentRow.getString("test_value"))) .toMap for (currentNumber <- 1 to 25) { val currentKey = currentNumber.toLong outputSet(currentKey) mustEqual s"Hello World: $currentNumber!" } } SparkCassSSTableLoaderClientManager.evictAll() } } }
Example 37
Source File: SparkCassDataFrameFunctionsSpec.scala From Spark2Cassandra with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.cassandra.sql import com.holdenkarau.spark.testing.SharedSparkContext import org.apache.spark.sql.SQLContext import org.scalatest.{ MustMatchers, WordSpec } class SparkCassDataFrameFunctionsSpec extends WordSpec with MustMatchers with SharedSparkContext { "Package com.github.jparkie.spark.cassandra.sql" must { "lift DataFrame into SparkCassDataFrameFunctions" in { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val testRDD = sc.parallelize(1 to 25) .map(currentNumber => (currentNumber.toLong, s"Hello World: $currentNumber!")) val testDataFrame = testRDD.toDF("test_key", "test_value") // If internalSparkContext is available, RDD was lifted. testDataFrame.internalSparkContext } } }
Example 38
Source File: PointCloudRelation.scala From geotrellis-pointcloud with Apache License 2.0 | 5 votes |
package geotrellis.pointcloud.spark.datasource import geotrellis.pointcloud.spark.store.hadoop._ import geotrellis.pointcloud.spark.store.hadoop.HadoopPointCloudRDD.{Options => HadoopOptions} import geotrellis.pointcloud.util.Filesystem import geotrellis.proj4.CRS import geotrellis.store.hadoop.util.HdfsUtils import geotrellis.vector.Extent import cats.implicits._ import io.pdal._ import io.circe.syntax._ import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} import java.io.File import scala.collection.JavaConverters._ // This class has to be serializable since it is shipped over the network. class PointCloudRelation( val sqlContext: SQLContext, path: String, options: HadoopOptions ) extends BaseRelation with TableScan with Serializable { @transient implicit lazy val sc: SparkContext = sqlContext.sparkContext // TODO: switch between HadoopPointCloudRDD and S3PointcCloudRDD lazy val isS3: Boolean = path.startsWith("s3") override def schema: StructType = { lazy val (local, fixedPath) = if(path.startsWith("s3") || path.startsWith("hdfs")) { val tmpDir = Filesystem.createDirectory() val remotePath = new Path(path) // copy remote file into local tmp dir val localPath = new File(tmpDir, remotePath.getName) HdfsUtils.copyPath(remotePath, new Path(s"file:///${localPath.getAbsolutePath}"), sc.hadoopConfiguration) (true, localPath.toString) } else (false, path) val localPipeline = options.pipeline .hcursor .downField("pipeline").downArray .downField("filename").withFocus(_ => fixedPath.asJson) .top.fold(options.pipeline)(identity) val pl = Pipeline(localPipeline.noSpaces) if (pl.validate()) pl.execute() val pointCloud = try { pl.getPointViews().next().getPointCloud(0) } finally { pl.close() if(local) println(new File(fixedPath).delete) } val rdd = HadoopPointCloudRDD(new Path(path), options) val md: (Option[Extent], Option[CRS]) = rdd .map { case (header, _) => (header.projectedExtent3D.map(_.extent3d.toExtent), header.crs) } .reduce { case ((e1, c), (e2, _)) => ((e1, e2).mapN(_ combine _), c) } val metadata = new MetadataBuilder().putString("metadata", md.asJson.noSpaces).build pointCloud.deriveSchema(metadata) } override def buildScan(): RDD[Row] = { val rdd = HadoopPointCloudRDD(new Path(path), options) rdd.flatMap { _._2.flatMap { pc => pc.readAll.toList.map { k => Row(k: _*) } } } } }
Example 39
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
Example 40
Source File: TimeType.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.time.types

import com.twosigma.flint.FlintConf
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.{ SQLContext, SparkSession, types }

trait TimeType {
  // Declared here so the case objects below have members to override.
  def internalToNanos(value: Long): Long
  def nanosToInternal(nanos: Long): Long
  def roundDownPrecision(nanosSinceEpoch: Long): Long
}

object TimeType {
  case object LongType extends TimeType {
    override def internalToNanos(value: Long): Long = value
    override def nanosToInternal(nanos: Long): Long = nanos
    override def roundDownPrecision(nanos: Long): Long = nanos
  }

  // Spark SQL represents timestamps as microseconds internally
  case object TimestampType extends TimeType {
    override def internalToNanos(value: Long): Long = value * 1000
    override def nanosToInternal(nanos: Long): Long = nanos / 1000
    override def roundDownPrecision(nanos: Long): Long = nanos - nanos % 1000
  }

  def apply(timeType: String): TimeType = {
    timeType match {
      case "long" => LongType
      case "timestamp" => TimestampType
      case _ => throw new IllegalArgumentException(s"Unsupported time type: ${timeType}. " +
        s"Only `long` and `timestamp` are supported.")
    }
  }

  def apply(sqlType: types.DataType): TimeType = {
    sqlType match {
      case types.LongType => LongType
      case types.TimestampType => TimestampType
      case _ => throw new IllegalArgumentException(s"Unsupported time type: ${sqlType}")
    }
  }

  def get(sparkSession: SparkSession): TimeType = {
    TimeType(sparkSession.conf.get(
      FlintConf.TIME_TYPE_CONF, FlintConf.TIME_TYPE_DEFAULT
    ))
  }
}
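A small usage sketch of the trait above: selecting a TimeType by name and rounding a nanosecond timestamp down to the precision the internal representation can hold.

val tt = TimeType("timestamp")
val nanos = 1556617845123456789L
tt.roundDownPrecision(nanos)                // 1556617845123456000, i.e. microsecond precision
tt.nanosToInternal(nanos)                   // 1556617845123456 microseconds
TimeType("long").roundDownPrecision(nanos)  // unchanged: 1556617845123456789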
Example 41
Source File: TimeSeriesRDDConversionSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import java.util.concurrent.TimeUnit import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ SQLContext, DataFrame, Row } import org.apache.spark.sql.types._ import org.apache.spark.sql.catalyst.expressions.{ GenericRowWithSchema => ExternalRow } import org.scalatest.tagobjects.Slow class TimeSeriesRDDConversionSpec extends TimeSeriesSuite { // The largest prime < 100 override val defaultPartitionParallelism = 97 // The 10000-th prime. private val defaultNumRows = 104729 private def createDataFrame(isSorted: Boolean = true)(implicit sqlContext: SQLContext): DataFrame = { val n = defaultNumRows val schema = Schema("value" -> DoubleType) val rdd: RDD[Row] = sqlContext.sparkContext.parallelize(1 to n, defaultPartitionParallelism).map { i => val data: Array[Any] = if (isSorted) { Array((i / 100).toLong, i.toDouble) } else { Array(((i + 1 - n) / 100).toLong, i.toDouble) } new ExternalRow(data, schema) } sqlContext.createDataFrame(rdd, schema) } "TimeSeriesRDD" should "convert from a sorted DataFrame correctly" taggedAs (Slow) in { implicit val _sqlContext = sqlContext (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = true, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = true))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF(createDataFrame(isSorted = false))(isSorted = false, TimeUnit.NANOSECONDS) assert(tsRdd.count() == defaultNumRows) } (1 to 10).foreach { i => val tsRdd = TimeSeriesRDD.fromDF( createDataFrame(isSorted = false).sort("time") )( isSorted = true, TimeUnit.NANOSECONDS ) assert(tsRdd.count() == defaultNumRows) } } }
Example 42
Source File: Utils.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.packtpub.mmlwspark.utils import org.apache.spark.h2o.H2OContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.expressions.UserDefinedFunction import water.fvec.H2OFrame object Utils { def colTransform(hf: H2OFrame, udf: UserDefinedFunction, colName: String)(implicit h2oContext: H2OContext, sqlContext: SQLContext): H2OFrame = { import sqlContext.implicits._ val name = hf.key.toString val colHf = hf(Array(colName)) val df = h2oContext.asDataFrame(colHf) val result = h2oContext.asH2OFrame(df.withColumn(colName, udf($"${colName}")), s"${name}_${colName}") colHf.delete() result } def let[A](in: A)(body: A => Unit) = { body(in) in } }
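A hypothetical call of colTransform above: it replaces a string column of an existing H2OFrame with its upper-cased values and returns a new frame. The frame name cityFrame and the in-scope implicit H2OContext and SQLContext are assumptions, not part of the snippet.

import org.apache.spark.sql.functions.udf

val toUpper = udf((s: String) => if (s == null) null else s.toUpperCase)
val upperCased = Utils.colTransform(cityFrame, toUpper, "city")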
Example 43
Source File: RedisStreamProvider.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis.stream import com.redislabs.provider.redis.util.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Source import org.apache.spark.sql.sources.{DataSourceRegister, StreamSourceProvider} import org.apache.spark.sql.types.{StringType, StructField, StructType} class RedisStreamProvider extends DataSourceRegister with StreamSourceProvider with Logging { override def shortName(): String = "redis" override def sourceSchema(sqlContext: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { providerName -> schema.getOrElse { StructType(Seq(StructField("_id", StringType))) } } override def createSource(sqlContext: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { val (_, ss) = sourceSchema(sqlContext, schema, providerName, parameters) val source = new RedisSource(sqlContext, metadataPath, Some(ss), parameters) source.start() source } }
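A streaming-read sketch against the provider above, assuming spark is an active SparkSession. The "stream.keys" option name follows spark-redis conventions and is not visible in this snippet; the stream name and checkpoint path are placeholders.

val events = spark.readStream
  .format("redis")                     // shortName registered above
  .option("stream.keys", "sensors")    // assumed option naming the Redis stream
  .load()                              // schema defaults to a single "_id" string column

val query = events.writeStream
  .format("console")
  .option("checkpointLocation", "/tmp/redis-stream-checkpoint")
  .start()
query.awaitTermination()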
Example 44
Source File: DefaultSource.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis import org.apache.spark.sql.SaveMode.{Append, ErrorIfExists, Ignore, Overwrite} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val relation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None) mode match { case Append => relation.insert(data, overwrite = false) case Overwrite => relation.insert(data, overwrite = true) case ErrorIfExists => if (relation.nonEmpty) { throw new IllegalStateException("SaveMode is set to ErrorIfExists and dataframe " + "already exists in Redis and contains data.") } relation.insert(data, overwrite = false) case Ignore => if (relation.isEmpty) { relation.insert(data, overwrite = false) } } relation } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = Some(schema)) }
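A write sketch exercising the save-mode handling above; spark is assumed to be an active SparkSession, and the "table" option (the Redis key prefix) follows spark-redis conventions rather than anything shown in this snippet.

import org.apache.spark.sql.SaveMode
import spark.implicits._

val people = Seq((1L, "Bob"), (2L, "Alice")).toDF("id", "name")

people.write
  .format("org.apache.spark.sql.redis")
  .option("table", "person")
  .mode(SaveMode.Append)      // ErrorIfExists would throw if keys already hold data
  .save()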
Example 45
Source File: JdbcExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

// scalastyle:off println
import java.util.Properties

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}

object JdbcExample {

  def main(args: Array[String]): Unit = {
    if (args.length != 3) {
      throw new IllegalArgumentException("Invalid arguments: expected <url> <user> <password>")
    }
    val url = args(0)
    val user = args(1)
    val pass = args(2)

    val conf = new SparkConf().setAppName("JdbcExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    run(sc, sqlContext, url, user, pass)

    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext,
      url: String, user: String, pass: String): Unit = {

    val prop = new Properties()
    prop.setProperty("user", user)
    prop.setProperty("password", pass)
    val df: DataFrame = sqlContext.read.jdbc(url, "gihyo_spark.person", prop)

    df.printSchema()
    println("# Rows: " + df.count())
  }
}
// scalastyle:on println
Example 46
Source File: DataFrameNaFunctionExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05 // scalastyle:off println import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object DataFrameNaFunctionExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("BasicDataFrameExample") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) run(sc, sqlContext) sc.stop() } def run( sc: SparkContext, sqlContext: SQLContext): Unit = { import sqlContext.implicits._ val nullDF = Seq[(String, java.lang.Integer, java.lang.Double)]( ("Bob", 16, 176.5), ("Alice", null, 164.3), ("", 60, null), ("UNKNOWN", 25, Double.NaN), ("Amy", null, null), (null, null, Double.NaN) ).toDF("name", "age", "height") // drop nullDF.na.drop("any").show() nullDF.na.drop("all").show() nullDF.na.drop(Array("age")).show() nullDF.na.drop(Seq("age", "height")).show() nullDF.na.drop("any", Array("name", "age")).show() nullDF.na.drop("all", Array("age", "height")).show() // fill nullDF.na.fill(0.0, Array("name", "height")).show() nullDF.na.fill(Map( "name" -> "UNKNOWN", "height" -> 0.0 )).show() // replace nullDF.na.replace("name", Map("" -> "UNKNOWN")).show() } } // scalastyle:on println
Example 47
Source File: DatasetExample.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark.ch05

import org.apache.spark.{SparkContext, SparkConf}
import org.apache.spark.sql.{Dataset, SQLContext}
import org.apache.spark.sql.functions._

private case class Person(id: Int, name: String, age: Int)

object DatasetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DatasetExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    run(sc, sqlContext)
    sc.stop()
  }

  def run(sc: SparkContext, sqlContext: SQLContext): Unit = {
    import sqlContext.implicits._

    // Creates a Dataset from a `Seq`
    val seq = Seq((1, "Bob", 23), (2, "Tom", 23), (3, "John", 22))
    val ds1: Dataset[(Int, String, Int)] = sqlContext.createDataset(seq)
    val ds2: Dataset[(Int, String, Int)] = seq.toDS()

    // Creates a Dataset from a `RDD`
    val rdd = sc.parallelize(seq)
    val ds3: Dataset[(Int, String, Int)] = sqlContext.createDataset(rdd)
    val ds4: Dataset[(Int, String, Int)] = rdd.toDS()

    // Creates a Dataset from a `DataFrame`
    val df = rdd.toDF("id", "name", "age")
    val ds5: Dataset[Person] = df.as[Person]

    // Selects a column
    ds5.select(expr("name").as[String]).show()

    // Filtering
    ds5.filter(_.name == "Bob").show()
    ds5.filter(person => person.age == 23).show()

    // Groups and counts the number of rows
    ds5.groupBy(_.age).count().show()
  }
}
Example 48
Source File: TestSparkContext.scala From gihyo-spark-book-example with Apache License 2.0 | 5 votes |
package jp.gihyo.spark import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.SQLContext private[spark] trait TestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("SparkUnitTest") .set("spark.sql.shuffle.partitions", "2") sc = new SparkContext(conf) SQLContext.clearActive() sqlContext = new SQLContext(sc) SQLContext.setActive(sqlContext) } override def afterAll() { try { sqlContext = null SQLContext.clearActive() if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } }
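A minimal sketch of a suite mixing in the trait above. FunSuite is an assumption (any ScalaTest style extending Suite works), the suite must live under jp.gihyo.spark because the trait is private[spark], and sc/sqlContext are provided by beforeAll.

package jp.gihyo.spark

import org.scalatest.FunSuite

class PersonTableSpec extends FunSuite with TestSparkContext {
  test("a DataFrame can be built from the shared sqlContext") {
    import sqlContext.implicits._
    val df = Seq((1, "Bob"), (2, "Alice")).toDF("id", "name")
    assert(df.count() === 2)
  }
}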
Example 49
Source File: MovieRecommendation.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.MovieRecommendation import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.SQLImplicits import org.apache.spark.sql._ import org.apache.spark.sql.Dataset import org.apache.spark.mllib.recommendation.ALS import org.apache.spark.mllib.recommendation.MatrixFactorizationModel import org.apache.spark.mllib.recommendation.Rating import scala.Tuple2 import org.apache.spark.rdd.RDD object MovieRecommendation { //Compute the RMSE to evaluate the model. Less the RMSE better the model and it's prediction capability. def computeRmse(model: MatrixFactorizationModel, data: RDD[Rating], implicitPrefs: Boolean): Double = { val predictions: RDD[Rating] = model.predict(data.map(x => (x.user, x.product))) val predictionsAndRatings = predictions.map { x => ((x.user, x.product), x.rating) }.join(data.map(x => ((x.user, x.product), x.rating))).values if (implicitPrefs) { println("(Prediction, Rating)") println(predictionsAndRatings.take(5).mkString("\n")) } math.sqrt(predictionsAndRatings.map(x => (x._1 - x._2) * (x._1 - x._2)).mean()) } def main(args: Array[String]): Unit = { val spark: SparkSession = SparkSession .builder() .appName("JavaLDAExample") .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/"). getOrCreate() val ratigsFile = "data/ratings.csv" val df1 = spark.read.format("com.databricks.spark.csv").option("header", true).load(ratigsFile) val ratingsDF = df1.select(df1.col("userId"), df1.col("movieId"), df1.col("rating"), df1.col("timestamp")) ratingsDF.show(false) val moviesFile = "data/movies.csv" val df2 = spark.read.format("com.databricks.spark.csv").option("header", "true").load(moviesFile) val moviesDF = df2.select(df2.col("movieId"), df2.col("title"), df2.col("genres")) moviesDF.show(false) ratingsDF.createOrReplaceTempView("ratings") moviesDF.createOrReplaceTempView("movies") var rmseTest = computeRmse(model, testRDD, true) println("Test RMSE: = " + rmseTest) //Less is better //Movie recommendation for a specific user. Get the top 6 movie predictions for user 668 println("Recommendations: (MovieId => Rating)") println("----------------------------------") val recommendationsUser = model.recommendProducts(668, 6) recommendationsUser.map(rating => (rating.product, rating.rating)).foreach(println) println("----------------------------------") spark.stop() } }
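The snippet above references model and testRDD, which are built in an elided part of the file. A minimal sketch of how they are typically constructed with MLlib ALS from ratingsDF, using the imports already present above; the split ratio, rank, iteration count and lambda are assumptions.

val ratingsRDD: RDD[Rating] = ratingsDF.rdd.map(row =>
  Rating(row.getString(0).toInt, row.getString(1).toInt, row.getString(2).toDouble))
val Array(trainRDD, testRDD) = ratingsRDD.randomSplit(Array(0.8, 0.2), seed = 12345L)
val model: MatrixFactorizationModel = ALS.train(trainRDD, 10, 15, 0.01)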
Example 50
Source File: XmlReader.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Dataset, SQLContext, SparkSession} import org.apache.spark.sql.types.StructType import com.databricks.spark.xml.util.XmlFile import com.databricks.spark.xml.util.FailFastMode @deprecated("Use xmlFile(SparkSession, ...)", "0.5.0") def xmlFile(sqlContext: SQLContext, path: String): DataFrame = { // We need the `charset` and `rowTag` before creating the relation. val (charset, rowTag) = { val options = XmlOptions(parameters.toMap) (options.charset, options.rowTag) } val relation = XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } @deprecated("Use xmlRdd(SparkSession, ...)", "0.5.0") def xmlRdd(sqlContext: SQLContext, xmlRDD: RDD[String]): DataFrame = { val relation = XmlRelation( () => xmlRDD, None, parameters.toMap, schema)(sqlContext) sqlContext.baseRelationToDataFrame(relation) } }
Example 51
Source File: DefaultSource.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml import org.apache.hadoop.fs.Path import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import com.databricks.spark.xml.util.XmlFile override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): XmlRelation = { val path = checkPath(parameters) // We need the `charset` and `rowTag` before creating the relation. val (charset, rowTag) = { val options = XmlOptions(parameters) (options.charset, options.rowTag) } XmlRelation( () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag), Some(path), parameters, schema)(sqlContext) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val path = checkPath(parameters) val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) val doSave = if (fs.exists(filesystemPath)) { mode match { case SaveMode.Append => throw new IllegalArgumentException( s"Append mode is not supported by ${this.getClass.getCanonicalName}") case SaveMode.Overwrite => fs.delete(filesystemPath, true) true case SaveMode.ErrorIfExists => throw new IllegalArgumentException(s"path $path already exists.") case SaveMode.Ignore => false } } else { true } if (doSave) { // Only save data when the save mode is not ignore. XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters) } createRelation(sqlContext, parameters, data.schema) } }
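A read/write sketch using the source above; the rowTag option is referenced via XmlOptions in the snippet, while the file paths and tag name are placeholders.

val books = sqlContext.read
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .load("books.xml")

books.write
  .format("com.databricks.spark.xml")
  .option("rowTag", "book")
  .mode("overwrite")          // Append is rejected by createRelation above
  .save("books-out.xml")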
Example 52
Source File: HyperLogLog.scala From spark-hyperloglog with Apache License 2.0 | 5 votes |
package com.mozilla.spark.sql.hyperloglog.test import com.mozilla.spark.sql.hyperloglog.aggregates._ import com.mozilla.spark.sql.hyperloglog.functions._ import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{FlatSpec, Matchers} class HyperLogLogTest extends FlatSpec with Matchers{ "Algebird's HyperLogLog" can "be used from Spark" in { val sparkConf = new SparkConf().setAppName("HyperLogLog") sparkConf.setMaster(sparkConf.get("spark.master", "local[1]")) val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val hllMerge = new HyperLogLogMerge sqlContext.udf.register("hll_merge", hllMerge) sqlContext.udf.register("hll_create", hllCreate _) sqlContext.udf.register("hll_cardinality", hllCardinality _) val frame = sc.parallelize(List("a", "b", "c", "c"), 4).toDF("id") val count = frame .select(expr("hll_create(id, 12) as hll")) .groupBy() .agg(expr("hll_cardinality(hll_merge(hll)) as count")) .collect() count(0)(0) should be (3) } }
Example 53
Source File: HiSpeedRead.scala From spark-db2 with Apache License 2.0 | 5 votes |
import com.ibm.spark.ibmdataserver.Constants import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkContext, SparkConf} object HiSpeedRead { def main(args: Array[String]) { val DB2_CONNECTION_URL = "jdbc:db2://localhost:50700/sample:traceFile=C:\\1.txt;" val conf = new SparkConf().setMaster("local[2]").setAppName("read test") val sparkContext = new SparkContext(conf) val sqlContext = new SQLContext(sparkContext) Class.forName("com.ibm.db2.jcc.DB2Driver") val jdbcRdr = sqlContext.read.format("com.ibm.spark.ibmdataserver") .option("url", DB2_CONNECTION_URL) // .option(Constants.TABLE, tableName) .option("user", "pallavipr") .option("password", "9manjari") .option("dbtable", "employee") .load() jdbcRdr.show() } }
Example 54
Source File: DefaultSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.hbase import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister, SchemaRelationProvider} import org.apache.spark.sql.types.StructType class CustomedDefaultSource extends DefaultSource with DataSourceRegister with SchemaRelationProvider { override def shortName(): String = "hbase" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { new CustomedHBaseRelation(parameters, Option(schema))(sqlContext) } }
Example 55
Source File: SparkSQLCLIService.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.IOException import java.util.{List => JList} import javax.security.auth.login.LoginException import scala.collection.JavaConverters._ import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hadoop.hive.shims.Utils import org.apache.hadoop.security.{SecurityUtil, UserGroupInformation} import org.apache.hive.service.{AbstractService, Service, ServiceException} import org.apache.hive.service.Service.STATE import org.apache.hive.service.auth.HiveAuthFactory import org.apache.hive.service.cli._ import org.apache.hive.service.server.HiveServer2 import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ private[hive] class SparkSQLCLIService(hiveServer: HiveServer2, sqlContext: SQLContext) extends CLIService(hiveServer) with ReflectedCompositeService { override def init(hiveConf: HiveConf) { setSuperField(this, "hiveConf", hiveConf) val sparkSqlSessionManager = new SparkSQLSessionManager(hiveServer, sqlContext) setSuperField(this, "sessionManager", sparkSqlSessionManager) addService(sparkSqlSessionManager) var sparkServiceUGI: UserGroupInformation = null var httpUGI: UserGroupInformation = null if (UserGroupInformation.isSecurityEnabled) { try { val principal = hiveConf.getVar(ConfVars.HIVE_SERVER2_KERBEROS_PRINCIPAL) val keyTabFile = hiveConf.getVar(ConfVars.HIVE_SERVER2_KERBEROS_KEYTAB) if (principal.isEmpty || keyTabFile.isEmpty) { throw new IOException( "HiveServer2 Kerberos principal or keytab is not correctly configured") } val originalUgi = UserGroupInformation.getCurrentUser sparkServiceUGI = if (HiveAuthFactory.needUgiLogin(originalUgi, SecurityUtil.getServerPrincipal(principal, "0.0.0.0"), keyTabFile)) { HiveAuthFactory.loginFromKeytab(hiveConf) Utils.getUGI() } else { originalUgi } setSuperField(this, "serviceUGI", sparkServiceUGI) } catch { case e @ (_: IOException | _: LoginException) => throw new ServiceException("Unable to login to kerberos with given principal/keytab", e) } // Try creating spnego UGI if it is configured. 
val principal = hiveConf.getVar(ConfVars.HIVE_SERVER2_SPNEGO_PRINCIPAL).trim val keyTabFile = hiveConf.getVar(ConfVars.HIVE_SERVER2_SPNEGO_KEYTAB).trim if (principal.nonEmpty && keyTabFile.nonEmpty) { try { httpUGI = HiveAuthFactory.loginFromSpnegoKeytabAndReturnUGI(hiveConf) setSuperField(this, "httpUGI", httpUGI) } catch { case e: IOException => throw new ServiceException("Unable to login to spnego with given principal " + s"$principal and keytab $keyTabFile: $e", e) } } } initCompositeService(hiveConf) } override def getInfo(sessionHandle: SessionHandle, getInfoType: GetInfoType): GetInfoValue = { getInfoType match { case GetInfoType.CLI_SERVER_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_NAME => new GetInfoValue("Spark SQL") case GetInfoType.CLI_DBMS_VER => new GetInfoValue(sqlContext.sparkContext.version) case _ => super.getInfo(sessionHandle, getInfoType) } } } private[thriftserver] trait ReflectedCompositeService { this: AbstractService => def initCompositeService(hiveConf: HiveConf) { // Emulating `CompositeService.init(hiveConf)` val serviceList = getAncestorField[JList[Service]](this, 2, "serviceList") serviceList.asScala.foreach(_.init(hiveConf)) // Emulating `AbstractService.init(hiveConf)` invoke(classOf[AbstractService], this, "ensureCurrentState", classOf[STATE] -> STATE.NOTINITED) setAncestorField(this, 3, "hiveConf", hiveConf) invoke(classOf[AbstractService], this, "changeState", classOf[STATE] -> STATE.INITED) getAncestorField[Log](this, 3, "LOG").info(s"Service: $getName is inited.") } }
Example 56
Source File: SparkSQLDriver.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.{ArrayList => JArrayList, Arrays, List => JList} import scala.collection.JavaConverters._ import org.apache.commons.lang3.exception.ExceptionUtils import org.apache.hadoop.hive.metastore.api.{FieldSchema, Schema} import org.apache.hadoop.hive.ql.Driver import org.apache.hadoop.hive.ql.processors.CommandProcessorResponse import org.apache.spark.internal.Logging import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.execution.{QueryExecution, SQLExecution} private[hive] class SparkSQLDriver(val context: SQLContext = SparkSQLEnv.sqlContext) extends Driver with Logging { private[hive] var tableSchema: Schema = _ private[hive] var hiveResponse: Seq[String] = _ override def init(): Unit = { } private def getResultSetSchema(query: QueryExecution): Schema = { val analyzed = query.analyzed logDebug(s"Result Schema: ${analyzed.output}") if (analyzed.output.isEmpty) { new Schema(Arrays.asList(new FieldSchema("Response code", "string", "")), null) } else { val fieldSchemas = analyzed.output.map { attr => new FieldSchema(attr.name, attr.dataType.catalogString, "") } new Schema(fieldSchemas.asJava, null) } } override def run(command: String): CommandProcessorResponse = { // TODO unify the error code try { context.sparkContext.setJobDescription(command) val execution = context.sessionState.executePlan(context.sql(command).logicalPlan) hiveResponse = SQLExecution.withNewExecutionId(context.sparkSession, execution) { execution.hiveResultString() } tableSchema = getResultSetSchema(execution) new CommandProcessorResponse(0) } catch { case ae: AnalysisException => logDebug(s"Failed in [$command]", ae) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(ae), null, ae) case cause: Throwable => logError(s"Failed in [$command]", cause) new CommandProcessorResponse(1, ExceptionUtils.getStackTrace(cause), null, cause) } } override def close(): Int = { hiveResponse = null tableSchema = null 0 } override def getResults(res: JList[_]): Boolean = { if (hiveResponse == null) { false } else { res.asInstanceOf[JArrayList[String]].addAll(hiveResponse.asJava) hiveResponse = null true } } override def getSchema: Schema = tableSchema override def destroy() { super.destroy() hiveResponse = null tableSchema = null } }
Example 57
Source File: SparkSQLSessionManager.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.util.concurrent.Executors import org.apache.commons.logging.Log import org.apache.hadoop.hive.conf.HiveConf import org.apache.hadoop.hive.conf.HiveConf.ConfVars import org.apache.hive.service.cli.SessionHandle import org.apache.hive.service.cli.session.SessionManager import org.apache.hive.service.cli.thrift.TProtocolVersion import org.apache.hive.service.server.HiveServer2 import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.ReflectionUtils._ import org.apache.spark.sql.hive.thriftserver.server.SparkSQLOperationManager private[hive] class SparkSQLSessionManager(hiveServer: HiveServer2, sqlContext: SQLContext) extends SessionManager(hiveServer) with ReflectedCompositeService { private lazy val sparkSqlOperationManager = new SparkSQLOperationManager() override def init(hiveConf: HiveConf) { setSuperField(this, "operationManager", sparkSqlOperationManager) super.init(hiveConf) } override def openSession( protocol: TProtocolVersion, username: String, passwd: String, ipAddress: String, sessionConf: java.util.Map[String, String], withImpersonation: Boolean, delegationToken: String): SessionHandle = { val sessionHandle = super.openSession(protocol, username, passwd, ipAddress, sessionConf, withImpersonation, delegationToken) val session = super.getSession(sessionHandle) HiveThriftServer2.listener.onSessionCreated( session.getIpAddress, sessionHandle.getSessionId.toString, session.getUsername) val ctx = if (sqlContext.conf.hiveThriftServerSingleSession) { sqlContext } else { sqlContext.newSession() } ctx.setConf(HiveUtils.FAKE_HIVE_VERSION.key, HiveUtils.builtinHiveVersion) if (sessionConf != null && sessionConf.containsKey("use:database")) { ctx.sql(s"use ${sessionConf.get("use:database")}") } sparkSqlOperationManager.sessionToContexts.put(sessionHandle, ctx) sessionHandle } override def closeSession(sessionHandle: SessionHandle) { HiveThriftServer2.listener.onSessionClosed(sessionHandle.getSessionId.toString) super.closeSession(sessionHandle) sparkSqlOperationManager.sessionToActivePool.remove(sessionHandle) sparkSqlOperationManager.sessionToContexts.remove(sessionHandle) } }
Example 58
Source File: SparkSQLOperationManager.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.server import java.util.{Map => JMap} import java.util.concurrent.ConcurrentHashMap import org.apache.hive.service.cli._ import org.apache.hive.service.cli.operation.{ExecuteStatementOperation, Operation, OperationManager} import org.apache.hive.service.cli.session.HiveSession import org.apache.spark.internal.Logging import org.apache.spark.sql.SQLContext import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.hive.thriftserver.{ReflectionUtils, SparkExecuteStatementOperation} import org.apache.spark.sql.internal.SQLConf private[thriftserver] class SparkSQLOperationManager() extends OperationManager with Logging { val handleToOperation = ReflectionUtils .getSuperField[JMap[OperationHandle, Operation]](this, "handleToOperation") val sessionToActivePool = new ConcurrentHashMap[SessionHandle, String]() val sessionToContexts = new ConcurrentHashMap[SessionHandle, SQLContext]() override def newExecuteStatementOperation( parentSession: HiveSession, statement: String, confOverlay: JMap[String, String], async: Boolean): ExecuteStatementOperation = synchronized { val sqlContext = sessionToContexts.get(parentSession.getSessionHandle) require(sqlContext != null, s"Session handle: ${parentSession.getSessionHandle} has not been" + s" initialized or had already closed.") val conf = sqlContext.sessionState.conf val hiveSessionState = parentSession.getSessionState setConfMap(conf, hiveSessionState.getOverriddenConfigurations) setConfMap(conf, hiveSessionState.getHiveVariables) val runInBackground = async && conf.getConf(HiveUtils.HIVE_THRIFT_SERVER_ASYNC) val operation = new SparkExecuteStatementOperation(parentSession, statement, confOverlay, runInBackground)(sqlContext, sessionToActivePool) handleToOperation.put(operation.getHandle, operation) logDebug(s"Created Operation for $statement with session=$parentSession, " + s"runInBackground=$runInBackground") operation } def setConfMap(conf: SQLConf, confMap: java.util.Map[String, String]): Unit = { val iterator = confMap.entrySet().iterator() while (iterator.hasNext) { val kv = iterator.next() conf.setConfString(kv.getKey, kv.getValue) } } }
Example 59
Source File: SparkSQLEnv.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver import java.io.PrintStream import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.hive.{HiveExternalCatalog, HiveUtils} import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION import org.apache.spark.util.Utils def stop() { logDebug("Shutting down Spark SQL Environment") // Stop the SparkContext if (SparkSQLEnv.sparkContext != null) { sparkContext.stop() sparkContext = null sqlContext = null } } }
Example 60
Source File: RedisRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.redis import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.xsql.DataSourceManager._ import org.apache.spark.sql.xsql.execution.datasources.redis.RedisSpecialStrategy trait RedisRelationTrait { val parameters: Map[String, String] val schema: StructType lazy val redisConfig: RedisConfig = new RedisConfig(new RedisEndpoint(parameters.get(URL).get)) } case class RedisRelationImpl(val parameters: Map[String, String], val schema: StructType) extends RedisRelationTrait case class RedisRelation( parameters: Map[String, String], schema: StructType, filter: Seq[Expression] = Nil)(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedScan with RedisRelationTrait { override def toString: String = s"RedisRelation(${filter.mkString(",")})" val partitionNum: Int = parameters.getOrElse("partitionNum", "1").toInt override def buildScan(requiredColumns: Array[String]): RDD[Row] = { val filters = filter .map(RedisSpecialStrategy.getAttr) .groupBy(_._1) .map(tup => (tup._1, tup._2.map(_._2))) new RedisRDD(sqlContext.sparkContext, this, filters, requiredColumns, partitionNum) } }
Example 61
Source File: PythonSQLUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.api.python import java.io.InputStream import java.nio.channels.Channels import org.apache.spark.api.java.JavaRDD import org.apache.spark.api.python.PythonRDDServer import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.arrow.ArrowConverters import org.apache.spark.sql.types.DataType private[sql] object PythonSQLUtils { def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText) // This is needed when generating SQL documentation for built-in functions. def listBuiltinFunctionInfos(): Array[ExpressionInfo] = { FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray } private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer { override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = { // Create array to consume iterator so that we can safely close the inputStream val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray // Parallelize the record batches to create an RDD JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length)) } }
Example 62
Source File: JdbcRelationProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val resolver = sqlContext.conf.resolver val timeZoneId = sqlContext.conf.sessionLocalTimeZone val schema = JDBCRelation.getSchema(resolver, jdbcOptions) val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions) JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val options = new JdbcOptionsInWrite(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() try { val tableExists = JdbcUtils.tableExists(conn, options) if (tableExists) { mode match { case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, options.table, options) createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } case SaveMode.Append => val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '${options.table}' already exists. " + s"SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
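A sketch of how the provider above is driven through the DataFrame API; the connection URL, table names and credentials are placeholders, and the option names are the standard Spark JDBC ones.

import org.apache.spark.sql.SaveMode

val jdbcOpts = Map(
  "url"      -> "jdbc:postgresql://localhost:5432/mydb",
  "dbtable"  -> "public.users",
  "user"     -> "spark",
  "password" -> "secret")

val users = sqlContext.read.format("jdbc").options(jdbcOpts).load()

users.write
  .format("jdbc")
  .options(jdbcOpts + ("dbtable" -> "public.users_copy"))
  .mode(SaveMode.Overwrite)   // truncates, or drops and recreates, as handled above
  .save()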
Example 63
Source File: HadoopFsRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import java.util.Locale import scala.collection.mutable import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.catalog.BucketSpec import org.apache.spark.sql.execution.FileRelation import org.apache.spark.sql.sources.{BaseRelation, DataSourceRegister} import org.apache.spark.sql.types.{StructField, StructType} case class HadoopFsRelation( location: FileIndex, partitionSchema: StructType, dataSchema: StructType, bucketSpec: Option[BucketSpec], fileFormat: FileFormat, options: Map[String, String])(val sparkSession: SparkSession) extends BaseRelation with FileRelation { override def sqlContext: SQLContext = sparkSession.sqlContext private def getColName(f: StructField): String = { if (sparkSession.sessionState.conf.caseSensitiveAnalysis) { f.name } else { f.name.toLowerCase(Locale.ROOT) } } val overlappedPartCols = mutable.Map.empty[String, StructField] partitionSchema.foreach { partitionField => if (dataSchema.exists(getColName(_) == getColName(partitionField))) { overlappedPartCols += getColName(partitionField) -> partitionField } } // When data and partition schemas have overlapping columns, the output // schema respects the order of the data schema for the overlapping columns, and it // respects the data types of the partition schema. val schema: StructType = { StructType(dataSchema.map(f => overlappedPartCols.getOrElse(getColName(f), f)) ++ partitionSchema.filterNot(f => overlappedPartCols.contains(getColName(f)))) } def partitionSchemaOption: Option[StructType] = if (partitionSchema.isEmpty) None else Some(partitionSchema) override def toString: String = { fileFormat match { case source: DataSourceRegister => source.shortName() case _ => "HadoopFiles" } } override def sizeInBytes: Long = { val compressionFactor = sqlContext.conf.fileCompressionFactor (location.sizeInBytes * compressionFactor).toLong } override def inputFiles: Array[String] = location.inputFiles }
Example 64
Source File: package.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import scala.reflect.ClassTag import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.internal.SessionState import org.apache.spark.sql.types.StructType package object state { implicit class StateStoreOps[T: ClassTag](dataRDD: RDD[T]) { private[streaming] def mapPartitionsWithStateStore[U: ClassTag]( stateInfo: StatefulOperatorStateInfo, keySchema: StructType, valueSchema: StructType, indexOrdinal: Option[Int], sessionState: SessionState, storeCoordinator: Option[StateStoreCoordinatorRef])( storeUpdateFunction: (StateStore, Iterator[T]) => Iterator[U]): StateStoreRDD[T, U] = { val cleanedF = dataRDD.sparkContext.clean(storeUpdateFunction) val wrappedF = (store: StateStore, iter: Iterator[T]) => { // Abort the state store in case of error TaskContext.get().addTaskCompletionListener[Unit](_ => { if (!store.hasCommitted) store.abort() }) cleanedF(store, iter) } new StateStoreRDD( dataRDD, wrappedF, stateInfo.checkpointLocation, stateInfo.queryRunId, stateInfo.operatorId, stateInfo.storeVersion, keySchema, valueSchema, indexOrdinal, sessionState, storeCoordinator) } } }
Example 65
Source File: DDLSourceLoadSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ // please note that the META-INF/services had to be modified for the test directory for this to work class DDLSourceLoadSuite extends DataSourceTest with SharedSQLContext { test("data sources with the same name - internal data sources") { val e = intercept[AnalysisException] { spark.read.format("Fluet da Bomb").load() } assert(e.getMessage.contains("Multiple sources found for Fluet da Bomb")) } test("data sources with the same name - internal data source/external data source") { assert(spark.read.format("datasource").load().schema == StructType(Seq(StructField("longType", LongType, nullable = false)))) } test("data sources with the same name - external data sources") { val e = intercept[AnalysisException] { spark.read.format("Fake external source").load() } assert(e.getMessage.contains("Multiple sources found for Fake external source")) } test("load data source from format alias") { assert(spark.read.format("gathering quorum").load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))) } test("specify full classname with duplicate formats") { assert(spark.read.format("org.apache.spark.sql.sources.FakeSourceOne") .load().schema == StructType(Seq(StructField("stringType", StringType, nullable = false)))) } } class FakeSourceOne extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceTwo extends RelationProvider with DataSourceRegister { def shortName(): String = "Fluet da Bomb" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("integerType", IntegerType, nullable = false))) } } class FakeSourceThree extends RelationProvider with DataSourceRegister { def shortName(): String = "gathering quorum" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("stringType", StringType, nullable = false))) } } class FakeSourceFour extends RelationProvider with DataSourceRegister { def shortName(): String = "datasource" override def createRelation(cont: SQLContext, param: Map[String, String]): BaseRelation = new BaseRelation { override def sqlContext: SQLContext = cont override def schema: StructType = StructType(Seq(StructField("longType", LongType, nullable = false))) } }
Example 66
Source File: SharedSparkSession.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import scala.concurrent.duration._ import org.scalatest.{BeforeAndAfterEach, Suite} import org.scalatest.concurrent.Eventually import org.apache.spark.{DebugFilesystem, SparkConf} import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.sql.catalyst.optimizer.ConvertToLocalRelation import org.apache.spark.sql.internal.SQLConf protected override def afterAll(): Unit = { try { super.afterAll() } finally { try { if (_spark != null) { try { _spark.sessionState.catalog.reset() } finally { _spark.stop() _spark = null } } } finally { SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } } protected override def beforeEach(): Unit = { super.beforeEach() DebugFilesystem.clearOpenStreams() } protected override def afterEach(): Unit = { super.afterEach() // Clear all persistent datasets after each test spark.sharedState.cacheManager.clearCache() // files can be closed from other threads, so wait a bit // normally this doesn't take more than 1s eventually(timeout(10.seconds), interval(2.seconds)) { DebugFilesystem.assertNoOpenStreams() } } }
Example 67
Source File: BlockingSource.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util import java.util.concurrent.CountDownLatch import org.apache.spark.sql.{SQLContext, _} import org.apache.spark.sql.execution.streaming.{LongOffset, Offset, Sink, Source} import org.apache.spark.sql.sources.{StreamSinkProvider, StreamSourceProvider} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{IntegerType, StructField, StructType} class BlockingSource extends StreamSourceProvider with StreamSinkProvider { private val fakeSchema = StructType(StructField("a", IntegerType) :: Nil) override def sourceSchema( spark: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String]): (String, StructType) = { ("dummySource", fakeSchema) } override def createSource( spark: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String]): Source = { BlockingSource.latch.await() new Source { override def schema: StructType = fakeSchema override def getOffset: Option[Offset] = Some(new LongOffset(0)) override def getBatch(start: Option[Offset], end: Offset): DataFrame = { import spark.implicits._ Seq[Int]().toDS().toDF() } override def stop() {} } } override def createSink( spark: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = {} } } } object BlockingSource { var latch: CountDownLatch = null }
Example 68
Source File: MockSourceProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming.util

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.sources.StreamSourceProvider
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class MockSourceProvider extends StreamSourceProvider {
  override def sourceSchema(
      spark: SQLContext,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): (String, StructType) = {
    ("dummySource", MockSourceProvider.fakeSchema)
  }

  override def createSource(
      spark: SQLContext,
      metadataPath: String,
      schema: Option[StructType],
      providerName: String,
      parameters: Map[String, String]): Source = {
    MockSourceProvider.sourceProviderFunction()
  }
}

object MockSourceProvider {
  // Function to generate sources. May provide multiple sources if the user implements such a
  // function.
  private var sourceProviderFunction: () => Source = _

  final val fakeSchema = StructType(StructField("a", IntegerType) :: Nil)

  def withMockSources(source: Source, otherSources: Source*)(f: => Unit): Unit = {
    var i = 0
    val sources = source +: otherSources
    sourceProviderFunction = () => {
      val source = sources(i % sources.length)
      i += 1
      source
    }
    try {
      f
    } finally {
      sourceProviderFunction = null
    }
  }
}
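withMockSources swaps the provider function for the duration of a block; a hypothetical call site could look like this (the mocked Source values are placeholders):

import org.apache.spark.sql.execution.streaming.Source
import org.apache.spark.sql.streaming.util.MockSourceProvider

val sourceA: Source = ???   // e.g. a Mockito mock or a small in-memory Source
val sourceB: Source = ???

MockSourceProvider.withMockSources(sourceA, sourceB) {
  // each createSource call made by the query below hands out the next mocked Source
  val df = spark.readStream
    .format(classOf[MockSourceProvider].getName)
    .load()
  // ... run the streaming query and assert against df ...
}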
Example 69
Source File: SapHiveContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.SparkContext import org.apache.spark.sql.{CommonSapSQLContext, SQLContext} import org.apache.spark.sql.execution.CacheManager import org.apache.spark.sql.execution.ui.SQLListener import org.apache.spark.sql.hive.client.{ClientInterface, ClientWrapper} class SapHiveContext( @transient sparkContext: SparkContext, cacheManager: CacheManager, listener: SQLListener, @transient execHive: ClientWrapper, @transient metaHive: ClientInterface, isRootContext: Boolean) extends ExtendableHiveContext( sparkContext, cacheManager, listener, execHive, metaHive, isRootContext) with CommonSapSQLContext { def this(sc: SparkContext) = this(sc, new CacheManager, SQLContext.createListenerAndUI(sc), null, null, true) override def newSession(): HiveContext = new SapHiveContext( sparkContext = this.sparkContext, cacheManager = this.cacheManager, listener = this.listener, executionHive.newSession(), metadataHive.newSession(), isRootContext = false) }
Example 70
Source File: BasicCurrencyConversionFunction.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.basic import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.currency._ import org.apache.spark.sql.util.ValidatingPropertyMap._ import scala.util.Try protected object BasicCurrencyConversionConfig { private def updateRatesMapByTable(ratesTable: String, sqlContext: SQLContext): Unit = { val ratesTableData = sqlContext.sql(s"SELECT * FROM $ratesTable").collect() ratesTableData.foreach { row => val from = row.getString(0) val to = row.getString(1) val date = row.getString(2).replaceAll("-", "").toInt val rate = Try(row.getDecimal(3)).recover { case ex: ClassCastException => new java.math.BigDecimal(row.getDouble(3)) }.get ratesMap.put((from, to), date, rate) } } }
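The function reads the rates table positionally — getString(0) and getString(1) for the currency pair, a "yyyy-MM-dd" date in column 2, and a decimal rate in column 3 — so any table registered for it has to follow that column order. A small illustrative setup (table and column names are made up):

val rates = sqlContext.createDataFrame(Seq(
  ("USD", "EUR", "2016-01-01", BigDecimal("0.92")),
  ("EUR", "USD", "2016-01-01", BigDecimal("1.09"))
)).toDF("from_currency", "to_currency", "rate_date", "rate")

rates.registerTempTable("currency_rates")   // then pass "currency_rates" as ratesTable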
Example 71
Source File: CreateTableUsingTemporaryAwareCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, DatasourceResolver, Row, SQLContext}
import org.apache.spark.sql.catalyst.TableIdentifierUtils._
import org.apache.spark.sql.catalyst.CaseSensitivityUtils._

  // scalastyle:off method.length
  private def resolveDataSource(sqlContext: SQLContext,
                                dataSource: Any,
                                tableId: TableIdentifier): ResolvedDataSource = {
    dataSource match {
      case drp: PartitionedRelationProvider =>
        if (userSpecifiedSchema.isEmpty) {
          new ResolvedDataSource(drp.getClass,
            drp.createRelation(
              sqlContext,
              tableId.toSeq,
              new CaseInsensitiveMap(options),
              partitioningFunction,
              partitioningColumns,
              isTemporary,
              allowExisting))
        } else {
          new ResolvedDataSource(drp.getClass,
            drp.createRelation(
              sqlContext,
              tableId.toSeq,
              new CaseInsensitiveMap(options),
              userSpecifiedSchema.get,
              partitioningFunction,
              partitioningColumns,
              isTemporary,
              allowExisting))
        }
      case drp: TemporaryAndPersistentSchemaRelationProvider if userSpecifiedSchema.nonEmpty =>
        new ResolvedDataSource(drp.getClass,
          drp.createRelation(
            sqlContext,
            tableId.toSeq,
            new CaseInsensitiveMap(options),
            userSpecifiedSchema.get,
            isTemporary,
            allowExisting))
      case drp: TemporaryAndPersistentRelationProvider =>
        new ResolvedDataSource(drp.getClass,
          drp.createRelation(
            sqlContext,
            tableId.toSeq,
            new CaseInsensitiveMap(options),
            isTemporary,
            allowExisting))
      case _ =>
        ResolvedDataSource(sqlContext, userSpecifiedSchema, partitionColumns, provider, options)
    }
  }
  // scalastyle:on method.length
}
Example 72
Source File: SqlContextAccessor.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import scala.language.implicitConversions object SqlContextAccessor { implicit def sqlContextToCatalogAccessable(sqlContext: SQLContext): SqlContextCatalogAccessor = new SqlContextCatalogAccessor(sqlContext) class SqlContextCatalogAccessor(sqlContext: SQLContext) extends SQLContext(sqlContext.sparkContext) { def registerRawPlan(lp: LogicalPlan, tableName: String): Unit = { sqlContext.catalog.registerTable(TableIdentifier(tableName), lp) } } }
Example 73
Source File: DropRunnableCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.Logging
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.DropRelation
import org.apache.spark.sql.{Row, SQLContext}

import scala.util.Try

private[sql] case class DropRunnableCommand(toDrop: Map[String, Option[DropRelation]])
  extends RunnableCommand
  with Logging {

  override def run(sqlContext: SQLContext): Seq[Row] = {
    toDrop.foreach {
      case (name, dropOption) =>
        sqlContext.dropTempTable(name)
        dropOption.foreach { dropRelation =>
          Try {
            dropRelation.dropTable()
          }.recover {
            // When the provider indicates an exception while dropping, we still have to continue
            // dropping all the referencing tables, otherwise there could be integrity issues
            case ex =>
              logWarning(
                s"""Error occurred when dropping table '$name':${ex.getMessage}, however
                   |table '$name' will still be dropped from Spark catalog.
                 """.stripMargin)
          }.get
        }
    }
    Seq.empty
  }
}
Example 74
Source File: ShowTablesUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.sources.DatasourceCatalog import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.types.{StringType, StructField, StructType} private[sql] case class ShowTablesUsingCommand(provider: String, options: Map[String, String]) extends LogicalPlan with RunnableCommand { override def output: Seq[Attribute] = StructType( StructField("TABLE_NAME", StringType, nullable = false) :: StructField("IS_TEMPORARY", StringType, nullable = false) :: StructField("KIND", StringType, nullable = false) :: Nil ).toAttributes override def run(sqlContext: SQLContext): Seq[Row] = { val dataSource: Any = DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider) dataSource match { case describableRelation: DatasourceCatalog => describableRelation .getRelations(sqlContext, new CaseInsensitiveMap(options)) .map(relationInfo => Row( relationInfo.name, relationInfo.isTemporary.toString.toUpperCase, relationInfo.kind.toUpperCase)) case _ => throw new RuntimeException(s"The provided data source $provider does not support " + "showing its relations.") } } }
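In the HANAVora extensions this command is normally reached through the extended SQL dialect rather than constructed directly; a hedged sketch (the provider name is taken from the dstest test package used in the later examples, and the exact dialect syntax is an assumption):

sqlContext.sql("SHOW TABLES USING com.sap.spark.dstest").show()
// yields rows with TABLE_NAME, IS_TEMPORARY and KIND, as declared in `output` above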
Example 75
Source File: DeepDescribeCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources.describable.Describable import org.apache.spark.sql.sources.describable.FieldLike.StructFieldLike import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} private[sql] case class DeepDescribeCommand( relation: Describable) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val description = relation.describe() Seq(description match { case r: Row => r case default => Row(default) }) } override def output: Seq[Attribute] = { relation.describeOutput match { case StructType(fields) => fields.map(StructFieldLike.toAttribute) case other => AttributeReference("value", other)() :: Nil } } }
Example 76
Source File: DescribeTableUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.TableIdentifierUtils._
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.sources.{DatasourceCatalog, RelationInfo}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}

private[sql] case class DescribeTableUsingCommand(
    name: TableIdentifier,
    provider: String,
    options: Map[String, String])
  extends LogicalPlan
  with RunnableCommand {

  override def output: Seq[Attribute] = StructType(
    StructField("TABLE_NAME", StringType, nullable = false) ::
    StructField("DDL_STMT", StringType, nullable = false) ::
    Nil
  ).toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    // Convert the table name according to the case-sensitivity settings
    val tableId = name.toSeq
    val resolver = DatasourceResolver.resolverFor(sqlContext)
    val catalog = resolver.newInstanceOfTyped[DatasourceCatalog](provider)

    Seq(catalog
      .getRelation(sqlContext, tableId, new CaseInsensitiveMap(options)) match {
      case None => Row("", "")
      case Some(RelationInfo(relName, _, _, ddl, _)) => Row(relName, ddl.getOrElse(""))
    })
  }
}
Example 77
Source File: RawDDLCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.sources.RawDDLObjectType.RawDDLObjectType import org.apache.spark.sql.sources.RawDDLStatementType.RawDDLStatementType import org.apache.spark.sql.sources.{RawSqlSourceProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.execution.RunnableCommand private[sql] case class RawDDLCommand( identifier: String, objectType: RawDDLObjectType, statementType: RawDDLStatementType, sparkSchema: Option[StructType], ddlStatement: String, provider: String, options: Map[String, String]) extends RunnableCommand { override def run(sqlContext: SQLContext): Seq[Row] = { val dataSource: Any = ResolvedDataSource.lookupDataSource(provider).newInstance() dataSource match { case rsp: RawSqlSourceProvider => rsp.executeDDL(identifier, objectType, statementType, sparkSchema, ddlStatement, options) case _ => throw new RuntimeException("The provided datasource does not support " + "executing raw DDL statements.") } Seq.empty[Row] } }
Example 78
Source File: RegisterAllTablesCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.CaseSensitivityUtils._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.execution.datasources.SqlContextAccessor._
import org.apache.spark.sql.sources.{LogicalPlanSource, RegisterAllTableRelations}
import org.apache.spark.sql.util.CollectionUtils._
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}

    relations.map {
      case (name, source) =>
        val lp = source.logicalPlan(sqlContext)
        if (lp.resolved) {
          sqlContext.validatedSchema(lp.schema).recover {
            case d: DuplicateFieldsException =>
              throw new RuntimeException(
                s"Provider '$provider' returned a relation that has duplicate fields.",
                d)
          }.get
        } else {
          // TODO(AC): With the new view interface, this can be checked
          logWarning(s"Adding relation $name with potentially unreachable fields.")
        }
        name -> lp
    }.foreach {
      case (name, plan) =>
        sqlContext.registerRawPlan(plan, name)
    }
  }
}
Example 79
Source File: AbstractViewCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources.{AbstractViewProvider, ViewKind} import org.apache.spark.sql.{DatasourceResolver, SQLContext} def withValidProvider[B](sqlContext: SQLContext)(b: AbstractViewProvider[_] => B): B = { val resolver = DatasourceResolver.resolverFor(sqlContext) AbstractViewProvider.matcherFor(kind)(resolver.newInstanceOf(provider)) match { case Some(viewProvider) => b(viewProvider) case _ => throw new ProviderException(provider, "Does not support the " + s"execution of ${this.getClass.getSimpleName}") } } } class ProviderException(val provider: String, val reason: String) extends Exception(s"Exception using provider $provider: $reason")
Example 80
Source File: RegisterTableCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.catalyst.CaseSensitivityUtils._
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.execution.datasources.SqlContextAccessor._
import org.apache.spark.sql.sources.RegisterAllTableRelations
import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext}

    val relation = resolvedProvider.getTableRelation(tableName, sqlContext, options)

    relation match {
      case None =>
        sys.error(s"Relation $tableName was not found in the catalog.")
      case Some(r) =>
        val lp = r.logicalPlan(sqlContext)
        if (lp.resolved) {
          sqlContext.validatedSchema(lp.schema).recover {
            case d: DuplicateFieldsException =>
              throw new RuntimeException(
                s"Provider '$provider' returned a relation that has duplicate fields.",
                d)
          }.get
        } else {
          // TODO(AC): With the new view interface, this can be checked
          logWarning(s"Adding relation $tableName with potentially unreachable fields.")
        }
        sqlContext.registerRawPlan(lp, tableName)
      }
    }
    Seq.empty
  }
}
Example 81
Source File: CreateTableStrategy.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{DatasourceResolver, SQLContext, Strategy}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.{ExecutedCommand, SparkPlan}
import org.apache.spark.sql.sources.TemporaryAndPersistentNature

private[sql] case class CreateTableStrategy(sqlContext: SQLContext) extends Strategy {

  override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    // Currently we only handle cases where the user wants to instantiate a
    // persistent relation any other cases has to be handled by the datasource itself
    case CreateTableUsing(tableName,
        userSpecifiedSchema,
        provider,
        temporary,
        options,
        allowExisting,
        _) =>
      DatasourceResolver.resolverFor(sqlContext).newInstanceOf(provider) match {
        case _: TemporaryAndPersistentNature =>
          ExecutedCommand(CreateTableUsingTemporaryAwareCommand(tableName,
            userSpecifiedSchema,
            Array.empty[String],
            None,
            None,
            provider,
            options,
            temporary,
            allowExisting)) :: Nil
        case _ => Nil
      }
    case CreateTablePartitionedByUsing(tableId, userSpecifiedSchema, provider,
        partitioningFunction, partitioningColumns, temporary, options, allowExisting, _) =>
      ResolvedDataSource.lookupDataSource(provider).newInstance() match {
        case _: TemporaryAndPersistentNature =>
          ExecutedCommand(CreateTableUsingTemporaryAwareCommand(
            tableId,
            userSpecifiedSchema,
            Array.empty[String],
            Some(partitioningFunction),
            Some(partitioningColumns),
            provider,
            options,
            isTemporary = false,
            allowExisting)) :: Nil
        case _ => Nil
      }
    case _ => Nil
  }
}
Example 82
Source File: DescCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands.hive import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.types.StringType import org.apache.spark.sql.{Row, SQLContext} case class DescCommand(ident: TableIdentifier) extends HiveRunnableCommand { override protected val commandName: String = s"DESC $ident" override def execute(sqlContext: SQLContext): Seq[Row] = { val plan = sqlContext.catalog.lookupRelation(ident) if (plan.resolved) { plan.schema.map { field => Row(field.name, field.dataType.simpleString, None) } } else { Seq.empty } } override lazy val output: Seq[Attribute] = AttributeReference("col_name", StringType)() :: AttributeReference("data_type", StringType)() :: AttributeReference("comment", StringType)() :: Nil }
Example 83
Source File: inferSchemaCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands

import org.apache.spark.sql.catalyst.analysis.systables.SchemaEnumeration
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LeafNode
import org.apache.spark.sql.execution.RunnableCommand
import org.apache.spark.sql.execution.datasources.parquet.ParquetRelation
import org.apache.spark.sql.execution.tablefunctions.DataTypeExtractor
import org.apache.spark.sql.hive.orc.OrcRelation
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SQLContext}

case class InferSchemaCommand(path: String, fileType: FileType) extends RunnableCommand {
  override lazy val output: Seq[Attribute] = InferSchemaCommand.schema.toAttributes

  override def run(sqlContext: SQLContext): Seq[Row] = {
    val fileSchema = fileType.readSchema(sqlContext, path)
    fileSchema.zipWithIndex.map {
      case (StructField(name, dataType, nullable, _), idx) =>
        val dataTypeExtractor = DataTypeExtractor(dataType)
        Row(
          name,
          idx + 1, // idx + 1 since the ordinal position has to start at 1
          nullable,
          dataTypeExtractor.inferredSqlType,
          dataTypeExtractor.numericPrecision.orNull,
          dataTypeExtractor.numericPrecisionRadix.orNull,
          dataTypeExtractor.numericScale.orNull)
    }
  }
}

object InferSchemaCommand extends SchemaEnumeration {
  val name = Field("COLUMN_NAME", StringType, nullable = false)
  val ordinalPosition = Field("ORDINAL_POSITION", IntegerType, nullable = false)
  val isNullable = Field("IS_NULLABLE", BooleanType, nullable = false)
  val dataType = Field("DATA_TYPE", StringType, nullable = false)
  val numericPrecision = Field("NUMERIC_PRECISION", IntegerType, nullable = true)
  val numericPrecisionRadix = Field("NUMERIC_PRECISION_RADIX", IntegerType, nullable = true)
  val numericScale = Field("NUMERIC_SCALE", IntegerType, nullable = true)
}
Example 84
Source File: ShowPartitionFunctionsUsingCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.commands import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.execution.RunnableCommand import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DatasourceResolver, DefaultDatasourceResolver, Row, SQLContext} case class ShowPartitionFunctionsUsingCommand( provider: String, options: Map[String, String]) extends RunnableCommand { def run(sqlContext: SQLContext): Seq[Row] = { val resolver = DatasourceResolver.resolverFor(sqlContext) val pFunProvider = resolver.newInstanceOfTyped[PartitioningFunctionProvider](provider) val pFuns = pFunProvider.getAllPartitioningFunctions(sqlContext, options) pFuns.map { fun => val (splittersOpt, rightClosedOpt) = fun match { case RangeSplitPartitioningFunction(_, _, splitters, rightClosed) => (Some(splitters), Some(rightClosed)) case _ => (None, None) } val (startOpt, endOpt, intervalTypeOpt, intervalValueOpt) = fun match { case RangeIntervalPartitioningFunction(_, _, start, end, strideParts) => (Some(start), Some(end), Some(strideParts.productPrefix), Some(strideParts.n)) case _ => (None, None, None, None) } val partitionsNoOpt = fun match { case HashPartitioningFunction(_, _, partitionsNo) => partitionsNo case s: SimpleDataType => None } Row(fun.name, fun.productPrefix, fun.dataTypes.map(_.toString).mkString(","), splittersOpt.map(_.mkString(",")).orNull, rightClosedOpt.orNull, startOpt.orNull, endOpt.orNull, intervalTypeOpt.orNull, intervalValueOpt.orNull, partitionsNoOpt.orNull) } } override lazy val output: Seq[Attribute] = StructType( StructField("name", StringType, nullable = false) :: StructField("kind", StringType, nullable = false) :: StructField("dataTypes", StringType, nullable = false) :: StructField("splitters", StringType, nullable = true) :: StructField("rightClosed", BooleanType, nullable = true) :: StructField("start", IntegerType, nullable = true) :: StructField("end", IntegerType, nullable = true) :: StructField("intervalType", StringType, nullable = true) :: StructField("intervalValue", IntegerType, nullable = true) :: StructField("partitionsNo", IntegerType, nullable = true) :: Nil ).toAttributes }
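The SQL entry point for this command appears verbatim in the MockedDefaultSourceSuite further below (Example 101); a minimal call against that same mock provider looks like:

val funcs = sqlContext.sql("SHOW PARTITION FUNCTIONS USING com.sap.spark.dsmock")
funcs.select("name", "kind", "dataTypes").show()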
Example 85
Source File: PartitionedRelationProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types.StructType trait PartitionedRelationProvider extends SchemaRelationProvider with TemporaryAndPersistentNature { def createRelation(sqlContext: SQLContext, tableName: Seq[String], parameters: Map[String, String], partitioningFunction: Option[String], partitioningColumns: Option[Seq[String]], isTemporary: Boolean, allowExisting: Boolean): BaseRelation def createRelation(sqlContext: SQLContext, tableName: Seq[String], parameters: Map[String, String], schema: StructType, partitioningFunction: Option[String], partitioningColumns: Option[Seq[String]], isTemporary: Boolean, allowExisting: Boolean): BaseRelation }
Example 86
Source File: AbstractViewProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.plans.logical.view.{AbstractView, Persisted} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import scala.reflect._ def name: String } abstract class BaseAbstractViewProvider[A <: AbstractView with Persisted: ClassTag] extends AbstractViewProvider[A] { val tag = implicitly[ClassTag[A]] } object AbstractViewProvider { def matcherFor(kind: ViewKind)(any: Any): Option[AbstractViewProvider[_]] = { val multiProvider = MultiAbstractViewProvider.matcherFor(kind) any match { case provider: AbstractViewProvider[_] if tagMatches(provider.tag) => Some(provider) case multiProvider(provider) => Some(provider) case _ => None } } private def tagMatches[A: ClassTag](tag: ClassTag[_]): Boolean = { classTag[A].runtimeClass.isAssignableFrom(tag.runtimeClass) } } case class CreateViewInput( sqlContext: SQLContext, plan: LogicalPlan, viewSql: String, options: Map[String, String], identifier: TableIdentifier, allowExisting: Boolean) case class DropViewInput( sqlContext: SQLContext, options: Map[String, String], identifier: TableIdentifier, allowNotExisting: Boolean)
Example 87
Source File: RawSqlSourceProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import java.util.concurrent.atomic.AtomicReference import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.Statistics import org.apache.spark.sql.execution.{PhysicalRDD, RDDConversions, SparkPlan} import org.apache.spark.sql.sources.RawDDLObjectType.RawDDLObjectType import org.apache.spark.sql.sources.RawDDLStatementType.RawDDLStatementType import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext} case object RawDDLObjectType { sealed trait RawDDLObjectType { val name: String override def toString: String = name } sealed abstract class BaseRawDDLObjectType(val name: String) extends RawDDLObjectType sealed trait RawData case object PartitionFunction extends BaseRawDDLObjectType("partition function") case object PartitionScheme extends BaseRawDDLObjectType("partition scheme") case object Collection extends BaseRawDDLObjectType("collection") with RawData case object Series extends BaseRawDDLObjectType("table") with RawData case object Graph extends BaseRawDDLObjectType("graph") with RawData } case object RawDDLStatementType { sealed trait RawDDLStatementType case object Create extends RawDDLStatementType case object Drop extends RawDDLStatementType case object Append extends RawDDLStatementType case object Load extends RawDDLStatementType } protected def calculateSchema(): StructType }
Example 88
Source File: LogicalPlanSource.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.{DataFrame, SQLContext}

case class CreatePersistentViewSource(createViewStatement: String, handle: ViewHandle)
  extends LogicalPlanSource {

  def logicalPlan(sqlContext: SQLContext): LogicalPlan = {
    sqlContext.parseSql(createViewStatement) match {
      // This might seem repetitive but in the future the commands might drastically differ
      case CreatePersistentViewCommand(kind, _, plan, _, provider, _, _) =>
        kind.createPersisted(plan, handle, provider)
      case unknown =>
        throw new RuntimeException(s"Could not extract view query from $unknown")
    }
  }
}
Example 89
Source File: TemporaryAndPersistentSchemaRelationProvider.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.sql.SQLContext import org.apache.spark.sql.types._ trait TemporaryAndPersistentSchemaRelationProvider extends SchemaRelationProvider with TemporaryAndPersistentNature { def createRelation(sqlContext: SQLContext, tableName: Seq[String], parameters: Map[String, String], schema: StructType, isTemporary: Boolean, allowExisting: Boolean): BaseRelation }
Example 90
Source File: ResolveSelectUsing.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SelectUsing, UnresolvedSelectUsing} import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.sources.RawSqlSourceProvider import org.apache.spark.sql.{DatasourceResolver, SQLContext} private[sql] case class ResolveSelectUsing(sqlContext: SQLContext) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case UnresolvedSelectUsing(sqlCommand, provider, expectedSchema, options) => { val resolver = DatasourceResolver.resolverFor(sqlContext) val rawSqlProvider = resolver.newInstanceOfTyped[RawSqlSourceProvider](provider) val execution = rawSqlProvider.executionOf(sqlContext, options, sqlCommand, expectedSchema) SelectUsing(execution) } } }
Example 91
Source File: ResolveInferSchemaCommand.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.{AnalysisException, SQLContext} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.rules.Rule import org.apache.spark.sql.sources.commands.{InferSchemaCommand, Orc, Parquet, UnresolvedInferSchemaCommand} import scala.util.Try case class ResolveInferSchemaCommand(sqlContext: SQLContext) extends Rule[LogicalPlan] { override def apply(plan: LogicalPlan): LogicalPlan = plan.transform { case UnresolvedInferSchemaCommand(path, explicitFileType) => val fileType = explicitFileType.getOrElse(path.toLowerCase match { case p if p.endsWith(".orc") => Orc case p if p.endsWith(".parquet") => Parquet case invalid => throw new AnalysisException(s"Could not determine file format of '$path'") }) InferSchemaCommand(path, fileType) } }
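The effect of the rule is easiest to see on concrete paths; the mapping below is implied directly by the pattern match above (the paths are illustrative):

// UnresolvedInferSchemaCommand("/data/events.parquet", None)  => InferSchemaCommand("/data/events.parquet", Parquet)
// UnresolvedInferSchemaCommand("/data/events.orc", None)      => InferSchemaCommand("/data/events.orc", Orc)
// UnresolvedInferSchemaCommand("/data/events.csv", None)      => AnalysisException("Could not determine file format of ...")
// An explicitly given file type always wins over the extension:
// UnresolvedInferSchemaCommand("/data/events.dat", Some(Orc)) => InferSchemaCommand("/data/events.dat", Orc)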
Example 92
Source File: dependenciesSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.TableDependencyCalculator import org.apache.spark.sql.sources.{RelationKind, Table} import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.apache.spark.sql.{Row, SQLContext} object DependenciesSystemTableProvider extends SystemTableProvider with LocalSpark { override def execute(): Seq[Row] = { val tables = getTables(sqlContext.catalog) val dependentsMap = buildDependentsMap(tables) def kindOf(tableIdentifier: TableIdentifier): String = tables .get(tableIdentifier) .map(plan => RelationKind.kindOf(plan).getOrElse(Table).name) .getOrElse(DependenciesSystemTable.UnknownType) .toUpperCase dependentsMap.flatMap { case (tableIdent, dependents) => val curKind = kindOf(tableIdent) dependents.map { dependent => val dependentKind = kindOf(dependent) Row( tableIdent.database.orNull, tableIdent.table, curKind, dependent.database.orNull, dependent.table, dependentKind, ReferenceDependency.id) } }.toSeq } override val schema: StructType = DependenciesSystemTable.schema } object DependenciesSystemTable extends SchemaEnumeration { val baseSchemaName = Field("BASE_SCHEMA_NAME", StringType, nullable = true) val baseObjectName = Field("BASE_OBJECT_NAME", StringType, nullable = false) val baseObjectType = Field("BASE_OBJECT_TYPE", StringType, nullable = false) val dependentSchemaName = Field("DEPENDENT_SCHEMA_NAME", StringType, nullable = true) val dependentObjectName = Field("DEPENDENT_OBJECT_NAME", StringType, nullable = false) val dependentObjectType = Field("DEPENDENT_OBJECT_TYPE", StringType, nullable = false) val dependencyType = Field("DEPENDENCY_TYPE", IntegerType, nullable = false) private[DependenciesSystemTable] val UnknownType = "UNKNOWN" }
Example 93
Source File: partitionFunctionSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.sql.execution.tablefunctions.OutputFormatter import org.apache.spark.sql.sources._ import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.apache.spark.sql.util.GenericUtil._ private def typeNameOf(f: PartitionFunction): String = f match { case _: RangePartitionFunction => "RANGE" case _: BlockPartitionFunction => "BLOCK" case _: HashPartitionFunction => "HASH" } } object PartitionFunctionSystemTable extends SchemaEnumeration { val id = Field("ID", StringType, nullable = false) val functionType = Field("TYPE", StringType, nullable = false) val columnName = Field("COLUMN_NAME", StringType, nullable = false) val columnType = Field("COLUMN_TYPE", StringType, nullable = false) val boundaries = Field("BOUNDARIES", StringType, nullable = true) val block = Field("BLOCK_SIZE", IntegerType, nullable = true) val partitions = Field("PARTITIONS", IntegerType, nullable = true) val minP = Field("MIN_PARTITIONS", IntegerType, nullable = true) val maxP = Field("MAX_PARTITIONS", IntegerType, nullable = true) }
Example 94
Source File: sessionSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{Row, SQLConf, SQLContext} private def allSettingsOf(conf: SQLConf): Map[String, String] = { val setConfs = conf.getAllConfs val defaultConfs = conf.getAllDefinedConfs.collect { case (key, default, _) if !setConfs.contains(key) => key -> default } setConfs ++ defaultConfs } override def schema: StructType = SessionSystemTable.schema } object SessionSystemTable extends SchemaEnumeration { val section = Field("SECTION", StringType, nullable = false) val key = Field("KEY", StringType, nullable = false) val value = Field("VALUE", StringType, nullable = true) }
Example 95
Source File: systemTableRegistry.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.analysis.ResolveSystemTables import org.apache.spark.sql.catalyst.plans.logical.{UnresolvedProviderBoundSystemTable, UnresolvedSparkLocalSystemTable, UnresolvedSystemTable} object SystemTableRegistry extends SimpleSystemTableRegistry { register("tables", TablesSystemTableProvider) register("object_dependencies", DependenciesSystemTableProvider) register("table_metadata", MetadataSystemTableProvider) register("schemas", SchemaSystemTableProvider) register("session_context", SessionSystemTableProvider) register("partition_functions", PartitionFunctionSystemTableProvider) register("partition_schemes", PartitionSchemeSystemTableProvider) register("relation_sql_name", RelationMappingSystemTableProvider) }
Example 96
Source File: tablesSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources.commands.WithOrigin import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.util.CollectionUtils.CaseInsensitiveMap import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} import org.apache.spark.sql.catalyst.CaseSensitivityUtils._ object TablesSystemTableProvider extends SystemTableProvider with LocalSpark with ProviderBound { override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = DatasourceResolver .resolverFor(sqlContext) .newInstanceOfTyped[DatasourceCatalog](provider) match { case catalog: DatasourceCatalog with DatasourceCatalogPushDown => catalog.getRelations(sqlContext, options, requiredColumns, filters.toSeq.merge) case catalog: DatasourceCatalog => val values = catalog .getRelations(sqlContext, new CaseInsensitiveMap(options)) .map(relationInfo => Row( relationInfo.name, relationInfo.isTemporary.toString.toUpperCase, relationInfo.kind.toUpperCase, relationInfo.provider)) val rows = schema.buildPrunedFilteredScan(requiredColumns, filters)(values) sparkContext.parallelize(rows) } } sealed trait TablesSystemTable extends SystemTable { override def schema: StructType = TablesSystemTable.schema } object TablesSystemTable extends SchemaEnumeration { val tableName = Field("TABLE_NAME", StringType, nullable = false) val isTemporary = Field("IS_TEMPORARY", StringType, nullable = false) val kind = Field("KIND", StringType, nullable = false) val provider = Field("PROVIDER", StringType, nullable = true) }
Example 97
Source File: metadataSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.tablefunctions.OutputFormatter import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DatasourceResolver, Row, SQLContext} import org.apache.spark.sql.catalyst.CaseSensitivityUtils._ override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = DatasourceResolver .resolverFor(sqlContext) .newInstanceOfTyped[MetadataCatalog](provider) match { case catalog: MetadataCatalog with MetadataCatalogPushDown => catalog.getTableMetadata(sqlContext, options, requiredColumns, filters.toSeq.merge) case catalog => val rows = catalog.getTableMetadata(sqlContext, options).flatMap { tableMetadata => val formatter = new OutputFormatter(tableMetadata.tableName, tableMetadata.metadata) formatter.format().map(Row.fromSeq) } sparkContext.parallelize(schema.buildPrunedFilteredScan(requiredColumns, filters)(rows)) } override def schema: StructType = MetadataSystemTable.schema } object MetadataSystemTable extends SchemaEnumeration { val tableName = Field("TABLE_NAME", StringType, nullable = false) val metadataKey = Field("METADATA_KEY", StringType, nullable = true) val metadataValue = Field("METADATA_VALUE", StringType, nullable = true) }
Example 98
Source File: relationMappingSystemTable.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis.systables import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{Row, SQLContext} object RelationMappingSystemTableProvider extends SystemTableProvider with LocalSpark { override def execute(): Seq[Row] = { sqlContext.tableNames().map { tableName => val plan = sqlContext.catalog.lookupRelation(TableIdentifier(tableName)) val sqlName = plan.collectFirst { case s: SqlLikeRelation => s.relationName case LogicalRelation(s: SqlLikeRelation, _) => s.relationName } Row(tableName, sqlName) } } } object RelationMappingSystemTable extends SchemaEnumeration { val sparkName = Field("RELATION_NAME", StringType, nullable = false) val providerName = Field("SQL_NAME", StringType, nullable = true) }
Example 99
Source File: CaseSensitivityUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.analysis.{Analyzer, Catalog} import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.sql.util.CollectionUtils._ import scala.util.{Failure, Success, Try} case class DuplicateFieldsException( originalSchema: StructType, schema: StructType, duplicateFields: Set[String]) extends RuntimeException( s"""Given schema contains duplicate fields after applying case sensitivity rules: |${duplicateFields.mkString(", ")} |Given schema: |$originalSchema |After applying case sensitivity rules: |$schema """.stripMargin) }
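The exception fires when case-insensitive resolution folds two otherwise distinct fields into one; a small illustration of such a schema (the validation itself happens through the validatedSchema helper used in Examples 78 and 80):

import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Under case-insensitive analysis "Name" and "name" collide, which is exactly
// the situation DuplicateFieldsException reports.
val ambiguous = StructType(Seq(
  StructField("Name", StringType),
  StructField("name", StringType)))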
Example 100
Source File: SQLRunner.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli import java.io._ import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.{Logging, SparkContext} import scala.annotation.tailrec protected[cli] case class CLIOptions( sqlFiles: List[String] = Nil, output: Option[String] = None) def main(args: Array[String]): Unit = { def fail(msg: String = USAGE): Unit = { logError(msg) System.exit(1) } val opts = parseOpts(args.toList) val outputStream: OutputStream = opts.output match { case Some(filename) => new FileOutputStream(new File(filename)) case None => System.out } opts.sqlFiles .map((string: String) => new FileInputStream(new File(string))) .foreach(sql(_, outputStream)) } }
Example 101
Source File: MockedDefaultSourceSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.util.concurrent.{Callable, Executors}

import com.sap.spark.dsmock.DefaultSource
import org.apache.spark.sql.sources.HashPartitioningFunction
import org.apache.spark.sql.{GlobalSapSQLContext, Row, SQLContext}
import org.mockito.Matchers._
import org.mockito.Mockito._
import org.scalatest.FunSuite

import scala.concurrent.duration._

class MockedDefaultSourceSuite extends FunSuite with GlobalSapSQLContext {

  val testTimeout = 10 // seconds

  private def numberOfThreads: Int = {
    val noOfCores = Runtime.getRuntime.availableProcessors()
    assert(noOfCores > 0)

    if (noOfCores == 1) 2 // It should always be multithreaded although only
                          // one processor is available (pseudo-multithreading)
    else noOfCores
  }

  def runMultiThreaded[A](op: Int => A): Seq[A] = {
    info(s"Running with $numberOfThreads threads")
    val pool = Executors.newFixedThreadPool(numberOfThreads)

    val futures = 1 to numberOfThreads map { i =>
      val task = new Callable[A] {
        override def call(): A = op(i)
      }
      pool.submit(task)
    }

    futures.map(_.get(testTimeout, SECONDS))
  }

  test("Underlying mocks of multiple threads are distinct") {
    val dataSources = runMultiThreaded { _ =>
      DefaultSource.withMock(identity)
    }

    dataSources foreach { current =>
      val sourcesWithoutCurrent = dataSources.filter(_.ne(current))
      assert(sourcesWithoutCurrent.forall(_.underlying ne current))
    }
  }

  test("Mocking works as expected") {
    runMultiThreaded { i =>
      DefaultSource.withMock { defaultSource =>
        when(defaultSource.getAllPartitioningFunctions(
          anyObject[SQLContext],
          anyObject[Map[String, String]]))
          .thenReturn(Seq(HashPartitioningFunction(s"foo$i", Seq.empty, None)))

        val Array(Row(name)) = sqlc
          .sql("SHOW PARTITION FUNCTIONS USING com.sap.spark.dsmock")
          .select("name")
          .collect()

        assertResult(s"foo$i")(name)
      }
    }
  }
}
Example 102
Source File: ERPCurrencyConversionTestUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.currency.erp import org.apache.spark.sql.SQLContext object ERPCurrencyConversionTestUtils { private val DecimalOne = java.math.BigDecimal.ONE private val DecimalRateA = new java.math.BigDecimal("0.7") private val DecimalRateB = new java.math.BigDecimal("1.2") val tcurx = List(("USD", 2), ("EUR", 2)) val tcurv = List(("000", "M", "1", "", "0", "", "", "0", "0")) val tcurf = List(("000", "M", "USD", "EUR", "79839898", DecimalOne, DecimalOne, "", ""), ("000", "M", "EUR", "USD", "79839898", DecimalOne, DecimalOne, "", "")) val tcurr = List(("000", "M", "USD", "EUR", "79839898", DecimalRateA, DecimalOne, DecimalOne), ("000", "M", "EUR", "USD", "79839898", DecimalRateB, DecimalOne, DecimalOne)) val tcurn = List(("000", "M", "USD", "EUR", "79839898", ""), ("000", "M", "EUR", "USD", "79839898", "")) val tcurxRDD = sqlContext.sparkContext.parallelize(tcurx, parallelism) val tcurvRDD = sqlContext.sparkContext.parallelize(tcurv, parallelism) val tcurfRDD = sqlContext.sparkContext.parallelize(tcurf, parallelism) val tcurrRDD = sqlContext.sparkContext.parallelize(tcurr, parallelism) val tcurnRDD = sqlContext.sparkContext.parallelize(tcurn, parallelism) sqlContext.createDataFrame(tcurxRDD).registerTempTable(tables("tcurx")) sqlContext.createDataFrame(tcurvRDD).registerTempTable(tables("tcurv")) sqlContext.createDataFrame(tcurfRDD).registerTempTable(tables("tcurf")) sqlContext.createDataFrame(tcurrRDD).registerTempTable(tables("tcurr")) sqlContext.createDataFrame(tcurnRDD).registerTempTable(tables("tcurn")) } def dropERPTables(sqlContext: SQLContext, tables: Map[String, String]): Unit = { tables.foreach { case (tableID, tableName) => sqlContext.sql(s"DROP TABLE IF EXISTS $tableName") } } }
Example 103
Source File: testCatalystRelations.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types._ class DummyCatalystRelation( override val schema: StructType, @transient override val sqlContext: SQLContext) extends BaseRelation with CatalystSource with Serializable { @transient var isMultiplePartitionExecutionFunc: Seq[CatalystSource] => Boolean = (r) => false override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean = isMultiplePartitionExecutionFunc(relations) @transient var supportsLogicalPlanFunc: LogicalPlan => Boolean = (plan) => true override def supportsLogicalPlan(plan: LogicalPlan): Boolean = supportsLogicalPlanFunc(plan) @transient var supportsExpressionFunc: Expression => Boolean = (expr) => true override def supportsExpression(expr: Expression): Boolean = supportsExpressionFunc(expr) @transient var logicalPlanToRDDFunc: LogicalPlan => RDD[Row] = (plan) => new LogicalPlanRDD(plan, sqlContext.sparkContext) override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] = logicalPlanToRDDFunc(plan) }
Example 104
Source File: UseAliasesForAggregationsInGroupingsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class UseAliasesForAggregationsInGroupingsSuite extends FunSuite with MockitoSugar { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("name", StringType), StructField("age", IntegerType) )) } val lr1 = LogicalRelation(br1) val nameAtt = lr1.output.find(_.name == "name").get val ageAtt = lr1.output.find(_.name == "age").get test("replace functions in group by") { val avgExpr = avg(ageAtt) val avgAlias = avgExpr as 'avgAlias assertResult( lr1.groupBy(avgAlias.toAttribute)(avgAlias) )(UseAliasesForFunctionsInGroupings( lr1.groupBy(avgExpr)(avgAlias)) ) assertResult( lr1.select(ageAtt) )(UseAliasesForFunctionsInGroupings( lr1.select(ageAtt)) ) intercept[RuntimeException]( UseAliasesForFunctionsInGroupings(Aggregate(Seq(avgExpr), Seq(ageAtt), lr1)) ) } }
Example 105
Source File: RemoveNestedAliasesSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import com.sap.spark.PlanTest import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.Alias import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class RemoveNestedAliasesSuite extends FunSuite with MockitoSugar with PlanTest { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("name", StringType), StructField("age", IntegerType) )) } val lr1 = LogicalRelation(br1) val nameAtt = lr1.output.find(_.name == "name").get val ageAtt = lr1.output.find(_.name == "age").get test("Replace alias into aliases") { val avgExpr = avg(ageAtt) val avgAlias = avgExpr as 'avgAlias val aliasAlias = avgAlias as 'aliasAlias val aliasAliasAlias = aliasAlias as 'aliasAliasAlias val copiedAlias = Alias(avgExpr, aliasAlias.name)( exprId = aliasAlias.exprId ) val copiedAlias2 = Alias(avgExpr, aliasAliasAlias.name)( exprId = aliasAliasAlias.exprId ) assertResult( lr1.groupBy(avgAlias.toAttribute)(avgAlias) )(RemoveNestedAliases(lr1.groupBy(avgAlias.toAttribute)(avgAlias))) assertResult( lr1.groupBy(copiedAlias.toAttribute)(copiedAlias) )(RemoveNestedAliases(lr1.groupBy(aliasAlias.toAttribute)(aliasAlias))) assertResult( lr1.groupBy(copiedAlias2.toAttribute)(copiedAlias2) )(RemoveNestedAliases(lr1.groupBy(aliasAliasAlias.toAttribute)(aliasAliasAlias))) } test("Replace alias into expressions") { val ageAlias = ageAtt as 'ageAlias val avgExpr = avg(ageAlias) as 'avgAlias val correctedAvgExpr = avg(ageAtt) as 'avgAlias comparePlans( lr1.groupBy(correctedAvgExpr.toAttribute)(correctedAvgExpr), RemoveNestedAliases(lr1.groupBy(avgExpr.toAttribute)(avgExpr)) ) } }
Example 106
Source File: ResolveHierarchySuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.expressions.{Attribute, EqualTo} import org.apache.spark.sql.catalyst.plans.logical.{AdjacencyListHierarchySpec, Hierarchy} import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types._ import org.scalatest.FunSuite import org.scalatest.mock.MockitoSugar class ResolveHierarchySuite extends FunSuite with MockitoSugar { val br1 = new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("id", IntegerType), StructField("parent", IntegerType) )) } val lr1 = LogicalRelation(br1) val idAtt = lr1.output.find(_.name == "id").get val parentAtt = lr1.output.find(_.name == "parent").get test("Check parenthood expression has no conflicting expression IDs and qualifiers") { val source = SimpleAnalyzer.execute(lr1.select('id, 'parent).subquery('u)) assert(source.resolved) val hierarchy = Hierarchy( AdjacencyListHierarchySpec(source, "v", UnresolvedAttribute("u" :: "id" :: Nil) === UnresolvedAttribute("v" :: "id" :: Nil), Some('id.isNull), Nil), 'node ) val resolveHierarchy = ResolveHierarchy(SimpleAnalyzer) val resolveReferences = ResolveReferencesWithHierarchies(SimpleAnalyzer) val resolvedHierarchy = (0 to 10).foldLeft(hierarchy: Hierarchy) { (h, _) => SimpleAnalyzer.ResolveReferences( resolveReferences(resolveHierarchy(h)) ).asInstanceOf[Hierarchy] } assert(resolvedHierarchy.node.resolved) val resolvedSpec = resolvedHierarchy.spec.asInstanceOf[AdjacencyListHierarchySpec] assert(resolvedSpec.parenthoodExp.resolved) assert(resolvedSpec.startWhere.forall(_.resolved)) assert(resolvedHierarchy.childrenResolved) assert(resolvedHierarchy.resolved) val parenthoodExpression = resolvedSpec.parenthoodExp.asInstanceOf[EqualTo] assertResult("u" :: Nil)(parenthoodExpression.left.asInstanceOf[Attribute].qualifiers) assertResult("v" :: Nil)(parenthoodExpression.right.asInstanceOf[Attribute].qualifiers) assert(parenthoodExpression.right.asInstanceOf[Attribute].exprId != source.output.find(_.name == "id").get.exprId) } }
Example 107
Source File: ResolveAnnotationsSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical.Project
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types._
import org.scalatest.FunSuite
import org.scalatest.mock.MockitoSugar
import org.apache.spark.sql.catalyst.dsl.plans._

class ResolveAnnotationsSuite extends FunSuite with MockitoSugar {

  // scalastyle:off magic.number
  val annotatedRel1 = new BaseRelation {
    override def sqlContext: SQLContext = mock[SQLContext]
    override def schema: StructType = StructType(Seq(
      StructField("id1.1", IntegerType,
        metadata = new MetadataBuilder().putLong("key1.1", 11L).build()),
      StructField("id1.2", IntegerType,
        metadata = new MetadataBuilder()
          .putLong("key1.2", 12L)
          .putLong("key1.3", 13).build()))
    )
  }
  val lr1 = LogicalRelation(annotatedRel1)
  val id11Att = lr1.output.find(_.name == "id1.1").get
  val id12Att = lr1.output.find(_.name == "id1.2").get

  val id11AnnotatedAtt = AnnotatedAttribute(id11Att)(
    Map("key1.1" -> Literal.create(100L, LongType), // override the old key
      "newkey" -> Literal.create(200L, LongType)))  // define a new key

  val simpleAnnotatedSelect = lr1.select(id11AnnotatedAtt)
}
Example 108
Source File: ResolveCountDistinctStarSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference} import org.apache.spark.sql.catalyst.plans.logical.Aggregate import org.apache.spark.sql.execution.datasources.LogicalRelation import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} import org.scalatest.FunSuite import org.scalatest.Inside._ import org.scalatest.mock.MockitoSugar import org.apache.spark.sql.catalyst.dsl.plans.DslLogicalPlan import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete, Count} import scala.collection.mutable.ArrayBuffer class ResolveCountDistinctStarSuite extends FunSuite with MockitoSugar { val persons = new LogicalRelation(new BaseRelation { override def sqlContext: SQLContext = mock[SQLContext] override def schema: StructType = StructType(Seq( StructField("age", IntegerType), StructField("name", StringType) )) }) test("Count distinct star is resolved correctly") { val projection = persons.select(UnresolvedAlias( AggregateExpression(Count(UnresolvedStar(None) :: Nil), Complete, true))) val stillNotCompletelyResolvedAggregate = SimpleAnalyzer.execute(projection) val resolvedAggregate = ResolveCountDistinctStar(SimpleAnalyzer) .apply(stillNotCompletelyResolvedAggregate) inside(resolvedAggregate) { case Aggregate(Nil, ArrayBuffer(Alias(AggregateExpression(Count(expressions), Complete, true), _)), _) => assert(expressions.collect { case a:AttributeReference => a.name }.toSet == Set("name", "age")) } assert(resolvedAggregate.resolved) } }
Example 109
Source File: SqlContextConfigurationUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.sql.SQLContext def withConf[A](settings: Map[String, String])(a: => A): A = { val temps: Map[String, String] = settings.keys.map(key => key -> sqlContext.getConf(key))(scala.collection.breakOut) try { settings.foreach { case (key, value) => sqlContext.setConf(key, value) } a } finally { temps.foreach { case (key, value) => sqlContext.setConf(key, value) } } } }
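A hypothetical call site, assuming the surrounding trait exposes sqlContext as in the snippet above (the table name is illustrative); the previous conf values are restored in the finally block even if the body throws.

withConf(Map("spark.sql.caseSensitive" -> "true")) {
  sqlContext.sql("SELECT name, NAME FROM people")   // runs with case-sensitive resolution
}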
Example 110
Source File: DummyRelationUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{ColumnName, Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.types.{StructField, StructType} case class DummyCatalystSourceRelation( schema: StructType, isMultiplePartitionExecutionFunc: Option[Seq[CatalystSource] => Boolean] = None, supportsLogicalPlanFunc: Option[LogicalPlan => Boolean] = None, supportsExpressionFunc: Option[Expression => Boolean] = None, logicalPlanToRDDFunc: Option[LogicalPlan => RDD[Row]] = None) (@transient implicit val sqlContext: SQLContext) extends BaseRelation with CatalystSource { override def isMultiplePartitionExecution(relations: Seq[CatalystSource]): Boolean = isMultiplePartitionExecutionFunc.forall(_.apply(relations)) override def supportsLogicalPlan(plan: LogicalPlan): Boolean = supportsLogicalPlanFunc.forall(_.apply(plan)) override def supportsExpression(expr: Expression): Boolean = supportsExpressionFunc.forall(_.apply(expr)) override def logicalPlanToRDD(plan: LogicalPlan): RDD[Row] = logicalPlanToRDDFunc.getOrElse( (plan: LogicalPlan) => new LogicalPlanRDD(plan, sqlContext.sparkContext)).apply(plan) } }
Example 111
Source File: PartitioningFunctionUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.Dissector def dropPartitioningFunction(name: String, allowExisting: Boolean = false, dataSource: String): Unit = { sqlc.sql( s""" |DROP PARTITION FUNCTION |${if (allowExisting) "IF EXISTS" else ""} |$name |USING $dataSource """.stripMargin ) } private def intervalToSql(interval: Dissector) = s"${interval.productPrefix.toUpperCase()} ${interval.n}" }
Example 112
Source File: testRelations.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.dstest

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.commands.WithOrigin
import org.apache.spark.sql.sources.{BaseRelation, DropRelation, Table}
import org.apache.spark.sql.types._

case class DummyRelationWithTempFlag(
    sqlContext: SQLContext,
    tableName: Seq[String],
    schema: StructType,
    temporary: Boolean)
  extends BaseRelation
  with Table
  with DropRelation
  with WithOrigin {

  override val provider: String = "com.sap.spark.dstest"

  override def isTemporary: Boolean = temporary

  override def dropTable(): Unit = {}
}

case class DummyRelationWithoutTempFlag(
    sqlContext: SQLContext,
    schema: StructType)
  extends BaseRelation
  with DropRelation
  with Table
  with WithOrigin {

  override def isTemporary: Boolean = false

  override val provider: String = "com.sap.spark.dstest"

  override def dropTable(): Unit = {}
}
Example 113
Source File: WithSQLContext.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark

import java.util.Locale

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.hive.HiveContext
import org.scalatest.{BeforeAndAfterEach, Suite}

trait WithSQLContext extends BeforeAndAfterEach {
  self: Suite with WithSparkContext =>

  override def beforeEach(): Unit = {
    try {
      super.beforeEach()
      setUpSQLContext()
    } catch {
      case ex: Throwable =>
        tearDownSQLContext()
        throw ex
    }
  }

  override def afterEach(): Unit = {
    try {
      super.afterEach()
    } finally {
      tearDownSQLContext()
    }
  }

  implicit def sqlContext: SQLContext = _sqlContext
  def sqlc: SQLContext = sqlContext

  var _sqlContext: SQLContext = _

  protected def setUpSQLContext(): Unit =
    _sqlContext = SQLContext.getOrCreate(sc).newSession()

  protected def tearDownSQLContext(): Unit =
    _sqlContext = null

  protected def tableName(name: String): String =
    sqlc match {
      case _: HiveContext => name.toLowerCase(Locale.ENGLISH)
      case _ => name
    }
}
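A short hedged sketch of a suite mixing in the trait above; it assumes the companion WithSparkContext trait (not shown in the snippet) lives in the same package and provides the `sc` used by setUpSQLContext.

import org.scalatest.FunSuite
import com.sap.spark.{WithSQLContext, WithSparkContext}

class TrivialQuerySuite extends FunSuite with WithSparkContext with WithSQLContext {
  // The SQLContext is created before and cleared after every test by the trait's hooks.
  test("runs a trivial query") {
    assert(sqlc.sql("SELECT 1").collect().head.getInt(0) === 1)
  }
}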
Example 114
Source File: TestUtils.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.util import java.util.Locale import scala.io.Source import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.{Row, SQLContext, SapSQLContext} import org.apache.spark.sql.hive.SapHiveContext import org.apache.spark.sql.sources.sql.SqlLikeRelation import org.apache.spark.sql.sources.{BaseRelation, CatalystSource, Table} import org.apache.spark.sql.types.StructType import org.mockito.Matchers._ import org.mockito.Mockito._ import scala.tools.nsc.io.Directory import scala.util.{Failure, Success} def parsePTestFile(fileName: String): List[(String, String, String)] = { val filePath = getFileFromClassPath(fileName) val fileContents = Source.fromFile(filePath).getLines .map(p => p.stripMargin.trim) .filter(p => !p.isEmpty && !p.startsWith("//")) // filter empty rows and comments .mkString("\n") val p = new PTestFileParser // strip semicolons from query and parsed p(fileContents) match { case Success(lines) => lines.map { case (query, parsed, expect) => (stripSemicolon(query).trim, stripSemicolon(parsed).trim, expect.trim) } case Failure(ex) => throw ex } } private def stripSemicolon(sql: String): String = if (sql.endsWith(";")) { sql.substring(0, sql.length-1) } else { sql } def withTempDirectory[A](f: Directory => A): A = { val dir = Directory.makeTemp() try { f(dir) } finally { dir.deleteIfExists() } } }
Example 115
Source File: SQLRunnerSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package com.sap.spark.cli

import java.io.{ByteArrayInputStream, ByteArrayOutputStream, InputStream}

import org.apache.spark.SparkContext
import org.apache.spark.sql.{GlobalSapSQLContext, SQLContext}
import org.scalatest.{BeforeAndAfterEach, FunSuite, ShouldMatchers}

// The suite declaration and the opening of the first test were truncated in this snippet;
// they are reconstructed here as an assumption based on the imports and the source file name.
class SQLRunnerSuite extends FunSuite with GlobalSapSQLContext with ShouldMatchers {

  test("can parse command line options") {
    // good call
    val goodOpts = SQLRunner.parseOpts(List("a.sql", "b.sql", "-o", "output.csv"))
    goodOpts.sqlFiles should be(List("a.sql", "b.sql"))
    goodOpts.output should be(Some("output.csv"))

    // bad call
    val badOpts = SQLRunner.parseOpts(List())
    badOpts.sqlFiles should be(List())
    badOpts.output should be(None)

    // ugly call
    val uglyOpts = SQLRunner.parseOpts(List("a.sql", "-o", "output.csv", "b.sql"))
    uglyOpts.sqlFiles should be(List("a.sql", "b.sql"))
    uglyOpts.output should be(Some("output.csv"))
  }

  def runSQLTest(input: String, expectedOutput: String): Unit = {
    val inputStream: InputStream = new ByteArrayInputStream(input.getBytes())
    val outputStream = new ByteArrayOutputStream()

    SQLRunner.sql(inputStream, outputStream)

    val output = outputStream.toString
    output should be(expectedOutput)
  }

  test("can run dummy query") {
    val input = "SELECT 1;"
    val output = "1\n"
    runSQLTest(input, output)
  }

  test("can run multiple dummy queries") {
    val input = """
        |SELECT 1;SELECT 2;
        |SELECT 3;
      """.stripMargin
    val output = "1\n2\n3\n"
    runSQLTest(input, output)
  }

  test("can run a basic example with tables") {
    val input = """
        |SELECT * FROM DEMO_TABLE;
        |SELECT * FROM DEMO_TABLE LIMIT 1;
        |DROP TABLE DEMO_TABLE;
      """.stripMargin
    val output = "1,a\n2,b\n3,c\n1,a\n"
    runSQLTest(input, output)
  }

  test("can run an example with comments") {
    val input = """
        |SELECT * FROM DEMO_TABLE; -- this is the first query
        |SELECT * FROM DEMO_TABLE LIMIT 1;
        |-- now let's drop a table
        |DROP TABLE DEMO_TABLE;
      """.stripMargin
    val output = "1,a\n2,b\n3,c\n1,a\n"
    runSQLTest(input, output)
  }
}
Example 116
Source File: Preparator.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PPreparator import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.sql.DataFrame import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.sql.SQLContext import org.apache.spark.mllib.linalg.Vectors class PreparedData( val rows: DataFrame, val dsp: DataSourceParams, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ if (trainingData.dsp.useStandardScaler) { val training = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features") val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(trainingData.dsp.standardScalerWithStd).setWithMean(trainingData.dsp.standardScalerWithMean) val scalerModel = scaler.fit(training) val scaledData = scalerModel.transform(training) val s1 = scaledData.select("label","censor","scaledFeatures").withColumnRenamed("scaledFeatures","features") //Prepare old StandardScaler val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(withMean = trainingData.dsp.standardScalerWithMean, withStd = trainingData.dsp.standardScalerWithStd) val oldSSModel = oldScaler.fit(trainingData.rows.map(x=>(Vectors.dense(x._3)))) new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel) } else { new PreparedData(rows = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features"), dsp = trainingData.dsp, ssModel = null) } } }
Example 117
Source File: AvroTransformer.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.avro import com.memsql.spark.etl.api.{UserTransformConfig, Transformer, PhaseConfig} import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types.StructType import org.apache.avro.Schema import org.apache.avro.generic.GenericData import org.apache.avro.io.DecoderFactory import org.apache.avro.specific.SpecificDatumReader // Takes DataFrames of byte arrays, where each row is a serialized Avro record. // Returns DataFrames of deserialized data, where each field has its own column. class AvroTransformer extends Transformer { var avroSchemaStr: String = null var sparkSqlSchema: StructType = null def AvroRDDToDataFrame(sqlContext: SQLContext, rdd: RDD[Row]): DataFrame = { val rowRDD: RDD[Row] = rdd.mapPartitions({ partition => { // Create per-partition copies of non-serializable objects val parser: Schema.Parser = new Schema.Parser() val avroSchema = parser.parse(avroSchemaStr) val reader = new SpecificDatumReader[GenericData.Record](avroSchema) partition.map({ rowOfBytes => val bytes = rowOfBytes(0).asInstanceOf[Array[Byte]] val decoder = DecoderFactory.get().binaryDecoder(bytes, null) val record = reader.read(null, decoder) val avroToRow = new AvroToRow() avroToRow.getRow(record) }) }}) sqlContext.createDataFrame(rowRDD, sparkSqlSchema) } override def initialize(sqlContext: SQLContext, config: PhaseConfig, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserTransformConfig] val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match { case Some(s) => s case None => throw new IllegalArgumentException("avroSchema must be set in the config") } avroSchemaStr = avroSchemaJson.toString val parser = new Schema.Parser() val avroSchema = parser.parse(avroSchemaJson.toString) sparkSqlSchema = AvroToSchema.getSchema(avroSchema) } override def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = { AvroRDDToDataFrame(sqlContext, df.rdd) } }
Example 118
Source File: AvroRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.avro import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.streaming.StreamingContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.avro.Schema import org.apache.avro.generic.GenericData import org.apache.avro.io.{DatumWriter, EncoderFactory} import org.apache.avro.specific.SpecificDatumWriter import java.io.ByteArrayOutputStream // Generates an RDD of byte arrays, where each is a serialized Avro record. class AvroRandomExtractor extends Extractor { var count: Int = 1 var generator: AvroRandomGenerator = null var writer: DatumWriter[GenericData.Record] = null var avroSchema: Schema = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) val parser: Schema.Parser = new Schema.Parser() override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val avroSchemaJson = userConfig.getConfigJsValue("avroSchema") match { case Some(s) => s case None => throw new IllegalArgumentException("avroSchema must be set in the config") } count = userConfig.getConfigInt("count").getOrElse(1) avroSchema = parser.parse(avroSchemaJson.toString) writer = new SpecificDatumWriter(avroSchema) generator = new AvroRandomGenerator(avroSchema) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val out = new ByteArrayOutputStream val encoder = EncoderFactory.get().binaryEncoder(out, null) val avroRecord: GenericData.Record = generator.next().asInstanceOf[GenericData.Record] writer.write(avroRecord, encoder) encoder.flush out.close out.toByteArray }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 119
Source File: AvroExtractorSpec.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.avro import com.memsql.spark.etl.api.UserExtractConfig import org.apache.spark.streaming._ import org.apache.spark.sql.SQLContext import test.util.{Fixtures, UnitSpec, TestLogger, LocalSparkContext} import spray.json._ class ExtractorsSpec extends UnitSpec with LocalSparkContext { var ssc: StreamingContext = _ var sqlContext: SQLContext = _ override def beforeEach(): Unit = { super.beforeEach() ssc = new StreamingContext(sc, Seconds(1)) sqlContext = new SQLContext(sc) } val avroConfig = Fixtures.avroConfig.parseJson val extractConfig = UserExtractConfig(class_name = "Test", value = avroConfig) val logger = new TestLogger("test") "AvroRandomExtractor" should "emit a random DF" in { val extract = new AvroRandomExtractor extract.initialize(ssc, sqlContext, extractConfig, 1, logger) val maybeDf = extract.next(ssc, 1, sqlContext, extractConfig, 1, logger) assert(maybeDf.isDefined) assert(maybeDf.get.count == 5) } }
Example 120
Source File: ThriftTransformer.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.thrift import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.types._ import org.apache.thrift.{TBase, TDeserializer, TFieldIdEnum} class ThriftTransformer extends Transformer { private var classObj: Class[_] = null private var thriftToRow: ThriftToRow = null private var deserializer: TDeserializer = null private var schema: StructType = null def thriftRDDToDataFrame(sqlContext: SQLContext, rdd: RDD[Row]): DataFrame = { val rowRDD: RDD[Row] = rdd.map({ record => val recordAsBytes = record(0).asInstanceOf[Array[Byte]] val i = classObj.newInstance().asInstanceOf[TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]] deserializer.deserialize(i, recordAsBytes) thriftToRow.getRow(i) }) sqlContext.createDataFrame(rowRDD, schema) } override def initialize(sqlContext: SQLContext, config: PhaseConfig, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserTransformConfig] val className = userConfig.getConfigString("className") match { case Some(s) => s case None => throw new IllegalArgumentException("className must be set in the config") } classObj = Class.forName(className) thriftToRow = new ThriftToRow(classObj) deserializer = new TDeserializer() schema = ThriftToSchema.getSchema(classObj) } override def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = { thriftRDDToDataFrame(sqlContext, df.rdd) } }
Example 121
Source File: ThriftRandomExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.thrift import com.memsql.spark.etl.api._ import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.SparkContext import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import org.apache.thrift.protocol.TBinaryProtocol import org.apache.thrift.{TBase, TFieldIdEnum, TSerializer} class ThriftRandomExtractor extends Extractor { var count: Int = 1 var thriftType: Class[_] = null var serializer: TSerializer = null def schema: StructType = StructType(StructField("bytes", BinaryType, false) :: Nil) override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val userConfig = config.asInstanceOf[UserExtractConfig] val className = userConfig.getConfigString("className") match { case Some(s) => s case None => throw new IllegalArgumentException("className must be set in the config") } thriftType = Class.forName(className) serializer = new TSerializer(new TBinaryProtocol.Factory()) count = userConfig.getConfigInt("count").getOrElse(1) } override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { val rdd = sqlContext.sparkContext.parallelize((1 to count).map(_ => Row({ val thriftObject = ThriftRandomGenerator.next(thriftType).asInstanceOf[TBase[_ <: TBase[_, _], _ <: TFieldIdEnum]] serializer.serialize(thriftObject) }))) Some(sqlContext.createDataFrame(rdd, schema)) } }
Example 122
Source File: CheckpointingKafkaExtractor.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package com.memsql.spark.examples.kafka import com.memsql.spark.etl.api.{UserExtractConfig, PhaseConfig, ByteArrayExtractor} import com.memsql.spark.etl.utils.PhaseLogger import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.StreamingContext import kafka.serializer.{DefaultDecoder, StringDecoder} import org.apache.spark.streaming.kafka.{CheckpointedDirectKafkaInputDStream, CheckpointedKafkaUtils} import org.apache.spark.streaming.dstream.InputDStream class CheckpointingKafkaExtractor extends ByteArrayExtractor { var CHECKPOINT_DATA_VERSION = 1 var dstream: CheckpointedDirectKafkaInputDStream[String, Array[Byte], StringDecoder, DefaultDecoder, Array[Byte]] = null var zkQuorum: String = null var topic: String = null override def initialize(ssc: StreamingContext, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Unit = { val kafkaConfig = config.asInstanceOf[UserExtractConfig] zkQuorum = kafkaConfig.getConfigString("zk_quorum").getOrElse { throw new IllegalArgumentException("\"zk_quorum\" must be set in the config") } topic = kafkaConfig.getConfigString("topic").getOrElse { throw new IllegalArgumentException("\"topic\" must be set in the config") } } def extract(ssc: StreamingContext, extractConfig: PhaseConfig, batchDuration: Long, logger: PhaseLogger): InputDStream[Array[Byte]] = { val kafkaParams = Map[String, String]( "memsql.zookeeper.connect" -> zkQuorum ) val topics = Set(topic) dstream = CheckpointedKafkaUtils.createDirectStreamFromZookeeper[String, Array[Byte], StringDecoder, DefaultDecoder]( ssc, kafkaParams, topics, batchDuration, lastCheckpoint) dstream } override def batchCheckpoint: Option[Map[String, Any]] = { dstream match { case null => None case default => { val currentOffsets = dstream.getCurrentOffsets.map { case (tp, offset) => Map("topic" -> tp.topic, "partition" -> tp.partition, "offset" -> offset) } Some(Map("offsets" -> currentOffsets, "zookeeper" -> zkQuorum, "version" -> CHECKPOINT_DATA_VERSION)) } } } override def batchRetry: Unit = { if (dstream.prevOffsets != null) { dstream.setCurrentOffsets(dstream.prevOffsets) } } }
Example 123
Source File: DefaultSource.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider} import scala.concurrent._ import scala.concurrent.ExecutionContext.Implicits._ import scala.concurrent.duration.Duration class DefaultSource extends CreatableRelationProvider with PowerBISink { override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val conf = ClientConf.fromSparkConf(sqlContext.sparkContext.getConf) implicit val client = new Client(conf) val dataset = parameters.getOrElse("dataset", sys.error("'dataset' must be specified")) val table = parameters.getOrElse("table", sys.error("'table' must be specified")) val batchSize = parameters.getOrElse("batchSize", conf.batchSize.toString).toInt val group = parameters.get("group") val step = for { groupId <- getGroupId(group) ds <- getOrCreateDataset(mode, groupId, dataset, table, data.schema) } yield (groupId, ds) val result = step map { case (groupId, ds) => val fields = data.schema.fieldNames.zipWithIndex val _conf = conf val _token = Some(client.currentToken) val _table = table val _batchSize = batchSize val coalesced = data.rdd.partitions.size > _conf.maxPartitions match { case true => data.coalesce(_conf.maxPartitions) case false => data } coalesced foreachPartition { p => val rows = p map { r => fields map { case(name, index) => (name -> r(index)) } toMap } toSeq val _client = new Client(_conf, _token) val submit = rows. sliding(_batchSize, _batchSize). foldLeft(future()) { (fAccum, batch) => fAccum flatMap { _ => _client.addRows(ds.id, _table, batch, groupId) } } submit.onComplete { _ => _client.shutdown() } Await.result(submit, _conf.timeout) } } result.onComplete { _ => client.shutdown() } Await.result(result, Duration.Inf) new BaseRelation { val sqlContext = data.sqlContext val schema = data.schema } } }
Example 124
Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import scala.collection.JavaConversions._ import org.apache.spark.sql.sources.{ BaseRelation, TableScan } import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import org.apache.spark.sql.SQLContext import org.apache.spark.sql._ import org.apache.spark.rdd.RDD import org.apache.hadoop.conf._ import org.apache.hadoop.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.dao._ import org.zuinnote.hadoop.office.format.mapreduce._ import org.zuinnote.spark.office.excel.util.ExcelFile override def buildScan: RDD[Row] = { // read ExcelRows val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams) // map to schema val schemaFields = schema.fields excelRowsRDD.flatMap(excelKeyValueTuple => { // map the Excel row data structure to a Spark SQL schema val rowArray = new Array[Any](excelKeyValueTuple._2.get.length) var i = 0; for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length) val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]] spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName // add row representing one Excel row rowArray(i) = spreadSheetCellDAOStructArray i += 1 } Some(Row.fromSeq(rowArray)) }) } }
Example 125
Source File: HttpStreamSink.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http import org.apache.spark.internal.Logging import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.StreamSinkProvider import org.apache.spark.sql.streaming.OutputMode import Params.map2Params class HttpStreamSinkProvider extends StreamSinkProvider with DataSourceRegister { def createSink( sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { new HttpStreamSink(parameters.getRequiredString("httpServletUrl"), parameters.getRequiredString("topic"), parameters.getInt("maxPacketSize", 10 * 1024 * 1024)); } def shortName(): String = "httpStream" } class HttpStreamSink(httpPostURL: String, topic: String, maxPacketSize: Int) extends Sink with Logging { val producer = HttpStreamClient.connect(httpPostURL); val RETRY_TIMES = 5; val SLEEP_TIME = 100; override def addBatch(batchId: Long, data: DataFrame) { //send data to the HTTP server var success = false; var retried = 0; while (!success && retried < RETRY_TIMES) { try { retried += 1; producer.sendDataFrame(topic, batchId, data, maxPacketSize); success = true; } catch { case e: Throwable ⇒ { success = false; super.logWarning(s"failed to send", e); if (retried < RETRY_TIMES) { val sleepTime = SLEEP_TIME * retried; super.logWarning(s"will retry to send after ${sleepTime}ms"); Thread.sleep(sleepTime); } else { throw e; } } } } } }
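A hedged sketch of wiring the sink above into a structured streaming query via its registered short name. It assumes the provider is discoverable through the usual META-INF/services registration; the servlet URL, topic, and checkpoint directory are placeholders, and the built-in "rate" source (Spark 2.2+) merely stands in for real input.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

val spark = SparkSession.builder().master("local[*]").appName("http-sink-demo").getOrCreate()

// Any streaming DataFrame works here; the rate source is used only as a stand-in.
val input = spark.readStream.format("rate").load()

val query = input.writeStream
  .format("httpStream")                                    // short name registered by HttpStreamSinkProvider
  .option("httpServletUrl", "http://localhost:8080/post")  // placeholder URL
  .option("topic", "topic-1")                              // placeholder topic
  .option("checkpointLocation", "/tmp/http-sink-checkpoint")
  .outputMode(OutputMode.Append())
  .start()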
Example 126
Source File: SparkLeapFrame.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.spark import ml.combust.mleap.core.types.{StructField, StructType} import ml.combust.mleap.runtime.frame.{FrameBuilder, Row, RowUtil} import ml.combust.mleap.runtime.function.{Selector, UserDefinedFunction} import org.apache.spark.rdd.RDD import org.apache.spark.sql import org.apache.spark.sql.mleap.TypeConverters import org.apache.spark.sql.{DataFrame, SQLContext, types} import scala.util.Try case class SparkLeapFrame(schema: StructType, dataset: RDD[Row], sqlContext: SQLContext) extends FrameBuilder[SparkLeapFrame] { override def withColumn(output: String, inputs: Selector *) (udf: UserDefinedFunction): Try[SparkLeapFrame] = { RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors => val field = StructField(output, udf.outputTypes.head) schema.withField(field).map { schema2 => val dataset2 = dataset.map { row => row.withValue(rowSelectors: _*)(udf) } copy(schema = schema2, dataset = dataset2) } } } override def withColumns(outputs: Seq[String], inputs: Selector*) (udf: UserDefinedFunction): Try[SparkLeapFrame] = { RowUtil.createRowSelectors(schema, inputs: _*)(udf).flatMap { rowSelectors => val fields = outputs.zip(udf.outputTypes).map { case (name, dt) => StructField(name, dt) } schema.withFields(fields).map { schema2 => val dataset2 = dataset.map { row => row.withValues(rowSelectors: _*)(udf) } copy(schema = schema2, dataset = dataset2) } } } override def select(fieldNames: String *): Try[SparkLeapFrame] = { for(indices <- schema.indicesOf(fieldNames: _*); schema2 <- schema.selectIndices(indices: _*)) yield { val dataset2 = dataset.map(row => row.selectIndices(indices: _*)) copy(schema = schema2, dataset = dataset2) } } override def drop(names: String*): Try[SparkLeapFrame] = { for(indices <- schema.indicesOf(names: _*); schema2 <- schema.dropIndices(indices: _*)) yield { val dataset2 = dataset.map(row => row.dropIndices(indices: _*)) copy(schema = schema2, dataset = dataset2) } } override def filter(selectors: Selector*) (udf: UserDefinedFunction): Try[SparkLeapFrame] = { RowUtil.createRowSelectors(schema, selectors: _*)(udf).map { rowSelectors => val dataset2 = dataset.filter(row => row.shouldFilter(rowSelectors: _*)(udf)) copy(schema = schema, dataset = dataset2) } } def toSpark: DataFrame = { val spec = schema.fields.map(TypeConverters.mleapToSparkConverter) val fields = spec.map(_._1) val converters = spec.map(_._2) val sparkSchema = new types.StructType(fields.toArray) val data = dataset.map { r => val values = r.zip(converters).map { case (v, c) => c(v) } sql.Row(values.toSeq: _*) } sqlContext.createDataFrame(data, sparkSchema) } }
Example 127
Source File: package.scala From sparksql-scalapb with Apache License 2.0 | 5 votes |
package scalapb import org.apache.spark.sql.{DataFrame, Encoder, SQLContext, SparkSession} import scala.reflect.ClassTag package object spark { implicit class ProtoSQLContext(val sqlContext: SQLContext) extends AnyVal { def protoToDataFrame[T <: GeneratedMessage: Encoder]( protoRdd: org.apache.spark.rdd.RDD[T] ) = { ProtoSQL.protoToDataFrame(sqlContext, protoRdd) } } implicit class ProtoRDD[T <: GeneratedMessage]( val protoRdd: org.apache.spark.rdd.RDD[T] ) extends AnyVal { def toDataFrame( sqlContext: SQLContext )(implicit encoder: Encoder[T]): DataFrame = { ProtoSQL.protoToDataFrame(sqlContext, protoRdd) } def toDataFrame( sparkSession: SparkSession )(implicit encoder: Encoder[T]): DataFrame = { ProtoSQL.protoToDataFrame(sparkSession, protoRdd) } } }
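A minimal hedged sketch of the RDD syntax from the package object above, written generically for any ScalaPB-generated message type. The encoder is taken as a context bound because the exact import that derives it differs between library versions.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Encoder, SparkSession}
import scalapb.GeneratedMessage
import scalapb.spark._

// Converts an RDD of protobuf messages to a DataFrame using the ProtoRDD syntax shown above.
def protosToDf[T <: GeneratedMessage: Encoder](spark: SparkSession, protos: RDD[T]): DataFrame =
  protos.toDataFrame(spark)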
Example 128
Source File: RegressionDatagen.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package hivemall.tools import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.hive.HivemallOps._ import org.apache.spark.sql.hive.HivemallUtils._ import org.apache.spark.sql.types._ object RegressionDatagen { def exec(sc: SQLContext, n_partitions: Int = 2, min_examples: Int = 1000, n_features: Int = 10, n_dims: Int = 200, seed: Int = 43, dense: Boolean = false, prob_one: Float = 0.6f, sort: Boolean = false, cl: Boolean = false): DataFrame = { require(n_partitions > 0, "Non-negative #n_partitions required.") require(min_examples > 0, "Non-negative #min_examples required.") require(n_features > 0, "Non-negative #n_features required.") require(n_dims > 0, "Non-negative #n_dims required.") // Calculate #examples to generate in each partition val n_examples = (min_examples + n_partitions - 1) / n_partitions val df = sc.createDataFrame( sc.sparkContext.parallelize((0 until n_partitions).map(Row(_)), n_partitions), StructType( StructField("data", IntegerType, true) :: Nil) ) import sc.implicits._ df.lr_datagen( s"-n_examples $n_examples -n_features $n_features -n_dims $n_dims -prob_one $prob_one" + (if (dense) " -dense" else "") + (if (sort) " -sort" else "") + (if (cl) " -cl" else "")) .select($"label".cast(DoubleType).as("label"), $"features") } }
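A brief usage sketch for the generator above. It only needs a SQLContext, and all arguments other than the context have defaults; running it does assume Hivemall's lr_datagen function is available on the classpath.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import hivemall.tools.RegressionDatagen

val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("datagen-demo"))
val sqlContext = new SQLContext(sc)

// Generates a small dense regression dataset with a label column and a features column.
val df = RegressionDatagen.exec(sqlContext, min_examples = 1000, n_features = 10, dense = true)
df.printSchema()
df.show(5)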
Example 129
Source File: HivemallStreamingOps.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import scala.reflect.ClassTag import org.apache.spark.ml.feature.HmLabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.apache.spark.streaming.dstream.DStream final class HivemallStreamingOps(ds: DStream[HmLabeledPoint]) { def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) : DStream[Row] = { ds.transform[Row] { rdd: RDD[HmLabeledPoint] => f(sqlContext.createDataFrame(rdd)).rdd } } } object HivemallStreamingOps { implicit def dataFrameToHivemallStreamingOps(ds: DStream[HmLabeledPoint]) : HivemallStreamingOps = { new HivemallStreamingOps(ds) } }
Example 130
Source File: L8-1DataFrameAPI.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 131
Source File: L8-3-6-7DataFrameCreation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.native.Serialization.write import org.json4s.DefaultFormats object DataframeCreationApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { //val cdrs = sqlC.createDataFrame(seqToCdr(rdd)) //val cdrs = sqlC.createDataFrame(seqToCdr(rdd).collect()) //val cdrs = seqToCdr(rdd).toDF() val cdrsJson = seqToCdr(rdd).map(r => { implicit val formats = DefaultFormats write(r) }) val cdrs = sqlC.read.json(cdrsJson) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 132
Source File: L8-29DataFrameExamplesJoin.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.DefaultFormats import org.json4s.JDouble import org.json4s.JObject import org.json4s.jvalue2extractable import org.json4s.jvalue2monadic import org.json4s.native.JsonMethods.compact import org.json4s.native.JsonMethods.parse import org.json4s.native.JsonMethods.render import org.json4s.string2JsonInput object CdrDataframeExamples3App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeExamples3App <appname> <batchInterval> <hostname> <port> <gridJsonPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, gridJsonPath) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ implicit val formats = DefaultFormats val gridFile = scala.io.Source.fromFile(gridJsonPath).mkString val gridGeo = (parse(gridFile) \ "features") val gridStr = gridGeo.children.map(r => { val c = (r \ "geometry" \ "coordinates").extract[List[List[List[Float]]]].flatten.flatten.map(r => JDouble(r)) val l = List(("id", r \ "id"), ("x1", c(0)), ("y1", c(1)), ("x2", c(2)), ("y2", c(3)), ("x3", c(4)), ("y3", c(5)), ("x4", c(6)), ("y4", c(7))) compact(render(JObject(l))) }) val gridDF = sqlC.read.json(ssc.sparkContext.makeRDD(gridStr)) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.join(gridDF, $"squareId" === $"id").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 133
Source File: L8-10-11UDF.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.io.Source import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.json4s.jackson.JsonMethods.parse import org.json4s.jvalue2extractable import org.json4s.string2JsonInput object CdrUDFApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrUDFApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ def getCountryCodeMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/phone.json").mkString).extract[Map[String, String]].map(_.swap) } def getCountryNameMapping() = { implicit val formats = org.json4s.DefaultFormats parse(Source.fromURL("http://country.io/names.json").mkString).extract[Map[String, String]] } def getCountryName(mappingPhone: Map[String, String], mappingName: Map[String, String], code: Int) = { mappingName.getOrElse(mappingPhone.getOrElse(code.toString, "NotFound"), "NotFound") } val getCountryNamePartial = getCountryName(getCountryCodeMapping(), getCountryNameMapping(), _: Int) sqlC.udf.register("getCountryNamePartial", getCountryNamePartial) val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.registerTempTable("cdrs") sqlC.sql("SELECT getCountryNamePartial(countryCode) AS countryName, COUNT(countryCode) AS cCount FROM cdrs GROUP BY countryCode ORDER BY cCount DESC LIMIT 5").show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 134
Source File: L8-4DataFrameCreationSchema.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataframeCreationApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeApp2 <appname> <batchInterval> <hostname> <port> <schemaPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) val schemaJson = scala.io.Source.fromFile(schemaFile).mkString val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } }
Example 135
Source File: L8-14-27DataFrameExamples.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions._ import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamplesApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() cdrs.select("squareId", "timeInterval", "countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").show() cdrs.filter("squareId = 5").show() cdrs.drop("countryCode").show() cdrs.select($"squareId", $"timeInterval", $"countryCode").where($"squareId" === 5).show() cdrs.limit(5).show() cdrs.groupBy("squareId").count().show() cdrs.groupBy("countryCode").avg("internetTrafficActivity").show() cdrs.groupBy("countryCode").max("callOutActivity").show() cdrs.groupBy("countryCode").min("callOutActivity").show() cdrs.groupBy("squareId").sum("internetTrafficActivity").show() cdrs.groupBy("squareId").agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.groupBy("countryCode").sum("internetTrafficActivity").orderBy(desc("SUM(internetTrafficActivity)")).show() cdrs.agg(sum("callOutActivity"), sum("callInActivity"), sum("smsOutActivity"), sum("smsInActivity"), sum("internetTrafficActivity")).show() cdrs.rollup("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/rollup" + rdd.hashCode()) cdrs.cube("squareId", "countryCode").count().orderBy(desc("squareId"), desc("countryCode")).rdd.saveAsTextFile("/tmp/cube" + rdd.hashCode()) cdrs.dropDuplicates(Array("callOutActivity", "callInActivity")).show() cdrs.select("squareId", "countryCode", "internetTrafficActivity").distinct.show() cdrs.withColumn("endTime", cdrs("timeInterval") + 600000).show() cdrs.sample(true, 0.01).show() }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 136
Source File: L8-28DataFrameExamplesOps.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.DataFrame import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CdrDataframeExamples2App { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamples2App <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ var previousCdrs: Option[DataFrame] = None val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF().select("squareId", "countryCode").dropDuplicates() previousCdrs match { case Some(prevCdrs) => cdrs.unionAll(prevCdrs).show() //case Some(prevCdrs) => cdrs.intersect(prevCdrs).show() //case Some(prevCdrs) => cdrs.except(prevCdrs).show() case None => Unit } previousCdrs = Some(cdrs) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
Example 137
Source File: HelloWorldDataSource.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources.helloworld import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{ BaseRelation, DataSourceRegister, RelationProvider, TableScan } import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ Row, SQLContext } class HelloWorldDataSource extends RelationProvider with DataSourceRegister with Serializable { override def shortName(): String = "helloworld" override def hashCode(): Int = getClass.hashCode() override def equals(other: scala.Any): Boolean = other.isInstanceOf[HelloWorldDataSource] override def toString: String = "HelloWorldDataSource" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val path = parameters.get("path") path match { case Some(p) => new HelloWorldRelationProvider(sqlContext, p, parameters) case _ => throw new IllegalArgumentException("Path is required for Tickets datasets") } } } class HelloWorldRelationProvider(val sqlContext: SQLContext, path: String, parameters: Map[String, String]) extends BaseRelation with TableScan { import sqlContext.implicits._ override def schema: StructType = StructType(Array( StructField("key", StringType, nullable = false), StructField("value", StringType, nullable = true) )) override def buildScan(): RDD[Row] = Seq( "path" -> path, "message" -> parameters.getOrElse("message", ""), "name" -> s"Hello ${parameters.getOrElse("name", "")}", "hello_world" -> "Hello World!" ).toDF.rdd }
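A short hedged read example driving the relation above. The "helloworld" short name works when the data source is registered through META-INF/services; otherwise the fully qualified class name can be passed to format. The path value is a placeholder, since the relation only echoes the path string back as a row.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("helloworld-ds").getOrCreate()

// "path" is required by HelloWorldDataSource; "message" and "name" are optional options it echoes back.
val df = spark.read
  .format("helloworld")
  .option("message", "How are you?")
  .option("name", "Spark")
  .load("/tmp/does-not-need-to-exist")   // placeholder: only the string is recorded
df.show(truncate = false)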
Example 138
Source File: BigQueryReader.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.spark.utils.google

import com.google.cloud.hadoop.io.bigquery.{BigQueryConfiguration, GsonBigQueryInputFormat}
import com.google.gson.JsonObject
import com.sope.utils.Logging
import org.apache.hadoop.io.LongWritable
import org.apache.spark.sql.{DataFrame, SQLContext}

// The class declaration was truncated in this snippet; a constructor providing the SQLContext
// and a Hadoop configuration already populated with the BigQueryConfiguration settings is assumed.
class BigQueryReader(sqlContext: SQLContext) extends Logging {

  private val sc = sqlContext.sparkContext
  private val conf = sc.hadoopConfiguration

  def load(): DataFrame = {
    import sqlContext.implicits._
    // Load data from BigQuery.
    val tableData = sc.newAPIHadoopRDD(
      conf,
      classOf[GsonBigQueryInputFormat],
      classOf[LongWritable],
      classOf[JsonObject])
      .map(_._2.toString)
    sqlContext.read.json(tableData.toDS)
  }
}
Example 139
Source File: TestContext.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.etl

import org.apache.spark.sql.{SQLContext, SparkSession}

object TestContext {

  def getSQlContext: SQLContext = {
    System.setProperty(UDFRegistrationClassProperty, "com.sope.etl.custom.CustomUDF")
    System.setProperty(TransformationRegistrationClassProperty, "com.sope.etl.custom.CustomTransformation")
    SparkSession.builder()
      .master("local[*]")
      .appName("SopeUnitTest")
      .getOrCreate().sqlContext
  }
}
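The helper above can be used directly in a test; a minimal sketch:

import com.sope.etl.TestContext

val sqlContext = TestContext.getSQlContext
// The session is local and has the custom UDF/transformation registration properties set.
sqlContext.sql("SELECT 1 AS one").show()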
Example 140
Source File: UDFRegistration.scala From sope with Apache License 2.0 | 5 votes |
package com.sope.etl.register

import com.sope.etl.{SopeETLConfig, getClassInstance}
import com.sope.utils.Logging
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.expressions.UserDefinedFunction

// The enclosing declarations were truncated in this snippet; a trait and companion object
// matching the source file name are assumed here.
trait UDFRegistration {
  def performRegistration(sqlContext: SQLContext): Unit
}

object UDFRegistration extends Logging {

  def registerCustomUDFs(sqlContext: SQLContext): Unit = {
    SopeETLConfig.UDFRegistrationConfig match {
      case Some(classStr) =>
        logInfo(s"Registering custom UDFs from $classStr")
        getClassInstance[UDFRegistration](classStr) match {
          case Some(udfClass) =>
            udfClass.performRegistration(sqlContext)
            logInfo("Successfully registered custom UDFs")
          case _ => logError(s"UDF Registration failed")
        }
      case None => logInfo("No class defined for registering Custom udfs")
    }
  }
}
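A hedged sketch of the kind of class the UDFRegistrationClassProperty (set in the TestContext example above) would point at. It assumes only the performRegistration(sqlContext) hook that the dispatcher above invokes; the UDF itself is purely illustrative.

package com.sope.etl.custom

import org.apache.spark.sql.SQLContext
import com.sope.etl.register.UDFRegistration

// Class name matches the value TestContext sets for UDFRegistrationClassProperty.
class CustomUDF extends UDFRegistration {
  override def performRegistration(sqlContext: SQLContext): Unit = {
    sqlContext.udf.register("shout", (s: String) => if (s == null) null else s.toUpperCase + "!")
  }
}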
Example 141
Source File: TensorflowRelation.scala From ecosystem with Apache License 2.0 | 5 votes |
package org.tensorflow.spark.datasources.tfrecords import org.apache.hadoop.io.{BytesWritable, NullWritable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{Row, SQLContext, SparkSession} import org.tensorflow.example.{SequenceExample, Example} import org.tensorflow.hadoop.io.TFRecordFileInputFormat import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowDecoder case class TensorflowRelation(options: Map[String, String], customSchema: Option[StructType]=None) (@transient val session: SparkSession) extends BaseRelation with TableScan { //Import TFRecords as DataFrame happens here lazy val (tfRdd, tfSchema) = { val rdd = session.sparkContext.newAPIHadoopFile(options("path"), classOf[TFRecordFileInputFormat], classOf[BytesWritable], classOf[NullWritable]) val recordType = options.getOrElse("recordType", "Example") recordType match { case "Example" => val exampleRdd = rdd.map{case (bytesWritable, nullWritable) => Example.parseFrom(bytesWritable.getBytes) } val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(exampleRdd)) val rowRdd = exampleRdd.map(example => DefaultTfRecordRowDecoder.decodeExample(example, finalSchema)) (rowRdd, finalSchema) case "SequenceExample" => val sequenceExampleRdd = rdd.map{case (bytesWritable, nullWritable) => SequenceExample.parseFrom(bytesWritable.getBytes) } val finalSchema = customSchema.getOrElse(TensorFlowInferSchema(sequenceExampleRdd)) val rowRdd = sequenceExampleRdd.map(example => DefaultTfRecordRowDecoder.decodeSequenceExample(example, finalSchema)) (rowRdd, finalSchema) case _ => throw new IllegalArgumentException(s"Unsupported recordType ${recordType}: recordType can be Example or SequenceExample") } } override def sqlContext: SQLContext = session.sqlContext override def schema: StructType = tfSchema override def buildScan(): RDD[Row] = tfRdd }
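A hedged read example for the relation above. Because the snippet does not show the DefaultSource that registers a short name, the data source package is passed to format, which is an assumption; the path is a placeholder, and recordType accepts "Example" or "SequenceExample" as handled by the relation.

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("tfrecords-demo").getOrCreate()

val df = spark.read
  .format("org.tensorflow.spark.datasources.tfrecords")  // package of the data source above
  .option("recordType", "Example")
  .load("/tmp/some.tfrecord")                            // placeholder path
df.printSchema()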
Example 142
Source File: TestUtils.scala From shc with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import java.io.{IOException, File}
import java.nio.ByteBuffer
import java.util

import org.apache.avro.generic.GenericData
import com.google.common.io.Files

import scala.collection.immutable.HashSet
import scala.collection.mutable.ArrayBuffer
import scala.util.Random

object TestUtils {

  def generateRandomByteBuffer(rand: Random, size: Int): ByteBuffer = {
    val bb = ByteBuffer.allocate(size)
    val arrayOfBytes = new Array[Byte](size)
    rand.nextBytes(arrayOfBytes)
    bb.put(arrayOfBytes)
  }

  def generateRandomMap(rand: Random, size: Int): java.util.Map[String, Int] = {
    val jMap = new util.HashMap[String, Int]()
    for (i <- 0 until size) {
      jMap.put(rand.nextString(5), i)
    }
    jMap
  }

  def generateRandomArray(rand: Random, size: Int): util.ArrayList[Boolean] = {
    val vec = new util.ArrayList[Boolean]()
    for (i <- 0 until size) {
      vec.add(rand.nextBoolean())
    }
    vec
  }
}
Example 143
Source File: CurrentPersistenceIdsQuerySourceProvider.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package akka.persistence.jdbc.spark.sql.execution.streaming import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source } import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider } import org.apache.spark.sql.types.{ StringType, StructField, StructType } import org.apache.spark.sql.{ SQLContext, _ } object CurrentPersistenceIdsQuerySourceProvider { val name = "current-persistence-id" val schema: StructType = StructType(Array( StructField("persistence_id", StringType, nullable = false) )) } class CurrentPersistenceIdsQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable { override def sourceSchema( sqlContext: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String] ): (String, StructType) = { CurrentPersistenceIdsQuerySourceProvider.name -> CurrentPersistenceIdsQuerySourceProvider.schema } override def createSource( sqlContext: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String] ): Source = { new CurrentPersistenceIdsQuerySourceImpl(sqlContext, parameters("path")) } override def shortName(): String = CurrentPersistenceIdsQuerySourceProvider.name } class CurrentPersistenceIdsQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String) extends Source with ReadJournalSource { override def schema: StructType = CurrentPersistenceIdsQuerySourceProvider.schema override def getOffset: Option[Offset] = { val offset = maxPersistenceIds println("[CurrentPersistenceIdsQuery]: Returning maximum offset: " + offset) Some(LongOffset(offset)) } override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = { val (start, end) = getStartEnd(_start, _end) println(s"[CurrentPersistenceIdsQuery]: Getting currentPersistenceIds from start: $start, end: $end") import sqlContext.implicits._ persistenceIds(start, end).toDF() } }
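A hedged sketch of reading from the source above with structured streaming. The short name works if the provider is registered via META-INF/services (otherwise pass the fully qualified provider class to format), and the "path" option carries the read journal plugin id; "jdbc-read-journal" is only a typical value for akka-persistence-jdbc.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.streaming.OutputMode

val spark = SparkSession.builder().master("local[*]").appName("persistence-ids-demo").getOrCreate()

val persistenceIds = spark.readStream
  .format("current-persistence-id")            // short name of CurrentPersistenceIdsQuerySourceProvider
  .option("path", "jdbc-read-journal")         // read journal plugin id (placeholder value)
  .load()

persistenceIds.writeStream
  .format("console")
  .outputMode(OutputMode.Append())
  .start()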
Example 144
Source File: CurrentEventsByPersistenceIdQuerySourceProvider.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package akka.persistence.jdbc.spark.sql.execution.streaming import org.apache.spark.sql.execution.streaming.{ LongOffset, Offset, Source } import org.apache.spark.sql.sources.{ DataSourceRegister, StreamSourceProvider } import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{ SQLContext, _ } object CurrentEventsByPersistenceIdQuerySourceProvider { val name = "current-events-by-persistence-id" } class CurrentEventsByPersistenceIdQuerySourceProvider extends StreamSourceProvider with DataSourceRegister with Serializable { override def sourceSchema( sqlContext: SQLContext, schema: Option[StructType], providerName: String, parameters: Map[String, String] ): (String, StructType) = { println(s"[CurrentEventsByPersistenceIdQuerySourceProvider.sourceSchema]: schema: $schema, providerName: $providerName, parameters: $parameters") CurrentEventsByPersistenceIdQuerySourceProvider.name -> schema.get } override def createSource( sqlContext: SQLContext, metadataPath: String, schema: Option[StructType], providerName: String, parameters: Map[String, String] ): Source = { val eventMapperFQCN: String = parameters.get("event-mapper") match { case Some(_eventMapper) => _eventMapper case _ => throw new RuntimeException("No event mapper FQCN") } val pid = (parameters.get("pid"), parameters.get("persistence-id")) match { case (Some(pid), _) => pid case (_, Some(pid)) => pid case _ => throw new RuntimeException("No persistence_id") } new CurrentEventsByPersistenceIdQuerySourceImpl(sqlContext, parameters("path"), eventMapperFQCN, pid, schema.get) } override def shortName(): String = CurrentEventsByPersistenceIdQuerySourceProvider.name } class CurrentEventsByPersistenceIdQuerySourceImpl(val sqlContext: SQLContext, val readJournalPluginId: String, eventMapperFQCN: String, persistenceId: String, override val schema: StructType) extends Source with ReadJournalSource { override def getOffset: Option[Offset] = { val offset = maxEventsByPersistenceId(persistenceId) println("[CurrentEventsByPersistenceIdQuery]: Returning maximum offset: " + offset) Some(LongOffset(offset)) } override def getBatch(_start: Option[Offset], _end: Offset): DataFrame = { val (start, end) = getStartEnd(_start, _end) val df: DataFrame = eventsByPersistenceId(persistenceId, start, end, eventMapperFQCN) println(s"[CurrentEventsByPersistenceIdQuery]: Getting currentPersistenceIds from start: $start, end: $end, DataFrame.count: ${df.count}") df } }
Example 145
Source File: PersonEventMapper.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.mapper

import akka.persistence.jdbc.spark.sql.execution.streaming.EventMapper
import akka.persistence.query.EventEnvelope
import com.github.dnvriend.spark.datasources.person.Person
import org.apache.spark.sql.{ Row, SQLContext }
import org.apache.spark.sql.types._

class PersonEventMapper extends EventMapper {
  override def row(envelope: EventEnvelope, sqlContext: SQLContext): Row = envelope match {
    case EventEnvelope(offset, persistenceId, sequenceNr, Person(id, name, age)) =>
      Row(offset, persistenceId, sequenceNr, id, name, age)
  }

  override def schema: StructType = PersonEventMapper.schema
}

object PersonEventMapper {
  val schema = StructType(Array(
    StructField("offset", LongType, nullable = false),
    StructField("persistence_id", StringType, nullable = false),
    StructField("sequence_number", LongType, nullable = false),
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))
}
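A hedged sketch combining the event mapper above with the current-events-by-persistence-id provider from Example 144. That provider expects an explicit schema, the mapper FQCN, and a persistence id; the persistence id and read journal plugin path shown here are placeholders.

import org.apache.spark.sql.SparkSession
import com.github.dnvriend.spark.mapper.PersonEventMapper

val spark = SparkSession.builder().master("local[*]").appName("events-by-pid-demo").getOrCreate()

val personEvents = spark.readStream
  .format("current-events-by-persistence-id")
  .schema(PersonEventMapper.schema)                       // the provider requires a supplied schema
  .option("event-mapper", "com.github.dnvriend.spark.mapper.PersonEventMapper")
  .option("persistence-id", "person-1")                   // placeholder persistence id
  .option("path", "jdbc-read-journal")                    // read journal plugin id (placeholder)
  .load()

personEvents.printSchema()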
Example 146
Source File: PersonDataSource.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend.spark.datasources.person

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{ Row, SQLContext }

final case class Person(id: Long, name: String, age: Int)

class PersonDataSource extends RelationProvider with DataSourceRegister with Serializable {
  override def shortName(): String = "person"

  override def hashCode(): Int = getClass.hashCode()

  override def equals(other: scala.Any): Boolean = other.isInstanceOf[PersonDataSource]

  override def toString: String = "PersonDataSource"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    val path = parameters.get("path")
    path match {
      case Some(p) => new PersonRelationProvider(sqlContext, p)
      case _ => throw new IllegalArgumentException("Path is required for Person datasets")
    }
  }
}

object PersonRelationProvider {
  val regex = """(id="[\d]+)|(name="[\s\w]+)|(age="[\d]+)""".r

  val schema = StructType(Array(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true),
    StructField("age", IntegerType, nullable = true)
  ))
}

class PersonRelationProvider(val sqlContext: SQLContext, path: String) extends BaseRelation with TableScan with Serializable {
  override def schema: StructType = PersonRelationProvider.schema

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.textFile(path)
      .filter(_.contains("person"))
      .map(line => PersonRelationProvider.regex.findAllIn(line).toList)
      .map { xs =>
        val id = xs.head.replace("id=\"", "").toLong
        val name = xs.drop(1).map(str => str.replace("name=\"", "")).headOption
        val age = xs.drop(2).map(str => str.replace("age=\"", "")).headOption.map(_.toInt)
        Row(id, name, age)
      }
}
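Usage sketch (an assumption, not from the original project): the provider above only needs a path, so a batch read can be as small as the snippet below. The input path is a placeholder; each matching line is parsed with the id/name/age regex in PersonRelationProvider.

import org.apache.spark.sql.SparkSession

// Sketch: use the short name "person" if the data source is registered via
// META-INF/services, otherwise the fully qualified name of PersonDataSource.
val spark = SparkSession.builder().master("local[*]").appName("person-datasource").getOrCreate()
val people = spark.read
  .format("person")
  .load("/tmp/people.xml")     // placeholder path
people.printSchema()           // id: long, name: string, age: int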
Example 147
Source File: VectorTempTable.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.loader.command import org.apache.spark.sql.SQLContext import com.actian.spark_vector.loader.options.UserOptions import com.actian.spark_vector.sql.{ SparkSqlTable, TableRef, TempTable, VectorRelation } object VectorTempTable { private def generateSQLOption(key: String, queries: Seq[String]): Seq[(String, String)] = for { i <- 0 until queries.size } yield { (s"${key}${i}", s"${queries(i)}") } private def parseOptions(config: UserOptions): Map[String, String] = { val base = Seq("host" -> config.vector.host, "instance" -> config.vector.instance, "database" -> config.vector.database, "port" -> config.vector.port, "table" -> config.vector.targetTable) val optional = Seq(config.vector.user.map("user" -> _), config.vector.password.map("password" -> _)).flatten ++ config.vector.preSQL.map(generateSQLOption("loadpresql", _)).getOrElse(Nil) ++ config.vector.postSQL.map(generateSQLOption("loadpostsql", _)).getOrElse(Nil) (base ++ optional).toMap } def register(config: UserOptions, sqlContext: SQLContext): SparkSqlTable = { val params = parseOptions(config) val tableName = params("table") val df = sqlContext.baseRelationToDataFrame(VectorRelation(TableRef(params), sqlContext, params)) TempTable(tableName, df) } }
Example 148
Source File: SparkOperationTestPimpers.scala From sparkplug with MIT License | 5 votes |
package springnz.sparkplug.testkit import com.typesafe.scalalogging.{ LazyLogging, Logger } import org.apache.spark.rdd.RDD import org.apache.spark.sql.{ DataFrame, SQLContext } import springnz.sparkplug.core.SparkOperation import springnz.sparkplug.util.Logging import scala.reflect.ClassTag object SparkOperationTestPimpers extends LazyLogging { private def persistTestResource[A: ClassTag](rdd: RDD[A], rddName: String, overwrite: Boolean = false)( implicit projectName: ProjectName): RDD[A] = { val path = RDDPersister.getPath(projectName.name, rddName) if (overwrite || (!overwrite && !path.exists)) { if (path.exists) { logger.info(s"deleting existing RDD at ${path.pathAsString}") path.delete() } RDDPersister.persistRDD(path.pathAsString, rdd) } else { // (!overwrite && path.exists) logger.info(s"Not persisting RDD that already exists at path [${path.pathAsString}]") rdd } } class RDDExtensions[A: ClassTag](operation: SparkOperation[RDD[A]]) { import RDDSamplers._ def saveTo(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)( implicit projectName: ProjectName): SparkOperation[RDD[A]] = operation.map { rdd ⇒ val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite = false) sampled } def sourceFrom(rddName: String, sampler: RDD[A] ⇒ RDD[A] = identitySampler)( implicit projectName: ProjectName): SparkOperation[RDD[A]] = SparkOperation { ctx ⇒ val path = RDDPersister.getPath(projectName.name, rddName) if (path.exists) ctx.objectFile[A](path.pathAsString) else { val rdd = operation.run(ctx) val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite = false) sampled } } } class DataFrameExtensions(operation: SparkOperation[DataFrame]) { import RDDSamplers._ def saveTo(rddName: String, overwrite: Boolean = false, sampler: RDD[String] ⇒ RDD[String] = identitySampler)( implicit projectName: ProjectName): SparkOperation[DataFrame] = operation.map { df ⇒ val rdd: RDD[String] = df.toJSON val sampled = sampler(rdd) persistTestResource(sampled, rddName, overwrite) val sqlContext = new SQLContext(sampled.sparkContext) sqlContext.read.json(sampled) } def sourceFrom(dataFrameName: String, overwrite: Boolean = false, sampler: RDD[String] ⇒ RDD[String] = rdd ⇒ rdd)( implicit projectName: ProjectName, log: Logger): SparkOperation[DataFrame] = SparkOperation { ctx ⇒ val path = RDDPersister.getPath(projectName.name, dataFrameName) val sampledRDD = if (path.exists) ctx.objectFile[String](path.pathAsString) else { val df = operation.run(ctx) val rdd: RDD[String] = df.toJSON val sampled = sampler(rdd) persistTestResource(sampled, dataFrameName, overwrite) sampled } val sqlContext = new SQLContext(ctx) sqlContext.read.json(sampledRDD) } } }
Example 149
Source File: BigFileDatasource.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import java.net.URI import java.util.ServiceLoader import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import io.projectglow.common.{GlowLogging, WithUtils} def write(rdd: RDD[Array[Byte]], path: String) { val uri = new URI(path) uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match { case Some(uploader) => uploader.upload(rdd, path) case None => logger.info(s"Could not find a parallel uploader for $path, uploading from the driver") writeFileFromDriver(new Path(uri), rdd) } } private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = { val sc = byteRdd.sparkContext val fs = path.getFileSystem(sc.hadoopConfiguration) WithUtils.withCloseable(fs.create(path)) { stream => WithUtils.withCachedRDD(byteRdd) { cachedRdd => cachedRdd.count() cachedRdd.toLocalIterator.foreach { chunk => stream.write(chunk) } } } } }
Example 150
Source File: MLlibTestSparkContext.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.util import java.io.File import org.apache.spark.SparkContext import org.apache.spark.sql.types.UDTRegistration import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} import org.apache.spark.util.{SparkUtil, Utils} import org.scalatest.Suite trait MLlibTestSparkContext extends TempDirectory { self: Suite => @transient var spark: SparkSession = _ @transient var sc: SparkContext = _ @transient var checkpointDir: String = _ override def beforeAll() { super.beforeAll() SparkUtil.UDTRegister("org.apache.spark.linalg.Vector", "org.apache.spark.linalg.VectorUDT") SparkUtil.UDTRegister("org.apache.spark.linalg.DenseVector", "org.apache.spark.linalg.VectorUDT") SparkUtil.UDTRegister("org.apache.spark.linalg.SparseVector", "org.apache.spark.linalg.VectorUDT") SparkUtil.UDTRegister("org.apache.spark.linalg.Matrix", "org.apache.spark.linalg.MatrixUDT") SparkUtil.UDTRegister("org.apache.spark.linalg.DenseMatrix", "org.apache.spark.linalg.MatrixUDT") SparkUtil.UDTRegister("org.apache.spark.linalg.SparseMatrix", "org.apache.spark.linalg.MatrixUDT") spark = SparkSession.builder .master("local[2]") .appName("MLlibUnitTest") .getOrCreate() sc = spark.sparkContext checkpointDir = SparkUtil.createDirectory(tempDir.getCanonicalPath, "checkpoints").toString sc.setCheckpointDir(checkpointDir) } override def afterAll() { try { SparkUtil.deleteRecursively(new File(checkpointDir)) SparkSession.clearActiveSession() if (spark != null) { spark.stop() } spark = null } finally { super.afterAll() } } /** * A helper object for importing SQL implicits. * * Note that the alternative of importing `spark.implicits._` is not possible here. * This is because we create the `SQLContext` immediately before the first test is run, * but the implicits import is needed in the constructor. */ protected object testImplicits extends SQLImplicits { protected override def _sqlContext: SQLContext = self.spark.sqlContext } }
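A sketch of how a test might mix this trait in; the suite name and assertion are invented.

import org.scalatest.FunSuite
import com.tencent.angel.sona.ml.util.MLlibTestSparkContext

// Sketch: spark and sc are created in beforeAll by MLlibTestSparkContext.
class MyEstimatorSuite extends FunSuite with MLlibTestSparkContext {
  test("builds a DataFrame from a local sequence") {
    import testImplicits._
    val df = Seq((1, "a"), (2, "b")).toDF("id", "label")
    assert(df.count() === 2)
  }
}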
Example 151
Source File: SharedSQLContext.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.test import scala.concurrent.duration._ import org.scalatest.BeforeAndAfterEach import org.scalatest.concurrent.Eventually import org.apache.spark.{DebugFilesystem, SparkConf} import org.apache.spark.sql.{SparkSession, SQLContext} protected override def afterAll(): Unit = { super.afterAll() if (_spark != null) { _spark.sessionState.catalog.reset() _spark.stop() _spark = null } } protected override def beforeEach(): Unit = { super.beforeEach() DebugFilesystem.clearOpenStreams() } protected override def afterEach(): Unit = { super.afterEach() // files can be closed from other threads, so wait a bit // normally this doesn't take more than 1s eventually(timeout(10.seconds)) { DebugFilesystem.assertNoOpenStreams() } } }
Example 152
Source File: DefaultSource.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets import java.io.File import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { createRelation(sqlContext, parameters, null) } private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = { val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets.")) val elems = path.split('/') if (elems.length < 2) throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'") (elems(0), elems(1)) } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) val context = createSpreadsheetContext(parameters) createRelation(sqlContext, context, spreadsheetName, worksheetName, schema) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) implicit val context = createSpreadsheetContext(parameters) val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName) if(!spreadsheet.isDefined) throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName") spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData) createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema) } private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = { val serviceAccountIdOption = parameters.get("serviceAccountId") val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH) SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: StructType): SpreadsheetRelation = if (schema == null) { createRelation(sqlContext, context, spreadsheetName, worksheetName, None) } else { createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: Option[StructType]): SpreadsheetRelation = SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext) }
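Usage sketch with placeholder spreadsheet, worksheet and credential values; the path must follow the "<spreadsheet>/<worksheet>" form enforced by pathToSheetNames above.

import org.apache.spark.sql.SparkSession

// Sketch only: reads one worksheet and writes another through the provider above.
val spark = SparkSession.builder().master("local[*]").appName("sheets").getOrCreate()

val df = spark.read
  .format("com.github.potix2.spark.google.spreadsheets")
  .option("serviceAccountId", "my-account@developer.gserviceaccount.com")   // placeholder
  .option("credentialPath", "/path/to/credential.p12")                      // placeholder
  .load("MySpreadsheet/Sheet1")

df.write
  .format("com.github.potix2.spark.google.spreadsheets")
  .option("serviceAccountId", "my-account@developer.gserviceaccount.com")
  .option("credentialPath", "/path/to/credential.p12")
  .save("MySpreadsheet/Sheet2")   // adds a new worksheet holding the DataFrame's rows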
Example 153
Source File: SpreadsheetRelation.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService.SparkSpreadsheetContext import com.github.potix2.spark.google.spreadsheets.util._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation, TableScan} import org.apache.spark.sql.types.{StringType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Row, SQLContext} case class SpreadsheetRelation protected[spark] ( context:SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, userSchema: Option[StructType] = None)(@transient val sqlContext: SQLContext) extends BaseRelation with TableScan with InsertableRelation { import com.github.potix2.spark.google.spreadsheets.SparkSpreadsheetService._ override def schema: StructType = userSchema.getOrElse(inferSchema()) private lazy val aWorksheet: SparkWorksheet = findWorksheet(spreadsheetName, worksheetName)(context) match { case Right(aWorksheet) => aWorksheet case Left(e) => throw e } private lazy val rows: Seq[Map[String, String]] = aWorksheet.rows private[spreadsheets] def findWorksheet(spreadsheetName: String, worksheetName: String)(implicit ctx: SparkSpreadsheetContext): Either[Throwable, SparkWorksheet] = for { sheet <- findSpreadsheet(spreadsheetName).toRight(new RuntimeException(s"no such spreadsheet: $spreadsheetName")).right worksheet <- sheet.findWorksheet(worksheetName).toRight(new RuntimeException(s"no such worksheet: $worksheetName")).right } yield worksheet override def buildScan(): RDD[Row] = { val aSchema = schema sqlContext.sparkContext.makeRDD(rows).mapPartitions { iter => iter.map { m => var index = 0 val rowArray = new Array[Any](aSchema.fields.length) while(index < aSchema.fields.length) { val field = aSchema.fields(index) rowArray(index) = if (m.contains(field.name)) { TypeCast.castTo(m(field.name), field.dataType, field.nullable) } else { null } index += 1 } Row.fromSeq(rowArray) } } } override def insert(data: DataFrame, overwrite: Boolean): Unit = { if(!overwrite) { sys.error("Spreadsheet tables only support INSERT OVERWRITE for now.") } findWorksheet(spreadsheetName, worksheetName)(context) match { case Right(w) => w.updateCells(data.schema, data.collect().toList, Util.toRowData) case Left(e) => throw e } } private def inferSchema(): StructType = StructType(aWorksheet.headers.toList.map { fieldName => StructField(fieldName, StringType, nullable = true) }) }
Example 154
Source File: TestSparkContext.scala From spark-images with Apache License 2.0 | 5 votes |
package org.apache.spark.image import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types._ import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.{Row, DataFrame, SQLContext, SparkSession} import scala.reflect.runtime.universe._ import org.scalatest.{FunSuite, BeforeAndAfterAll} // This context is used for all tests in this project trait TestSparkContext extends BeforeAndAfterAll { self: FunSuite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ @transient lazy val spark: SparkSession = { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Spark-Image-Test") .set("spark.ui.port", "4079") .set("spark.sql.shuffle.partitions", "4") // makes small tests much faster val sess = SparkSession.builder().config(conf).getOrCreate() sess.sparkContext.setLogLevel("WARN") sess } override def beforeAll() { super.beforeAll() sc = spark.sparkContext sqlContext = spark.sqlContext import spark.implicits._ } override def afterAll() { sqlContext = null if (sc != null) { sc.stop() } sc = null super.afterAll() } def makeDF[T: TypeTag](xs: Seq[T], col: String): DataFrame = { sqlContext.createDataFrame(xs.map(Tuple1.apply)).toDF(col) } def compareRows(r1: Array[Row], r2: Seq[Row]): Unit = { val a = r1.sortBy(_.toString()) val b = r2.sortBy(_.toString()) assert(a === b) } }
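As with the other shared-context traits in this listing, a test mixes the trait in and uses the lazily created session; a small sketch with an invented suite name.

import org.scalatest.FunSuite
import org.apache.spark.image.TestSparkContext

// Sketch: sc and sqlContext are populated by beforeAll in TestSparkContext.
class ImageHelpersSuite extends FunSuite with TestSparkContext {
  test("makeDF builds a single-column DataFrame") {
    val df = makeDF(Seq(1, 2, 3), "value")
    assert(df.columns.toSeq === Seq("value"))
    assert(df.count() === 3)
  }
}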
Example 155
Source File: KuduSinkProvider.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu import org.apache.spark.sql.SQLContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.sources.{DataSourceRegister, StreamSinkProvider} import org.apache.spark.sql.streaming.OutputMode class KuduSinkProvider extends StreamSinkProvider with DataSourceRegister { override def createSink(sqlContext: SQLContext, parameters: Map[String, String], partitionColumns: Seq[String], outputMode: OutputMode): Sink = { require(outputMode == OutputMode.Update, "only 'update' OutputMode is supported") KuduSink.withDefaultContext(sqlContext, parameters) } override def shortName(): String = "kudu" }
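Usage sketch: attaching the sink to a streaming query. The Kudu master, table and checkpoint values are placeholders, and `events` stands for an existing streaming DataFrame; the option keys match the ones read by KuduSink in the next example.

import org.apache.spark.sql.streaming.OutputMode

// Sketch: only OutputMode.Update is accepted, as required by createSink above.
val query = events.writeStream
  .format("kudu")                                              // or the provider's fully qualified class name
  .option("kudu.master", "kudu-master:7051")                   // placeholder
  .option("kudu.table", "impala::default.events")              // placeholder
  .option("retries", "3")                                      // optional, defaults to 1
  .option("checkpointLocation", "/tmp/kudu-sink-checkpoint")   // placeholder
  .outputMode(OutputMode.Update())
  .start()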
Example 156
Source File: KuduSink.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.{DataFrame, SQLContext} import org.slf4j.LoggerFactory import scala.util.control.NonFatal object KuduSink { def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) = new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters) } class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink { private val logger = LoggerFactory.getLogger(getClass) private var kuduContext = initKuduContext private val tablename = parameters("kudu.table") private val retries = parameters.getOrElse("retries", "1").toInt require(retries >= 0, "retries must be non-negative") logger.info(s"Created Kudu sink writing to table $tablename") override def addBatch(batchId: Long, data: DataFrame): Unit = { for (attempt <- 0 to retries) { try { kuduContext.upsertRows(data, tablename) return } catch { case NonFatal(e) => if (attempt < retries) { logger.warn("Kudu upsert error, retrying...", e) kuduContext = initKuduContext } else { logger.error("Kudu upsert error, exhausted", e) throw e } } } } }
Example 157
Source File: ExtractorsSpec.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package test import com.memsql.spark.etl.api.UserExtractConfig import com.memsql.spark.etl.utils.ByteUtils import spray.json.JsString import com.memsql.streamliner.starter.BasicExtractor import org.apache.spark.streaming._ import org.apache.spark.sql.SQLContext class ExtractorsSpec extends UnitSpec with LocalSparkContext { val emptyConfig = UserExtractConfig(class_name = "Test", value = new JsString("empty")) val logger = new TestLogger("test") var ssc: StreamingContext = _ var sqlContext: SQLContext = _ override def beforeEach(): Unit = { super.beforeEach() ssc = new StreamingContext(sc, Seconds(1)) sqlContext = new SQLContext(sc) } "BasicExtractor" should "emit a constant DataFrame" in { val extract = new BasicExtractor val maybeDf = extract.next(ssc, 1, sqlContext, emptyConfig, 1, logger) assert(maybeDf.isDefined) val total = maybeDf.get.select("number").rdd.map(r => r(0).asInstanceOf[Int]).sum() assert(total == 15) } }
Example 158
Source File: QueryLshTest.scala From cosine-lsh-join-spark with MIT License | 5 votes |
package com.soundcloud.lsh import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed._ import org.apache.spark.sql.{SQLContext, SparkSession} import org.scalatest.{FunSuite, Matchers} class QueryLshTest extends FunSuite with SparkLocalContext with Matchers { lazy val lsh = new QueryLsh( minCosineSimilarity = -1.0, dimensions = 100, numNeighbours = 10, maxMatches = 1000, rounds = 10)(new SQLContext(sc).sparkSession) test("join bitmap") { val bitseq = Seq( new BitSet(2), new BitSet(2) ) bitseq(0).set(0) bitseq(1).set(1) val rdd = sc.parallelize(bitseq.map(bitSetToString(_)).zipWithIndex) val joined = rdd.join(rdd).values.collect joined shouldBe Seq((0,0), (1,1)) } test("join") { val rows = Seq( IndexedRow(0, Vectors.dense(1, 1, 0, 0)), IndexedRow(1, Vectors.dense(1, 2, 0, 0)), IndexedRow(2, Vectors.dense(0, 1, 4, 2)) ) val inputMatrix = new IndexedRowMatrix(sc.parallelize(rows)) val got = lsh.join(inputMatrix, inputMatrix) val expected = Seq( (0, 0), (1, 1), (2, 2) ) val gotIndex = got.entries.collect.map { entry: MatrixEntry => (entry.i, entry.j) } gotIndex.sorted should be(expected.sorted) } test("distinct") { val matrix = Seq( new MatrixEntry(1, 2, 3.4), new MatrixEntry(1, 2, 3.5), new MatrixEntry(1, 3, 3.4) ) val got = distinct(sc.parallelize(matrix)).collect val expected = Seq( matrix(0), matrix(2) ) got should be(expected) } }
Example 159
Source File: DefaultSource.scala From spark-google-adwords with Apache License 2.0 | 5 votes |
package com.crealytics.google.adwords import com.google.api.ads.common.lib.auth.OfflineCredentials import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources._ class DefaultSource extends RelationProvider { // The default User Agent private final val DEFAULT_USER_AGENT = "Spark" // Default During Clause: 30 days private final val DEFAULT_DURING = "LAST_30_DAYS" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): AdWordsRelation = { // gather parameters val clientId = checkParameter(parameters, "clientId") val clientSecret = checkParameter(parameters, "clientSecret") val developerToken = checkParameter(parameters, "developerToken") val refreshToken = checkParameter(parameters, "refreshToken") val reportType = checkParameter(parameters, "reportType") val clientCustomerId = checkParameter(parameters, "clientCustomerId") val userAgent = parameterOrDefault(parameters, "userAgent", DEFAULT_USER_AGENT) val duringStmt = parameterOrDefault(parameters, "during", DEFAULT_DURING) // Our OAuth2 Credential val credential = new OfflineCredentials.Builder() .forApi(OfflineCredentials.Api.ADWORDS) .withClientSecrets(clientId, clientSecret) .withRefreshToken(refreshToken) .build .generateCredential // create relation AdWordsRelation(credential, developerToken, clientCustomerId, userAgent, reportType, duringStmt)(sqlContext) } // Forces a Parameter to exist, otherwise an exception is thrown. private def checkParameter(map: Map[String, String], param: String) = { if (!map.contains(param)) { throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") } else { map.apply(param) } } // Gets the Parameter if it exists, otherwise returns the default argument private def parameterOrDefault(map: Map[String, String], param: String, default: String) = map.getOrElse(param, default) }
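Usage sketch with placeholder credentials; the required option names are exactly the ones checked by checkParameter above, and the report type is just an example value.

import org.apache.spark.sql.SparkSession

// Sketch only: every value below is a placeholder.
val spark = SparkSession.builder().master("local[*]").appName("adwords").getOrCreate()
val report = spark.read
  .format("com.crealytics.google.adwords")
  .option("clientId", "<client-id>")
  .option("clientSecret", "<client-secret>")
  .option("developerToken", "<developer-token>")
  .option("refreshToken", "<refresh-token>")
  .option("clientCustomerId", "123-456-7890")
  .option("reportType", "KEYWORDS_PERFORMANCE_REPORT")
  .option("during", "LAST_7_DAYS")    // optional, defaults to LAST_30_DAYS
  .load()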
Example 160
Source File: RddToDataFrame.scala From spark-sframe with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.turi import org.graphlab.create.GraphLabUtil import org.apache.spark.sql.{SQLContext, Row, DataFrame} import org.apache.spark.rdd.RDD import scala.collection.JavaConversions._ import org.apache.spark.sql.types._ import scala.collection.mutable.ListBuffer import scala.collection.mutable.ArrayBuffer import scala.collection.immutable.Map import java.util.HashMap import java.util.ArrayList import java.util.{Date,GregorianCalendar} import java.sql.Date object EvaluateRDD { def inferSchema(obj: Any): DataType = { if(obj.isInstanceOf[Int]) { IntegerType } else if(obj.isInstanceOf[String]) { StringType } else if(obj.isInstanceOf[Double]) { DoubleType } else if(obj.isInstanceOf[Long]) { LongType } else if(obj.isInstanceOf[Float]) { FloatType } else if(obj.isInstanceOf[Map[_,_]]) { MapType(inferSchema(obj.asInstanceOf[Map[_,_]].head._1),inferSchema(obj.asInstanceOf[Map[_,_]].head._2)) } else if(obj.isInstanceOf[java.util.HashMap[_,_]]) { MapType(inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._1),inferSchema(obj.asInstanceOf[java.util.HashMap[_,_]].head._2)) } else if(obj.isInstanceOf[Array[_]]) { ArrayType(inferSchema(obj.asInstanceOf[Array[_]](0))) } else if(obj.isInstanceOf[java.util.ArrayList[_]]) { ArrayType(inferSchema(obj.asInstanceOf[java.util.ArrayList[_]](0))) } else if(obj.isInstanceOf[java.util.GregorianCalendar]) { TimestampType } else if(obj.isInstanceOf[java.util.Date] || obj.isInstanceOf[java.sql.Date]) { DateType } else { StringType } } def toScala(obj: Any): Any = { if (obj.isInstanceOf[java.util.HashMap[_,_]]) { val jmap = obj.asInstanceOf[java.util.HashMap[_,_]] jmap.map { case (k,v) => toScala(k) -> toScala(v) }.toMap } else if(obj.isInstanceOf[java.util.ArrayList[_]]) { val buf = ArrayBuffer[Any]() val jArray = obj.asInstanceOf[java.util.ArrayList[_]] for(item <- jArray) { buf += toScala(item) } buf.toArray } else if(obj.isInstanceOf[java.util.GregorianCalendar]) { new java.sql.Timestamp(obj.asInstanceOf[java.util.GregorianCalendar].getTime().getTime()) } else { obj } } def toSparkDataFrame(sqlContext: SQLContext, rdd: RDD[java.util.HashMap[String,_]]): DataFrame = { val scalaRDD = rdd.map(l => toScala(l)) val rowRDD = scalaRDD.map(l => Row.fromSeq(l.asInstanceOf[Map[_,_]].values.toList)) var sample_data: java.util.HashMap[String,_] = rdd.take(1)(0) var schema_list: ListBuffer[StructField] = new ListBuffer[StructField]() for ((name,v) <- sample_data) { schema_list.append(StructField(name,inferSchema(v))) } sqlContext.createDataFrame(rowRDD,StructType(schema_list)) } }
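A sketch of feeding the converter, assuming an existing SparkContext `sc` and SQLContext `sqlContext`; the sample records are made up. The schema is inferred from the first map in the RDD, as inferSchema above shows.

import java.util.{ HashMap => JHashMap }
import org.apache.spark.turi.EvaluateRDD

// Sketch: build java.util.HashMap rows and convert them to a DataFrame.
def personMap(id: Long, name: String): JHashMap[String, Any] = {
  val m = new JHashMap[String, Any]()
  m.put("id", id)
  m.put("name", name)
  m
}

val data: Seq[JHashMap[String, _]] = Seq(personMap(1L, "alice"), personMap(2L, "bob"))
val df = EvaluateRDD.toSparkDataFrame(sqlContext, sc.parallelize(data))
df.show()   // columns id (long) and name (string), in the map's iteration order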
Example 161
Source File: SparkFunSuite.scala From spark-corenlp with GNU General Public License v3.0 | 5 votes |
package com.databricks.spark.corenlp import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, FunSuite} trait SparkFunSuite extends FunSuite with BeforeAndAfterAll { @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll(): Unit = { sc = SparkContext.getOrCreate( new SparkConf() .setMaster("local[2]") .setAppName(this.getClass.getSimpleName) ) sqlContext = SQLContext.getOrCreate(sc) } override def afterAll(): Unit = { sc.stop() sc = null sqlContext = null } }
Example 162
Source File: DefaultSource.scala From spark-athena with Apache License 2.0 | 5 votes |
package io.github.tmheo.spark.athena import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources._ class DefaultSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn.isEmpty) { assert(lowerBound.isEmpty && upperBound.isEmpty) null } else { assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty) JDBCPartitioningInfo( partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } }
Example 163
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.etl.machinelearning.kudu import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
Example 164
Source File: SparkSqlRunner.scala From amaterasu with Apache License 2.0 | 5 votes |
package org.apache.amaterasu.executor.execution.actions.runners.spark.SparkSql import java.io.File import org.apache.amaterasu.common.execution.actions.Notifier import org.apache.amaterasu.common.logging.Logging import org.apache.amaterasu.common.runtime.Environment import org.apache.commons.io.FilenameUtils import org.apache.spark.SparkContext import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} def findFileType(folderName: File): Array[String] = { // get all the files from a directory val files: Array[File] = folderName.listFiles() val extensions: Array[String] = files.map(file => FilenameUtils.getExtension(file.toString)) extensions } } object SparkSqlRunner { def apply(env: Environment, jobId: String, actionName: String, notifier: Notifier, sc: SparkContext): SparkSqlRunner = { val sparkSqlRunnerObj = new SparkSqlRunner sparkSqlRunnerObj.env = env sparkSqlRunnerObj.jobId = jobId sparkSqlRunnerObj.actionName = actionName sparkSqlRunnerObj.notifier = notifier sparkSqlRunnerObj.sc = sc sparkSqlRunnerObj.spark = SparkSession.builder().config(sc.getConf).enableHiveSupport().getOrCreate() sparkSqlRunnerObj } }
Example 165
Source File: ModelSparkContext.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.conf.spark import java.io.File import com.flaminem.flamy.conf.{Flamy, FlamyContext, FlamyGlobalContext} import com.flaminem.flamy.model.files.PresetsFile import com.flaminem.flamy.parsing.hive.QueryUtils import org.apache.spark.SparkConf import org.apache.spark.sql.{SQLContext, SparkSession} object ModelSparkContext { val LOCAL_METASTORE: String = "local_spark_metastore" val LOCAL_WAREHOUSE: String = "local_spark_warehouse" val LOCAL_DERBY_LOG: String = "derby.log" private var sparkConf: SparkConf = _ def localMetastore(context: FlamyContext): String = { FlamyGlobalContext.getUniqueRunDir + "/" + LOCAL_METASTORE } def localWarehouse(context: FlamyContext): String = { FlamyGlobalContext.getUniqueRunDir + "/" + LOCAL_WAREHOUSE } private def init(context: FlamyContext) = { sparkConf = new SparkConf() .setAppName(Flamy.name) .setMaster("local") .set("spark.sql.crossJoin.enabled", "true") .set("derby.stream.error.file", FlamyGlobalContext.getUniqueRunDir + "/" + LOCAL_DERBY_LOG) .set("javax.jdo.option.ConnectionURL", "jdbc:derby:;databaseName=" + localMetastore(context) + ";create=true") .set("hive.metastore.warehouse.dir", localWarehouse(context)) } private lazy val _spark = { SparkSession.builder() .enableHiveSupport() .config(sparkConf) .getOrCreate() } def spark(context: FlamyContext): SparkSession = { init(context) _spark } def getSparkSQLContext(context: FlamyContext): SQLContext = { val sqlContext: SQLContext = ModelSparkContext.spark(context).sqlContext.newSession() runPresets(sqlContext, context) sqlContext } private def addUDFJars(sqlContext: SQLContext, context: FlamyContext): Unit = { context.getUdfJarPaths.foreach{ path => sqlContext.sql(s"ADD JAR $path") } } private def runPresets(sqlContext: SQLContext, context: FlamyContext): Unit = { addUDFJars(sqlContext, context) context.HIVE_PRESETS_PATH.getProperty match { case Some(path) => val file = new PresetsFile(new File(path)) QueryUtils.cleanAndSplitQuery(file.text).foreach{ query => val replacedQuery = context.getVariables.replaceInText(query) sqlContext.sql(replacedQuery) } case _ => System.out.println("No presets to run") } } }
Example 166
Source File: Check.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.commands import com.flaminem.flamy.commands.utils.FlamySubcommand import com.flaminem.flamy.conf.spark.ModelSparkContext import com.flaminem.flamy.conf.{Environment, FlamyContext, FlamyGlobalOptions} import com.flaminem.flamy.exec.FlamyRunner import com.flaminem.flamy.exec.files.{FileRunner, ItemFileAction} import com.flaminem.flamy.exec.hive.{HivePartitionFetcher, ModelHivePartitionFetcher} import com.flaminem.flamy.exec.utils._ import com.flaminem.flamy.exec.utils.io.FlamyOutput import com.flaminem.flamy.graph.TableGraph import com.flaminem.flamy.model._ import com.flaminem.flamy.model.core.Model import com.flaminem.flamy.model.files.FilePath import com.flaminem.flamy.model.names.ItemName import org.apache.spark.sql.SQLContext import org.rogach.scallop.{ScallopConf, ScallopOption, Subcommand} import scala.language.reflectiveCalls val runGraph: TableGraph = baseGraph.subGraph(items()) val dryRunner: FlamyRunner = FlamyRunner(context) println("Creating schemas and tables ...") try { dryRunner.checkAll(baseGraph) } finally{ //TODO: For some strange reason, closing the connection here will result in ClassNotFoundErrors for udfs in the RunActions... // dryRunner.close() } FlamyOutput.out.info("Running Populates ...") dryRunner.populateAll(runGraph.model, context) dryRunner.close() ReturnStatus(success = dryRunner.getStats.getFailCount==0) } } override def doCommand(globalOptions: FlamyGlobalOptions, subCommands: List[ScallopConf]): ReturnStatus = { subCommands match { case (command: FlamySubcommand)::Nil => command.doCommand(globalOptions, Nil) case Nil => throw new IllegalArgumentException("A subcommand is expected") case _ => printHelp() ReturnFailure } } }
Example 168
Source File: TransformersSpec.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package test import com.memsql.spark.etl.api.UserTransformConfig import com.memsql.spark.etl.utils.ByteUtils import com.memsql.streamliner.starter.BasicTransformer import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.types._ import spray.json.JsString class TransformersSpec extends UnitSpec with LocalSparkContext { val emptyConfig = UserTransformConfig(class_name = "Test", value = new JsString("empty")) val logger = new TestLogger("test") var sqlContext: SQLContext = _ override def beforeEach(): Unit = { super.beforeEach() sqlContext = new SQLContext(sc) } "BasicTransformer" should "only emit even numbers" in { val transform = new BasicTransformer val schema = StructType(StructField("number", IntegerType, false) :: Nil) val sampleData = List(1,2,3) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val dfIn = sqlContext.createDataFrame(rowRDD, schema) val df = transform.transform(sqlContext, dfIn, emptyConfig, logger) assert(df.schema == schema) assert(df.first == Row(2)) assert(df.count == 1) } "BasicTransformer" should "only accept IntegerType fields" in { val transform = new BasicTransformer val schema = StructType(StructField("column", StringType, false) :: Nil) val sampleData = List(1,2,3) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val dfIn = sqlContext.createDataFrame(rowRDD, schema) val e = intercept[IllegalArgumentException] { transform.transform(sqlContext, dfIn, emptyConfig, logger) } assert(e.getMessage() == "The first column of the input DataFrame should be IntegerType") } }
Example 169
Source File: Transformers.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package com.memsql.streamliner.starter import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.apache.spark.sql.types._ import com.memsql.spark.etl.api.{Transformer, PhaseConfig} import com.memsql.spark.etl.utils.PhaseLogger // A helper object to extract the first column of a schema object ExtractFirstStructField { def unapply(schema: StructType): Option[(String, DataType, Boolean, Metadata)] = schema.fields match { case Array(first: StructField, _*) => Some((first.name, first.dataType, first.nullable, first.metadata)) } } // This transformer expects an input DataFrame and returns it class BasicTransformer extends Transformer { def transform(sqlContext: SQLContext, df: DataFrame, config: PhaseConfig, logger: PhaseLogger): DataFrame = { logger.info("transforming the DataFrame") // check that the first column is of type IntegerType and return its name val column = df.schema match { case ExtractFirstStructField(name: String, dataType: IntegerType, _, _) => name case _ => throw new IllegalArgumentException("The first column of the input DataFrame should be IntegerType") } // filter the dataframe, returning only even numbers df.filter(s"$column % 2 = 0") } }
Example 170
Source File: Extractors.scala From streamliner-starter with Apache License 2.0 | 5 votes |
package com.memsql.streamliner.starter import org.apache.spark.sql.{DataFrame, Row, SQLContext} import org.apache.spark.sql.types._ import org.apache.spark.streaming.StreamingContext import com.memsql.spark.etl.api.{Extractor, PhaseConfig} import com.memsql.spark.etl.utils.PhaseLogger // This extract just returns a static range of 5 integers each batch interval class BasicExtractor extends Extractor { override def next(ssc: StreamingContext, time: Long, sqlContext: SQLContext, config: PhaseConfig, batchInterval: Long, logger: PhaseLogger): Option[DataFrame] = { logger.info("extracting a constant sequence DataFrame") val schema = StructType(StructField("number", IntegerType, false) :: Nil) val sampleData = List(1,2,3,4,5) val rowRDD = sqlContext.sparkContext.parallelize(sampleData).map(Row(_)) val df = sqlContext.createDataFrame(rowRDD, schema) Some(df) } }
Example 171
Source File: ExtensionBuilderSuite.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server import java.net.URL import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.sql.{SparkSession, SQLContext} import org.apache.spark.util.Utils class ExtensionBuilderSuite extends SparkFunSuite with BeforeAndAfterAll { var _sqlContext: SQLContext = _ // TODO: This method works only in Java8 private def addJarInClassPath(jarURLString: String): Unit = { // val cl = ClassLoader.getSystemClassLoader val cl = Utils.getSparkClassLoader val clazz = cl.getClass val method = clazz.getSuperclass.getDeclaredMethod("addURL", Seq(classOf[URL]): _*) method.setAccessible(true) method.invoke(cl, Seq[Object](new URL(jarURLString)): _*) } protected override def beforeAll(): Unit = { super.beforeAll() // Adds a jar for an extension builder val jarPath = "src/test/resources/extensions_2.12_3.0.0-preview2_0.1.7-spark3.0-SNAPSHOT.jar" val jarURL = s"file://${System.getProperty("user.dir")}/$jarPath" // sqlContext.sparkContext.addJar(jarURL) addJarInClassPath(jarURL) val conf = new SparkConf(loadDefaults = true) .setMaster("local[1]") .setAppName("spark-sql-server-test") .set("spark.sql.server.extensions.builder", "org.apache.spark.ExtensionBuilderExample") _sqlContext = SQLServerEnv.newSQLContext(conf) } protected override def afterAll(): Unit = { try { super.afterAll() } finally { try { if (_sqlContext != null) { _sqlContext.sparkContext.stop() _sqlContext = null } } finally { SparkSession.clearActiveSession() SparkSession.clearDefaultSession() } } } test("user-defined optimizer rules") { val rules = Seq("org.apache.spark.catalyst.EmptyRule1", "org.apache.spark.catalyst.EmptyRule2") val optimizerRuleNames = _sqlContext.sessionState.optimizer .extendedOperatorOptimizationRules.map(_.ruleName) rules.foreach { expectedRuleName => assert(optimizerRuleNames.contains(expectedRuleName)) } } }
Example 172
Source File: SQLServerEnv.scala From spark-sql-server with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.server import scala.util.control.NonFatal import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.sql.{SparkSession, SparkSessionExtensions, SQLContext} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.server.ui.SQLServerTab import org.apache.spark.util.Utils object SQLServerEnv extends Logging { // For test use private var _sqlContext: Option[SQLContext] = None @DeveloperApi def withSQLContext(sqlContext: SQLContext): Unit = { require(sqlContext != null) _sqlContext = Option(sqlContext) sqlServListener uiTab } private def mergeSparkConf(sqlConf: SQLConf, sparkConf: SparkConf): Unit = { sparkConf.getAll.foreach { case (k, v) => sqlConf.setConfString(k, v) } } lazy val sparkConf: SparkConf = _sqlContext.map(_.sparkContext.conf).getOrElse { val sparkConf = new SparkConf(loadDefaults = true) // If user doesn't specify the appName, we want to get [SparkSQL::localHostName] // instead of the default appName [SQLServer]. val maybeAppName = sparkConf .getOption("spark.app.name") .filterNot(_ == classOf[SQLServer].getName) sparkConf .setAppName(maybeAppName.getOrElse(s"SparkSQL::${Utils.localHostName()}")) .set("spark.sql.crossJoin.enabled", "true") } lazy val sqlConf: SQLConf = _sqlContext.map(_.conf).getOrElse { val newSqlConf = new SQLConf() mergeSparkConf(newSqlConf, sparkConf) newSqlConf } lazy val sqlContext: SQLContext = _sqlContext.getOrElse(newSQLContext(sparkConf)) lazy val sparkContext: SparkContext = sqlContext.sparkContext lazy val sqlServListener: Option[SQLServerListener] = Some(newSQLServerListener(sqlContext)) lazy val uiTab: Option[SQLServerTab] = newUiTab(sqlContext, sqlServListener.get) private[sql] def newSQLContext(conf: SparkConf): SQLContext = { def buildSQLContext(f: SparkSessionExtensions => Unit = _ => {}): SQLContext = { SparkSession.builder.config(conf).withExtensions(f).enableHiveSupport() .getOrCreate().sqlContext } val builderClassName = conf.get("spark.sql.server.extensions.builder", "") if (builderClassName.nonEmpty) { // Tries to install user-defined extensions try { val objName = builderClassName + (if (!builderClassName.endsWith("$")) "$" else "") val clazz = Utils.classForName(objName) val builder = clazz.getDeclaredField("MODULE$").get(null) .asInstanceOf[SparkSessionExtensions => Unit] val sqlContext = buildSQLContext(builder) logInfo(s"Successfully installed extensions from $builderClassName") sqlContext } catch { case NonFatal(e) => logWarning(s"Failed to install extensions from $builderClassName: " + e.getMessage) buildSQLContext() } } else { buildSQLContext() } } def newSQLServerListener(sqlContext: SQLContext): SQLServerListener = { val listener = new SQLServerListener(sqlContext.conf) sqlContext.sparkContext.addSparkListener(listener) listener } def newUiTab(sqlContext: SQLContext, listener: SQLServerListener): Option[SQLServerTab] = { sqlContext.sparkContext.conf.getBoolean("spark.ui.enabled", true) match { case true => Some(SQLServerTab(SQLServerEnv.sqlContext.sparkContext, listener)) case _ => None } } }
Example 173
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0 | 5 votes |
package example import com.typesafe.config.ConfigFactory import io.confluent.kafka.serializers.KafkaAvroDecoder import kafka.serializer.StringDecoder import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkContext, SparkConf} object StreamingJob extends App { // Get job configuration val config = ConfigFactory.load() Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel"))) private val logger = Logger.getLogger(getClass) // Spark config and contexts val sparkMaster = config.getString("spark.master") val sparkConf = new SparkConf() .setMaster(sparkMaster) .setAppName("StreamingExample") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") val sc = new SparkContext(sparkConf) val batchInterval = config.getInt("spark.batch.interval") val ssc = new StreamingContext(sc, Seconds(batchInterval)) // Create Kafka stream val groupId = config.getString("kafka.group.id") val topic = config.getString("topic") val kafkaParams = Map( "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"), "schema.registry.url" -> config.getString("kafka.schema.registry.url"), "group.id" -> groupId ) @transient val kafkaStream: DStream[(String, Object)] = KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder]( ssc, kafkaParams, Set(topic) ) // Load JSON strings into DataFrame kafkaStream.foreachRDD { rdd => // Get the singleton instance of SQLContext val sqlContext = SQLContext.getOrCreate(rdd.sparkContext) import sqlContext.implicits._ val topicValueStrings = rdd.map(_._2.toString) val df = sqlContext.read.json(topicValueStrings) df.printSchema() println("DataFrame count: " + df.count()) df.take(1).foreach(println) } ssc.start() ssc.awaitTermination() }
Example 174
Source File: DefaultSource.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.sql import org.apache.spark.sql.{ DataFrame, SQLContext, SaveMode } import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider } import org.apache.spark.sql.types.StructType import com.actian.spark_vector.util.Logging import com.actian.spark_vector.vector.VectorJDBC class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging { override def shortName(): String = "vector" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = VectorRelation(TableRef(parameters), sqlContext, parameters) override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters) override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val tableRef = TableRef(parameters) val table = VectorRelation(tableRef, sqlContext, parameters) mode match { case SaveMode.Overwrite => table.insert(data, true) case SaveMode.ErrorIfExists => val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) } if (isEmpty) { table.insert(data, false) } else { throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.") } case SaveMode.Append => table.insert(data, false) case SaveMode.Ignore => val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) } if (isEmpty) { table.insert(data, false) } } table } }
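A write-path sketch. The option keys below are borrowed from the ones VectorTempTable assembles in Example 147 (host, instance, database, port, table, user, password) and are an assumption for this writer; every value is a placeholder and `df` is an existing DataFrame.

import org.apache.spark.sql.SaveMode

// Sketch only: per the createRelation logic above, ErrorIfExists and Ignore
// only write when the target table is empty.
df.write
  .format("vector")                  // or the com.actian.spark_vector.sql package name
  .option("host", "vector-host")     // placeholder
  .option("instance", "VW")          // placeholder
  .option("database", "testdb")      // placeholder
  .option("port", "27832")           // placeholder
  .option("table", "my_table")       // placeholder
  .option("user", "actian")          // optional, placeholder
  .option("password", "secret")      // optional, placeholder
  .mode(SaveMode.Append)
  .save()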
Example 175
Source File: CSVRead.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.loader.command import org.apache.spark.sql.SQLContext import com.actian.spark_vector.loader.options.UserOptions import com.actian.spark_vector.sql.sparkQuote import com.actian.spark_vector.util.Logging object CSVRead extends Logging { private def parseOptions(options: UserOptions): String = { Seq( Some(s"""sep "${options.csv.separatorChar.getOrElse(",")}""""), options.csv.headerRow.filter(identity).map(_ => """header "true""""), options.csv.inferSchema.filter(identity).map(_ => s"""inferSchema "true""""), options.csv.encoding.map(v => s"""encoding "${v}""""), options.csv.quoteChar.map(c => if (c != '\'') s"quote '$c'" else s"""quote "$c""""), options.csv.escapeChar.map(c => if (c != '\'') s"""escape '$c'""" else s"""escape "$c""""), options.csv.commentChar.map(c => if (c != '\'') s"comment '$c'" else s"""comment "$c""""), options.csv.ignoreLeading.filter(identity).map(_ => """ignoreLeadingWhiteSpace "true""""), options.csv.ignoreTrailing.filter(identity).map(_ => """ignoreTrailingWhiteSpace "true""""), options.csv.nullValue.map(v => s"""nullValue "${v}""""), options.csv.nanValue.map(v => s"""nanValue "${v}""""), options.csv.positiveInf.map(v => s"""positiveInf "${v}""""), options.csv.negativeInf.map(v => s"""negativeInf "${v}""""), options.csv.dateFormat.map(v => s"""dateFormat "${v}""""), options.csv.timestampFormat.map(v => s"""timestampFormat "${v}""""), options.csv.parseMode.map(_.toUpperCase()).collect { case "PERMISSIVE" => """mode "PERMISSIVE" """ case "DROPMALFORMED" => """mode "DROPMALFORMED"""" case "FAILFAST" => """mode "FAILFAST"""" } ).flatten.mkString(",", ",", "") } def registerTempTable(options: UserOptions, sqlContext: SQLContext): String = { val table = s"csv_${options.vector.targetTable}_${System.currentTimeMillis}" val quotedTable = sparkQuote(table) val baseQuery = s"""CREATE TEMPORARY VIEW $quotedTable${options.csv.header.map(_.mkString("(", ",", ")")).getOrElse("")} USING csv OPTIONS (path "${options.general.sourceFile}"${parseOptions(options)})""" logDebug(s"CSV query to be executed for registering temporary table:\n$baseQuery") val df = sqlContext.sql(baseQuery) val cols = options.general.colsToLoad.getOrElse(sqlContext.sql(s"select * from $quotedTable where 1=0").columns.toSeq) s"select ${cols.map(c => s"""`${c.trim}`""").mkString(",")} from $quotedTable" } }
Example 176
Source File: JSONRead.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.loader.command import org.apache.spark.sql.SQLContext import com.actian.spark_vector.loader.options.UserOptions import com.actian.spark_vector.sql.sparkQuote import com.actian.spark_vector.util.Logging import com.actian.spark_vector.sql.colsSelectStatement object JSONRead extends Logging { private def parseOptions(options: UserOptions): String = { Seq( options.json.primitivesAsString.filter(identity).map(_ => """primitivesAsString "true""""), options.json.allowComments.filter(identity).map(_ => """allowComments "true""""), options.json.allowUnquoted.filter(identity).map(_ => """allowUnquotedFieldNames "true""""), options.json.allowSingleQuotes.filter(identity).map(_ => """allowSingleQuotes "true""""), options.json.allowLeadingZeros.filter(identity).map(_ => """allowNumericLeadingZeros "true""""), options.json.allowEscapingAny.filter(identity).map(_ => """allowBackslashEscapingAnyCharacter "true""""), options.json.allowUnquotedControlChars.filter(identity).map(_ => """allowUnquotedControlChars "true""""), options.json.multiline.filter(identity).map(_ => """multiline "true""""), options.json.parseMode.map(_.toUpperCase()).collect { case "PERMISSIVE" => """mode "PERMISSIVE" """ case "DROPMALFORMED" => """mode "DROPMALFORMED"""" case "FAILFAST" => """mode "FAILFAST"""" } ).flatten.mkString(",", ",", "") } def registerTempTable(options: UserOptions, sqlContext: SQLContext): String = { val table = s"json_${options.vector.targetTable}_${System.currentTimeMillis}" val quotedTable = sparkQuote(table) val baseQuery = s"""CREATE TEMPORARY VIEW $quotedTable${options.json.header.map(_.mkString("(", ",", ")")).getOrElse("")} USING json OPTIONS (path "${options.general.sourceFile}"${parseOptions(options)})""" logDebug(s"JSON query to be executed for registering temporary table:\n$baseQuery") val df = sqlContext.sql(baseQuery) val cols = options.general.colsToLoad.getOrElse(sqlContext.sql(s"select * from $quotedTable where 1=0").columns.toSeq) s"select ${cols.map(c => s"""`${c.trim}`""").mkString(",")} from $quotedTable" } }
Example 177
Source File: StringIndexerDemo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter11.SparkMachineLearning import org.apache.spark.sql.SparkSession import org.apache.spark.ml.feature.{ OneHotEncoder, StringIndexer } import org.apache.spark.sql.types._ import org.apache.spark.sql._ import org.apache.spark.sql.functions.year import org.apache.spark.ml.{ Pipeline, PipelineStage } import org.apache.spark.ml.classification.{ LogisticRegression, LogisticRegressionModel } import org.apache.spark.ml.feature.StringIndexer import org.apache.spark.sql.{ DataFrame, SparkSession } import scala.collection.mutable import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.sql._ import org.apache.spark.sql.SQLContext object StringIndexerDemo { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() val df = spark.createDataFrame( Seq((0, "Jason", "Germany"), (1, "David", "France"), (2, "Martin", "Spain"), (3, "Jason", "USA"), (4, "Daiel", "UK"), (5, "Moahmed", "Bangladesh"), (6, "David", "Ireland"), (7, "Jason", "Netherlands"))).toDF("id", "name", "address") df.show(false) val indexer = new StringIndexer() .setInputCol("name") .setOutputCol("label") .fit(df) val indexed = indexer.transform(df) indexed.show(false) spark.stop() } }
Example 178
Source File: PlyOutputWriter.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.ply import org.apache.spark.sql.types._ import org.apache.hadoop.mapreduce.{ TaskAttemptID, RecordWriter, TaskAttemptContext, JobContext } import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter import java.io.DataOutputStream import org.apache.spark.sql.sources.OutputWriter import org.apache.hadoop.io.{ NullWritable, BytesWritable } import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat import org.apache.hadoop.fs.Path import java.text.NumberFormat import org.apache.spark.sql.{ Row, SQLContext, sources } import fr.ign.spark.iqmulus.RowOutputStream class PlyOutputWriter( name: String, context: TaskAttemptContext, dataSchema: StructType, element: String, littleEndian: Boolean ) extends OutputWriter { private val file = { val path = getDefaultWorkFile(s".ply.$element") val fs = path.getFileSystem(context.getConfiguration) fs.create(path) } private var count = 0L // strip out ids private val schema = StructType(dataSchema.filterNot { Seq("fid", "pid") contains _.name }) private val recordWriter = new RowOutputStream(new DataOutputStream(file), littleEndian, schema, dataSchema) def getDefaultWorkFile(extension: String): Path = { val uniqueWriteJobId = context.getConfiguration.get("spark.sql.sources.writeJobUUID") val taskAttemptId: TaskAttemptID = context.getTaskAttemptID val split = taskAttemptId.getTaskID.getId new Path(name, f"$split%05d-$uniqueWriteJobId$extension") } override def write(row: Row): Unit = { recordWriter.write(row) count += 1 } override def close(): Unit = { recordWriter.close // write header val path = getDefaultWorkFile(".ply.header") val fs = path.getFileSystem(context.getConfiguration) val dos = new java.io.DataOutputStream(fs.create(path)) val header = new PlyHeader(path.toString, littleEndian, Map(element -> ((count, schema)))) header.write(dos) dos.close } }
Example 179
Source File: PerTestSparkSession.scala From Spark-RSVD with Apache License 2.0 | 5 votes |
package com.criteo.rsvd

import java.io.File
import java.nio.file.{Files, Path}
import java.util.concurrent.locks.ReentrantLock

import org.apache.commons.io.FileUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.scalatest.{BeforeAndAfterEach, Suite}

import scala.reflect.ClassTag
import scala.util.control.NonFatal

object LocalSparkSession {
  private[this] val lock = new ReentrantLock()

  def acquire(): Unit = lock.lock()

  def release(): Unit = lock.unlock()

  def builder: SparkSession.Builder = {
    SparkSession
      .builder()
      .master("local[*]")
      .appName("test")
      .config("spark.ui.enabled", false)
  }
}

// Enclosing test fixture, reconstructed: the original excerpt omits the trait declaration and
// the mutable session/checkpoint state, so those pieces are assumptions based on the file name
// and on the members used below.
trait PerTestSparkSession extends BeforeAndAfterEach { self: Suite =>

  private var currentSession: Option[SparkSession] = None
  private var checkpointDir: Option[Path] = None

  def sparkSession: SparkSession = getOrCreateSession

  def sc: SparkContext = sparkSession.sparkContext

  def sqlContext: SQLContext = sparkSession.sqlContext

  def sparkConf: Map[String, Any] = Map()

  def toRDD[T: ClassTag](input: Seq[T]): RDD[T] = sc.parallelize(input)

  def toArray[T](input: RDD[T]): Array[T] = input.collect()

  protected def closeSession() = {
    currentSession.foreach(_.stop())
    currentSession = None
    try {
      checkpointDir.foreach(path =>
        FileUtils.deleteDirectory(new File(path.toString)))
    } catch {
      case NonFatal(_) =>
    }
    checkpointDir = None
    LocalSparkSession.release()
  }

  private def getOrCreateSession = synchronized {
    if (currentSession.isEmpty) {
      val builder = LocalSparkSession.builder
      for ((key, value) <- sparkConf) {
        builder.config(key, value.toString)
      }
      currentSession = Some(builder.getOrCreate())
      checkpointDir = Some(Files.createTempDirectory("spark-unit-test-checkpoint-"))
      currentSession.get.sparkContext
        .setCheckpointDir(checkpointDir.get.toString)
      currentSession.get.sparkContext.setLogLevel("WARN")
    }
    currentSession.get
  }

  override def beforeEach(): Unit = {
    LocalSparkSession.acquire()
    super.beforeEach()
  }

  override def afterEach(): Unit = {
    try {
      super.afterEach()
    } finally {
      closeSession()
    }
  }
}
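A typical way to use this fixture, assuming the trait is named PerTestSparkSession as the file name suggests, is to mix it into a ScalaTest suite and rely on the toRDD/toArray helpers:

import org.scalatest.FunSuite

class RddRoundTripSpec extends FunSuite with PerTestSparkSession {
  test("parallelize and collect") {
    val rdd = toRDD(Seq(1, 2, 3))   // helper provided by the fixture
    assert(toArray(rdd).sorted.sameElements(Array(1, 2, 3)))
  }
}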
Example 180
Source File: Source.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.cli import org.apache.spark.sql.{DataFrame, SQLContext} import scala.util.{Failure, Success, Try} sealed trait Source { def asDataFrame(implicit sqlContext: SQLContext): DataFrame } object Source { private val hive = "hive://(.*)".r private val parquet = "parquet://(.*)".r def validate(source: String): Either[String, Unit] = { Try(apply(source)) match { case Success(s) => Right(()) case Failure(err) => Left(s"Unsupported source type: $source") } } def apply(source: String): Source = source match { case hive(table) => HiveSource(table) case parquet(path) => ParquetSource(path) } } object NoSource extends Source { def asDataFrame(implicit sqlContext: SQLContext): DataFrame = { sys.error(s"Source is not defined") } override def toString: String = "Source is not defined" } case class HiveSource( tableName: String ) extends Source { def asDataFrame(implicit sqlContext: SQLContext): DataFrame = { sqlContext.sql(s"SELECT * FROM $tableName") } override def toString: String = { s"Hive table: $tableName" } } case class ParquetSource( path: String ) extends Source { def asDataFrame(implicit sqlContext: SQLContext): DataFrame = { sqlContext.parquetFile(path) } override def toString: String = { s"Parquet: $path" } }
Example 181
Source File: Sink.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.cli import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import scala.util.{Failure, Success, Try} sealed trait Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit } object Sink { private val hive = "hive://(.*)".r private val parquet = "parquet://(.*)".r def validate(sink: String): Either[String, Unit] = { Try(apply(sink)) match { case Success(s) => Right(()) case Failure(err) => Left(s"Unsupported sink type: $sink") } } def apply(sink: String): Sink = sink match { case hive(table) => HiveSink(table) case parquet(path) => ParquetSink(path) } } object NoSink extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { sys.error(s"Sink is not defined") } override def toString: String = "Sink is not defined" } case class HiveSink( tableName: String ) extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { df.saveAsTable(tableName, SaveMode.Overwrite) } override def toString: String = s"Hive table: $tableName" } case class ParquetSink( path: String ) extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { df.saveAsParquetFile(path) } override def toString: String = s"Parquet: $path" }
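Source (Example 180) and Sink mirror each other and are driven purely by scheme-prefixed strings, with the SQLContext passed implicitly. A minimal round-trip sketch with hypothetical paths; note the project targets the older Spark 1.x DataFrame API used above:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.collective.modelmatrix.cli.{Sink, Source}

object SourceSinkSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("source-sink-sketch").setMaster("local[*]"))
    implicit val sqlContext: SQLContext = new SQLContext(sc)

    // Both factories pattern-match on the scheme prefix and fail fast otherwise.
    val source = Source("parquet:///tmp/model-matrix/input.parquet")
    val sink = Sink("parquet:///tmp/model-matrix/output.parquet")

    val df = source.asDataFrame
    sink.saveDataFrame(df)
    sc.stop()
  }
}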
Example 182
Source File: Transformers.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.transform import com.collective.modelmatrix.ModelFeature import org.apache.spark.sql.{SQLContext, DataFrame} import scalaz._ trait Transformers { protected class Transformers(input: DataFrame @@ Transformer.Features)(implicit sqlContext: SQLContext) { val identity = new IdentityTransformer(input) val top = new TopTransformer(input) val index = new IndexTransformer(input) val bins = new BinsTransformer(input) private val unknownFeature: PartialFunction[ModelFeature, FeatureTransformationError \/ TypedModelFeature] = { case feature => sys.error(s"Feature can't be validated by any of transformers: $feature") } def validate(feature: ModelFeature): FeatureTransformationError \/ TypedModelFeature = (identity.validate orElse top.validate orElse index.validate orElse bins.validate orElse unknownFeature )(feature) } }
Example 183
Source File: Main.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.app

import com.esri.core.geometry.Polyline
import com.esri.udt.{PointType, PolylineType}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{Logging, SparkConf, SparkContext}

// Reconstructed driver: the original excerpt starts in the middle of main(), so the object
// declaration, the SparkContext setup and the source of `path` and `name` are assumptions.
object Main extends Logging {

  def main(args: Array[String]): Unit = {
    // Path to the file geodatabase and the table to read; assumed to come from the command line.
    val Array(path, name) = args
    val conf = new SparkConf().setAppName("spark-gdb").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val sqlContext = new SQLContext(sc)
      val df = sqlContext.read.format("com.esri.gdb")
        .option("path", path)
        .option("name", name)
        .option("numPartitions", "1")
        .load()
      df.printSchema()
      df.registerTempTable(name)
      sqlContext.udf.register("getX", (point: PointType) => point.x)
      sqlContext.udf.register("getY", (point: PointType) => point.y)
      sqlContext.udf.register("line", (point: PointType) => PolylineType({
        val polyline = new Polyline()
        polyline.startPath(point.x - 2, point.y - 2)
        polyline.lineTo(point.x + 2, point.y + 2)
        polyline
      }
      ))
      sqlContext.sql(s"select line(Shape),getX(Shape)-2 as x from $name")
        .write
        .mode(SaveMode.Overwrite)
        .format("json")
        .save(s"/tmp/$name.json")
    } finally {
      sc.stop()
    }
  }
}
Example 184
Source File: package.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri import java.nio.ByteBuffer import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext package object gdb { implicit class ByteBufferImplicits(byteBuffer: ByteBuffer) { implicit def getVarUInt() = { var shift = 7 var b: Long = byteBuffer.get var ret = b & 0x7FL var old = ret while ((b & 0x80L) != 0) { b = byteBuffer.get ret = ((b & 0x7FL) << shift) | old old = ret shift += 7 } ret } implicit def getVarInt() = { var shift = 7 var b: Long = byteBuffer.get val isNeg = (b & 0x40L) != 0 var ret = b & 0x3FL var old = ret while ((b & 0x80L) != 0) { b = byteBuffer.get ret = ((b & 0x7FL) << (shift - 1)) | old old = ret shift += 7 } if (isNeg) -ret else ret } } implicit class SparkContextImplicits(sc: SparkContext) { implicit def gdbFile(path: String, name: String, numPartitions: Int = 8) = { GDBRDD(sc, path, name, numPartitions) } } implicit class SQLContextImplicits(sqlContext: SQLContext) extends Serializable { implicit def gdbFile(path: String, name: String, numPartitions: Int = 8) = { sqlContext.baseRelationToDataFrame(GDBRelation(path, name, numPartitions)(sqlContext)) } } }
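Importing the package object brings these implicits into scope, so a file geodatabase table can be read with a single call; a sketch with a hypothetical path and table name:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.esri.gdb._

object GdbImplicitsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("gdb-sketch").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    // SQLContextImplicits adds gdbFile(path, name, numPartitions), returning a DataFrame.
    val df = sqlContext.gdbFile("/data/Test.gdb", "Points", numPartitions = 2)
    df.printSchema()
    sc.stop()
  }
}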
Example 185
Source File: GDBRelation.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SQLContext} case class GDBRelation(gdbPath: String, gdbName: String, numPartition: Int) (@transient val sqlContext: SQLContext) extends BaseRelation with Logging with TableScan { override val schema = inferSchema() private def inferSchema() = { val sc = sqlContext.sparkContext GDBTable.findTable(gdbPath, gdbName, sc.hadoopConfiguration) match { case Some(catTab) => { val table = GDBTable(gdbPath, catTab.hexName, sc.hadoopConfiguration) try { table.schema() } finally { table.close() } } case _ => { log.error(s"Cannot find '$gdbName' in $gdbPath, creating an empty schema !") StructType(Seq.empty[StructField]) } } } override def buildScan(): RDD[Row] = { GDBRDD(sqlContext.sparkContext, gdbPath, gdbName, numPartition) } }
Example 186
Source File: DefaultSource.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

// Class declaration reconstructed: the original excerpt starts at the overridden method.
// RelationProvider requires the two-argument createRelation, which delegates to the
// schema-aware variant below (the user-supplied schema is ignored, as in the excerpt).
class DefaultSource extends RelationProvider with SchemaRelationProvider {

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    createRelation(sqlContext, parameters, null)
  }

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String],
                              schema: StructType
                             ): BaseRelation = {
    val path = parameters.getOrElse("path", sys.error("Parameter 'path' must be defined."))
    val name = parameters.getOrElse("name", sys.error("Parameter 'name' must be defined."))
    val numPartitions = parameters.getOrElse("numPartitions", "8").toInt
    GDBRelation(path, name, numPartitions)(sqlContext)
  }
}
Example 187
Source File: DefaultSource.scala From spark-iqmulus with Apache License 2.0 | 5 votes |
package fr.ign.spark.iqmulus.ply import org.apache.spark.sql.SQLContext import org.apache.spark.sql.sources.{ HadoopFsRelation, HadoopFsRelationProvider } import org.apache.spark.sql.types.StructType class DefaultSource extends HadoopFsRelationProvider { // override def shortName(): String = "ply" override def createRelation( sqlContext: SQLContext, paths: Array[String], dataSchema: Option[StructType], partitionColumns: Option[StructType], parameters: Map[String, String] ): HadoopFsRelation = { new PlyRelation(paths, dataSchema, partitionColumns, parameters)(sqlContext) } }
Example 188
Source File: BankDatawithZepplein.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.example.ZeepleinAndSpark

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.SQLContext

object BankDatawithZepplein {

  case class Bank(age: Integer, job: String, marital: String, education: String, balance: Integer)

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession
      .builder()
      .appName("BankDataWithZeppelin")
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "C:/Exp/")
      .getOrCreate()

    val bankText = spark.sparkContext.textFile("data/bank-full.csv")

    // split each line, filter out the header (starts with "age"), and map it into the Bank case class
    val bank = bankText.map(s => s.split(";"))
      .filter(s => s.size > 5)
      .filter(s => s(0) != "\"age\"")
      .map(s => Bank(s(0).toInt,
        s(1).replaceAll("\"", ""),
        s(2).replaceAll("\"", ""),
        s(3).replaceAll("\"", ""),
        s(5).replaceAll("\"", "").toInt))

    val sqlContext = new SQLContext(spark.sparkContext)
    import sqlContext.implicits._

    // convert to DataFrame and register a temporary view
    val newDF = bank.toDF()
    newDF.show()
    newDF.createOrReplaceTempView("bank")

    spark.sql("select age, count(1) from bank where age <= 50 group by age order by age").show()
    spark.sql("select age, count(1) from bank where age <= 30 group by age order by age").show()
    spark.sql("select max(balance) as MaxBalance from bank where age <= 30 group by age order by MaxBalance DESC").show()
  }
}
Example 189
Source File: TwitterBatchTimely.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.timeseries import java.sql.Timestamp import com.cloudera.sparkts.{DateTimeIndex, TimeSeriesRDD} import io.gzet.timeseries.timely.MetricImplicits._ import io.gzet.timeseries.timely.TimelyImplicits._ import io.gzet.timeseries.twitter.Twitter._ import io.gzet.utils.spark.accumulo.AccumuloConfig import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.joda.time.{DateTime, Minutes, Period} object TwitterBatchTimely extends SimpleConfig { case class Observation( hashtag: String, time: Timestamp, count: Double ) def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("Twitter Extractor") val sc = new SparkContext(sparkConf) val sqlContext = new SQLContext(sc) import sqlContext.implicits._ val twitterJsonRDD = sc.textFile("file:///Users/antoine/CHAPTER/twitter-trump", 500) val tweetRDD = twitterJsonRDD mapPartitions analyzeJson cache() // Publish metrics to Timely tweetRDD.count() tweetRDD.countByState.publish() tweetRDD.sentimentByState.publish() // Read metrics from Timely val conf = AccumuloConfig("GZET", "alice", "alice", "localhost:2181") val metricsRDD = sc.timely(conf, Some("io.gzet.count")) val minDate = metricsRDD.map(_.time).min() val maxDate = metricsRDD.map(_.time).max() class TwitterFrequency(val minutes: Int) extends com.cloudera.sparkts.PeriodFrequency(Period.minutes(minutes)) { def difference(dt1: DateTime, dt2: DateTime): Int = Minutes.minutesBetween(dt1, dt2).getMinutes / minutes override def toString: String = s"minutes $minutes" } val dtIndex = DateTimeIndex.uniform(minDate, maxDate, new TwitterFrequency(1)) val metricsDF = metricsRDD.filter({ metric => metric.tags.keys.toSet.contains("tag") }).flatMap({ metric => metric.tags map { case (k, v) => ((v, roundFloorMinute(metric.time, 1)), metric.value) } }).reduceByKey(_+_).map({ case ((metric, time), sentiment) => Observation(metric, new Timestamp(time), sentiment) }).toDF() val tsRDD = TimeSeriesRDD.timeSeriesRDDFromObservations(dtIndex, metricsDF, "time", "hashtag", "count").filter(_._2.toArray.exists(!_.isNaN)) } def roundFloorMinute(time: Long, windowMinutes: Int) = { val dt = new DateTime(time) dt.withMinuteOfHour((dt.getMinuteOfHour / windowMinutes) * windowMinutes).minuteOfDay().roundFloorCopy().toDate.getTime } }
Example 190
Source File: StackBootstraping.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.tagging.stackoverflow import io.gzet.tagging.classifier.Classifier import io.gzet.tagging.html.HtmlHandler import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.{SparkSession, DataFrame, SQLContext} import scala.collection.mutable import scala.xml.{Elem, XML} object StackBootstraping { def parse(spark: SparkSession, posts: RDD[String]): DataFrame = { import spark.sqlContext.implicits._ posts filter { line => line.contains("row Id") } map { line => val xml = XML.loadString(line) (getBody(xml), getTags(xml)) } filter { case (body, tags) => body.isDefined && tags.isDefined } flatMap { case (body, tags) => tags.get.map(tag => (body.get, tag)) } toDF("body", "tag") } private def getBody(xml: Elem): Option[String] = { val bodyAttr = xml.attribute("Body") if (bodyAttr.isDefined) { val html = bodyAttr.get.head.text val htmlHandler = new HtmlHandler() val content = htmlHandler.parseHtml(html) if (content.isDefined) { return content.get.body } } None: Option[String] } private def getTags(xml: Elem): Option[Array[String]] = { val tagsAttr = xml.attribute("Tags") if (tagsAttr.isDefined) { val tagsText = tagsAttr.get.head.text val tags = tagsText .replaceAll("<", "") .replaceAll(">", ",") .split(",") return Some(tags) } None: Option[Array[String]] } def bootstrapNaiveBayes(df: DataFrame, vectorSize: Option[Int]) = { val labeledText = df.rdd map { row => val body = row.getString(0) val labels = row.getAs[mutable.WrappedArray[String]](1) (body, labels.toArray) } Classifier.train(labeledText) } }
Example 191
Source File: Neo4jJavaIntegration.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark import java.util import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.neo4j.spark.dataframe.Neo4jDataFrame import org.neo4j.spark.rdd.{Neo4jRowRDD, Neo4jTupleRDD} import scala.collection.JavaConverters._ object Neo4jJavaIntegration { def rowRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]) = new Neo4jRowRDD(sc, query, if (parameters == null) Seq.empty else parameters.asScala.toSeq).toJavaRDD() def tupleRDD(sc: SparkContext, query: String, parameters: java.util.Map[String, AnyRef]): JavaRDD[util.Map[String, AnyRef]] = { val params = if (parameters == null) Seq.empty else parameters.asScala.toSeq Neo4jTupleRDD(sc, query, params) .map((t) => new util.LinkedHashMap[String, AnyRef](t.toMap.asJava).asInstanceOf[util.Map[String, AnyRef]]) .toJavaRDD() } def dataFrame(sqlContext: SQLContext, query: String, parameters: java.util.Map[String, AnyRef], schemaInfo: util.Map[String, String]) = { Neo4jDataFrame(sqlContext, query, parameters.asScala.toSeq, schemaInfo.asScala.toSeq: _*) } }
Example 192
Source File: Neo4jGraphFrame.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.dataframe

import org.apache.spark.SparkContext
import org.apache.spark.graphx.Graph
import org.apache.spark.sql.SQLContext
import org.neo4j.spark.Neo4jGraph
import org.neo4j.spark.cypher.CypherHelpers._

object Neo4jGraphFrame {

  def apply(sqlContext: SQLContext, src: (String, String), edge: (String, String), dst: (String, String)) = {
    def nodeStmt(s: (String, String)) =
      s"MATCH (n:${s._1.quote}) RETURN id(n) as id, n.${s._2.quote} as prop"

    val edgeProp = if (edge._2 == null) "" else s", r.${edge._2.quote} as prop"
    val edgeStmt =
      s"MATCH (n:${src._1.quote})-[r:${edge._1.quote}]->(m:${dst._1.quote}) RETURN id(n) as src, id(m) as dst" + edgeProp

    val vertices1 = Neo4jDataFrame(sqlContext, nodeStmt(src), Seq.empty, ("id", "integer"), ("prop", "string"))
    val vertices2 = Neo4jDataFrame(sqlContext, nodeStmt(dst), Seq.empty, ("id", "integer"), ("prop", "string"))
    val schema = Seq(("src", "integer"), ("dst", "integer")) ++ (if (edge._2 != null) Some(("prop", "string")) else None)
    val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, schema: _*)
    org.graphframes.GraphFrame(vertices1.union(vertices2).distinct(), edges)
  }

  def fromGraphX(sc: SparkContext, label1: String, rels: Seq[String], label2: String) = {
    val g: Graph[Any, Int] = Neo4jGraph.loadGraph(sc, label1, rels, label2)
    org.graphframes.GraphFrame.fromGraphX(g)
  }

  def fromEdges(sqlContext: SQLContext, label1: String, rels: Seq[String], label2: String) = {
    val relTypes = rels.map(_.quote).mkString("|")
    val edgeStmt = s"MATCH (n:${label1.quote})-[r:$relTypes]->(m:${label2.quote}) RETURN id(n) as src, id(m) as dst"
    val edges = Neo4jDataFrame(sqlContext, edgeStmt, Seq.empty, ("src", "integer"), ("dst", "integer"))
    org.graphframes.GraphFrame.fromEdges(edges)
  }
}
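A minimal sketch of building a GraphFrame this way; the labels, relationship type and the spark.neo4j.bolt.url setting below are assumptions for illustration, and a reachable Neo4j instance is required:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.neo4j.spark.dataframe.Neo4jGraphFrame

object Neo4jGraphFrameSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("neo4j-graphframe-sketch").setMaster("local[*]")
      .set("spark.neo4j.bolt.url", "bolt://localhost:7687")   // assumed connector setting
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // (label, property) for source and destination nodes, (relationship type, optional property) for edges.
    val gf = Neo4jGraphFrame(sqlContext, ("Person", "name"), ("KNOWS", null), ("Person", "name"))
    gf.vertices.show()
    sc.stop()
  }
}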
Example 193
Source File: HiveAcidRelation.scala From spark-acid with Apache License 2.0 | 5 votes |
package com.qubole.spark.hiveacid.datasource import org.apache.spark.internal.Logging import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Column, DataFrame, Row, SQLContext, SparkSession} import org.apache.spark.sql.sources.{BaseRelation, Filter, InsertableRelation, PrunedFilteredScan} import org.apache.spark.sql.types._ import com.qubole.spark.hiveacid.{HiveAcidErrors, HiveAcidTable, SparkAcidConf} import com.qubole.spark.hiveacid.hive.HiveAcidMetadata import com.qubole.spark.hiveacid.merge.{MergeWhenClause, MergeWhenNotInsert} import org.apache.spark.sql.catalyst.AliasIdentifier import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import collection.JavaConversions._ case class HiveAcidRelation(sparkSession: SparkSession, fullyQualifiedTableName: String, parameters: Map[String, String]) extends BaseRelation with InsertableRelation with PrunedFilteredScan with Logging { private val hiveAcidMetadata: HiveAcidMetadata = HiveAcidMetadata.fromSparkSession( sparkSession, fullyQualifiedTableName ) private val hiveAcidTable: HiveAcidTable = new HiveAcidTable(sparkSession, hiveAcidMetadata, parameters) private val readOptions = SparkAcidConf(sparkSession, parameters) override def sqlContext: SQLContext = sparkSession.sqlContext override val schema: StructType = if (readOptions.includeRowIds) { hiveAcidMetadata.tableSchemaWithRowId } else { hiveAcidMetadata.tableSchema } override def insert(data: DataFrame, overwrite: Boolean): Unit = { // sql insert into and overwrite if (overwrite) { hiveAcidTable.insertOverwrite(data) } else { hiveAcidTable.insertInto(data) } } def update(condition: Option[Column], newValues: Map[String, Column]): Unit = { hiveAcidTable.update(condition, newValues) } def delete(condition: Column): Unit = { hiveAcidTable.delete(condition) } override def sizeInBytes: Long = { val compressionFactor = sparkSession.sessionState.conf.fileCompressionFactor (sparkSession.sessionState.conf.defaultSizeInBytes * compressionFactor).toLong } def merge(sourceDf: DataFrame, mergeExpression: Expression, matchedClause: Seq[MergeWhenClause], notMatched: Option[MergeWhenNotInsert], sourceAlias: Option[AliasIdentifier], targetAlias: Option[AliasIdentifier]): Unit = { hiveAcidTable.merge(sourceDf, mergeExpression, matchedClause, notMatched, sourceAlias, targetAlias) } def getHiveAcidTable(): HiveAcidTable = { hiveAcidTable } // FIXME: should it be true / false. Recommendation seems to // be to leave it as true override val needConversion: Boolean = false override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val readOptions = SparkAcidConf(sparkSession, parameters) // sql "select *" hiveAcidTable.getRdd(requiredColumns, filters, readOptions) } }
Example 194
Source File: MLlibTestSparkContext.scala From spark-ranking-algorithms with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import org.scalatest.{BeforeAndAfterAll, Suite} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} trait MLlibTestSparkContext extends BeforeAndAfterAll { self: Suite => @transient var sc: SparkContext = _ @transient var sqlContext: SQLContext = _ override def beforeAll() { super.beforeAll() val conf = new SparkConf() .setMaster("local[2]") .setAppName("MLlibUnitTest") sc = new SparkContext(conf) SQLContext.clearActive() sqlContext = new SQLContext(sc) SQLContext.setActive(sqlContext) } override def afterAll() { try { sqlContext = null SQLContext.clearActive() if (sc != null) { sc.stop() } sc = null } finally { super.afterAll() } } }
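A test suite mixes the trait in and uses the shared sc and sqlContext directly; a minimal sketch:

import org.scalatest.FunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class SQLContextSmokeTest extends FunSuite with MLlibTestSparkContext {
  test("sqlContext is active and usable") {
    val df = sqlContext.createDataFrame(Seq((1, "a"), (2, "b"))).toDF("id", "value")
    assert(df.count() === 2)
  }
}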
Example 195
Source File: PulsarRelation.scala From pulsar-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.pulsar

import java.{util => ju}

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.catalyst.json.JSONOptionsInRead
import org.apache.spark.sql.sources.{BaseRelation, TableScan}
import org.apache.spark.sql.types.StructType

private[pulsar] class PulsarRelation(
    override val sqlContext: SQLContext,
    override val schema: StructType,
    schemaInfo: SchemaInfoSerializable,
    adminUrl: String,
    clientConf: ju.Map[String, Object],
    readerConf: ju.Map[String, Object],
    startingOffset: SpecificPulsarOffset,
    endingOffset: SpecificPulsarOffset,
    pollTimeoutMs: Int,
    failOnDataLoss: Boolean,
    subscriptionNamePrefix: String,
    jsonOptions: JSONOptionsInRead)
    extends BaseRelation
    with TableScan
    with Logging {

  import PulsarSourceUtils._

  val reportDataLoss = reportDataLossFunc(failOnDataLoss)

  override def buildScan(): RDD[Row] = {
    val fromTopicOffsets = startingOffset.topicOffsets
    val endTopicOffsets = endingOffset.topicOffsets

    if (fromTopicOffsets.keySet != endTopicOffsets.keySet) {
      val fromTopics = fromTopicOffsets.keySet.toList.sorted.mkString(",")
      val endTopics = endTopicOffsets.keySet.toList.sorted.mkString(",")
      throw new IllegalStateException(
        "different topics " +
          s"for starting offsets topics[${fromTopics}] and " +
          s"ending offsets topics[${endTopics}]")
    }

    val offsetRanges = endTopicOffsets.keySet
      .map { tp =>
        val fromOffset = fromTopicOffsets.getOrElse(tp, {
          // this shouldn't happen since we checked it above
          throw new IllegalStateException(s"$tp doesn't have a from offset")
        })
        val untilOffset = endTopicOffsets(tp)
        PulsarOffsetRange(tp, fromOffset, untilOffset, None)
      }
      .filter { range =>
        if (range.untilOffset.compareTo(range.fromOffset) < 0) {
          reportDataLoss(
            s"${range.topic}'s offset was changed " +
              s"from ${range.fromOffset} to ${range.untilOffset}, " +
              "some data may have been missed")
          false
        } else {
          true
        }
      }
      .toSeq

    val rdd = new PulsarSourceRDD4Batch(
      sqlContext.sparkContext,
      schemaInfo,
      adminUrl,
      clientConf,
      readerConf,
      offsetRanges,
      pollTimeoutMs,
      failOnDataLoss,
      subscriptionNamePrefix,
      jsonOptions
    )
    sqlContext.internalCreateDataFrame(rdd.setName("pulsar"), schema).rdd
  }
}
Example 196
Source File: InsertMysqlDemo.scala From spark_mysql with Apache License 2.0 | 5 votes |
import java.sql.{Date, Timestamp}

import InsertMysqlDemo.CardMember
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}
import utils.MySQLUtils

/**
  * Created with IntelliJ IDEA.
  * Author: [email protected]
  * Description: saving the contents of a DataFrame into MySQL
  * Date: Created in 2018-11-17 12:39
  */
object InsertMysqlDemo {

  case class CardMember(m_id: String, card_type: String, expire: Timestamp, duration: Int, is_sale: Boolean, date: Date, user: Long, salary: Float)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(getClass.getSimpleName).set("spark.testing.memory", "3147480000")
    val sparkContext = new SparkContext(conf)
    val sqlContext = new SQLContext(sparkContext)
    import sqlContext.implicits._
    val memberSeq = Seq(
      CardMember("member_2", "月卡", new Timestamp(System.currentTimeMillis()), 31, false, new Date(System.currentTimeMillis()), 123223, 0.32f),
      CardMember("member_1", "季卡", new Timestamp(System.currentTimeMillis()), 93, false, new Date(System.currentTimeMillis()), 124224, 0.362f)
    )
    val memberDF = memberSeq.toDF()
    MySQLUtils.saveDFtoDBCreateTableIfNotExist("member_test", memberDF)
    MySQLUtils.insertOrUpdateDFtoDBUsePool("member_test", memberDF, Array("user", "salary"))
    MySQLUtils.getDFFromMysql(sqlContext, "", null)
    sparkContext.stop()
  }
}
Example 197
Source File: HBaseTestSource.scala From hbase-connectors with Apache License 2.0 | 5 votes |
package org.apache.hadoop.hbase.spark import org.apache.hadoop.hbase.spark.datasources.HBaseSparkConf import org.apache.spark.SparkEnv import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types._ class HBaseTestSource extends RelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { DummyScan( parameters("cacheSize").toInt, parameters("batchNum").toInt, parameters("blockCacheingEnable").toBoolean, parameters("rowNum").toInt)(sqlContext) } } case class DummyScan( cacheSize: Int, batchNum: Int, blockCachingEnable: Boolean, rowNum: Int)(@transient val sqlContext: SQLContext) extends BaseRelation with TableScan { private def sparkConf = SparkEnv.get.conf override def schema: StructType = StructType(StructField("i", IntegerType, nullable = false) :: Nil) override def buildScan(): RDD[Row] = sqlContext.sparkContext.parallelize(0 until rowNum) .map(Row(_)) .map{ x => if (sparkConf.getInt(HBaseSparkConf.QUERY_BATCHSIZE, -1) != batchNum || sparkConf.getInt(HBaseSparkConf.QUERY_CACHEDROWS, -1) != cacheSize || sparkConf.getBoolean(HBaseSparkConf.QUERY_CACHEBLOCKS, false) != blockCachingEnable) { throw new Exception("HBase Spark configuration cannot be set properly") } x } }
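Because the provider reads its four options straight from the parameters map, it can be addressed by its fully qualified class name; a sketch with arbitrary option values. Note that buildScan deliberately throws unless the matching HBaseSparkConf settings were also propagated to the SparkConf, which is what this test source verifies, so the sketch only inspects the schema:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext

object HBaseTestSourceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("hbase-test-source").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)

    val df = sqlContext.read
      .format("org.apache.hadoop.hbase.spark.HBaseTestSource")
      .option("cacheSize", "100")
      .option("batchNum", "10")
      .option("blockCacheingEnable", "true")   // key spelled exactly as in the source above
      .option("rowNum", "50")
      .load()
    df.printSchema()
    sc.stop()
  }
}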
Example 198
Source File: KuduSink.scala From kafka-examples with Apache License 2.0 | 5 votes |
package com.cloudera.streaming.refapp.kudu import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.{DataFrame, SQLContext} import org.slf4j.LoggerFactory import scala.util.control.NonFatal object KuduSink { def withDefaultContext(sqlContext: SQLContext, parameters: Map[String, String]) = new KuduSink(new KuduContext(parameters("kudu.master"), sqlContext.sparkContext), parameters) } class KuduSink(initKuduContext: => KuduContext, parameters: Map[String, String]) extends Sink { private val logger = LoggerFactory.getLogger(getClass) private var kuduContext = initKuduContext private val tablename = parameters("kudu.table") private val retries = parameters.getOrElse("retries", "1").toInt require(retries >= 0, "retries must be non-negative") logger.info(s"Created Kudu sink writing to table $tablename") override def addBatch(batchId: Long, data: DataFrame): Unit = { for (attempt <- 0 to retries) { try { kuduContext.upsertRows(data, tablename) return } catch { case NonFatal(e) => if (attempt < retries) { logger.warn("Kudu upsert error, retrying...", e) kuduContext = initKuduContext } else { logger.error("Kudu upsert error, exhausted", e) throw e } } } } }
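Outside of a streaming query the sink can be exercised directly through its factory and addBatch; a sketch in which the Kudu master address and the (pre-existing) table name are hypothetical:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.cloudera.streaming.refapp.kudu.KuduSink

object KuduSinkSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("kudu-sink-sketch").setMaster("local[*]"))
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._

    val sink = KuduSink.withDefaultContext(sqlContext, Map(
      "kudu.master" -> "kudu-master:7051",
      "kudu.table" -> "impala::default.events",
      "retries" -> "2"))

    val batch = Seq((1, "open"), (2, "close")).toDF("id", "state")
    sink.addBatch(batchId = 0L, data = batch)   // upserts the rows, retrying up to twice on failure
    sc.stop()
  }
}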
Example 199
Source File: SparkNRedshiftUtil.scala From SqlShift with MIT License | 5 votes |
package com.goibibo.sqlshift

import java.sql.{Connection, DriverManager}
import java.util.Properties

import com.databricks.spark.redshift.RedshiftReaderM
import com.typesafe.config.Config
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
import org.scalatest.{BeforeAndAfterAll, Suite}
import org.slf4j.{Logger, LoggerFactory}

trait SparkNRedshiftUtil extends BeforeAndAfterAll {
  self: Suite =>
  private val logger: Logger = LoggerFactory.getLogger(this.getClass)

  @transient private var _sc: SparkContext = _
  @transient private var _sqlContext: SQLContext = _

  def sc: SparkContext = _sc

  def sqlContext: SQLContext = _sqlContext

  private def getRedshiftConnection(config: Config): Connection = {
    val redshift = config.getConfig("redshift")
    val connectionProps = new Properties()
    connectionProps.put("user", redshift.getString("username"))
    connectionProps.put("password", redshift.getString("password"))
    val jdbcUrl = s"jdbc:redshift://${redshift.getString("hostname")}:${redshift.getInt("portno")}/${redshift.getString("database")}?useSSL=false"
    Class.forName("com.amazon.redshift.jdbc4.Driver")
    DriverManager.getConnection(jdbcUrl, connectionProps)
  }

  val getSparkContext: (SparkContext, SQLContext) = {
    val sparkConf: SparkConf = new SparkConf().setAppName("Full Dump Testing").setMaster("local")
    val sc: SparkContext = new SparkContext(sparkConf)
    val sqlContext: SQLContext = new SQLContext(sc)

    System.setProperty("com.amazonaws.services.s3.enableV4", "true")
    sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")
    sc.hadoopConfiguration.set("fs.s3a.fast.upload", "true")
    (sc, sqlContext)
  }

  def readTableFromRedshift(config: Config, tableName: String): DataFrame = {
    val redshift: Config = config.getConfig("redshift")
    val options = Map("dbtable" -> tableName,
      "user" -> redshift.getString("username"),
      "password" -> redshift.getString("password"),
      "url" -> s"jdbc:redshift://${redshift.getString("hostname")}:${redshift.getInt("portno")}/${redshift.getString("database")}",
      "tempdir" -> config.getString("s3.location"),
      "aws_iam_role" -> config.getString("redshift.iamRole")
    )
    RedshiftReaderM.getDataFrameForConfig(options, sc, sqlContext)
  }

  def dropTableRedshift(config: Config, tables: String*): Unit = {
    logger.info("Dropping table: {}", tables)
    val conn = getRedshiftConnection(config)
    val statement = conn.createStatement()
    try {
      val dropTableQuery = s"""DROP TABLE ${tables.mkString(",")}"""
      logger.info("Running query: {}", dropTableQuery)
      statement.executeUpdate(dropTableQuery)
    } finally {
      statement.close()
      conn.close()
    }
  }

  override protected def beforeAll(): Unit = {
    super.beforeAll()
    val (sc, sqlContext) = getSparkContext
    _sc = sc
    _sqlContext = sqlContext
  }

  override protected def afterAll(): Unit = {
    super.afterAll()
    _sc.stop()
  }
}
Example 200
Source File: RedshiftReaderM.scala From SqlShift with MIT License | 5 votes |
package com.databricks.spark.redshift import com.amazonaws.auth.AWSCredentials import com.amazonaws.services.s3.AmazonS3Client import org.apache.spark.SparkContext import org.apache.spark.sql.sources.BaseRelation import org.apache.spark.sql.{DataFrame, SQLContext} object RedshiftReaderM { val endpoint = "s3.ap-south-1.amazonaws.com" def getS3Client(provider: AWSCredentials): AmazonS3Client = { val client = new AmazonS3Client(provider) client.setEndpoint(endpoint) client } def getDataFrameForConfig(configs: Map[String, String], sparkContext: SparkContext, sqlContext: SQLContext): DataFrame = { val source: DefaultSource = new DefaultSource(new JDBCWrapper(), getS3Client) val br: BaseRelation = source.createRelation(sqlContext, configs) sqlContext.baseRelationToDataFrame(br) } }
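The options map expected by getDataFrameForConfig mirrors the one assembled in readTableFromRedshift in Example 199; a minimal sketch with placeholder connection details:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import com.databricks.spark.redshift.RedshiftReaderM

object RedshiftReadSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("redshift-read-sketch").setMaster("local"))
    val sqlContext = new SQLContext(sc)
    sc.hadoopConfiguration.set("fs.s3a.endpoint", "s3.ap-south-1.amazonaws.com")

    val options = Map(
      "dbtable" -> "public.my_table",
      "user" -> "redshift_user",
      "password" -> "redshift_password",
      "url" -> "jdbc:redshift://redshift-host:5439/dev",
      "tempdir" -> "s3a://my-bucket/temp/",
      "aws_iam_role" -> "arn:aws:iam::123456789012:role/redshift-role")

    val df = RedshiftReaderM.getDataFrameForConfig(options, sc, sqlContext)
    df.printSchema()
    sc.stop()
  }
}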