org.apache.spark.sql.SaveMode Scala Examples
The following examples show how to use org.apache.spark.sql.SaveMode.
Examples are ordered by community vote count, and the original project and source file are noted above each example.
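Before the project-specific examples, here is a minimal sketch of how SaveMode is typically passed to the DataFrameWriter API. The output path and column names are placeholders, not taken from any of the projects below.

// Minimal SaveMode usage sketch (paths and names are placeholders).
import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveModeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SaveModeSketch").master("local[*]").getOrCreate()
    import spark.implicits._
    val df = Seq((1, "a"), (2, "b")).toDF("id", "value")

    // Overwrite: replace any existing data at the target.
    df.write.mode(SaveMode.Overwrite).parquet("/tmp/savemode-demo")
    // Append: add rows to the existing data.
    df.write.mode(SaveMode.Append).parquet("/tmp/savemode-demo")
    // ErrorIfExists (the default) fails if the target already exists;
    // Ignore silently skips the write if the target already exists.

    spark.stop()
  }
}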
Example 1
Source File: DefaultSource.scala From spark-snowflake with Apache License 2.0 | 7 votes |
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.streaming.SnowflakeSink
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.slf4j.LoggerFactory

// Note: the enclosing DefaultSource class declaration is elided in this excerpt.
  override def createRelation(sqlContext: SQLContext,
                              saveMode: SaveMode,
                              parameters: Map[String, String],
                              data: DataFrame): BaseRelation = {
    val params = Parameters.mergeParameters(parameters)
    // check spark version for push down
    if (params.autoPushdown) {
      SnowflakeConnectorUtils.checkVersionAndEnablePushdown(
        sqlContext.sparkSession
      )
    }
    // pass parameters to pushdown functions
    pushdowns.setGlobalParameter(params)
    val table = params.table.getOrElse {
      throw new IllegalArgumentException(
        "For save operations you must specify a Snowflake table name with the 'dbtable' parameter"
      )
    }

    def tableExists: Boolean = {
      val conn = jdbcWrapper.getConnector(params)
      try {
        jdbcWrapper.tableExists(conn, table.toString)
      } finally {
        conn.close()
      }
    }

    val (doSave, dropExisting) = saveMode match {
      case SaveMode.Append => (true, false)
      case SaveMode.Overwrite => (true, true)
      case SaveMode.ErrorIfExists =>
        if (tableExists) {
          sys.error(
            s"Table $table already exists! (SaveMode is set to ErrorIfExists)"
          )
        } else {
          (true, false)
        }
      case SaveMode.Ignore =>
        if (tableExists) {
          log.info(s"Table $table already exists -- ignoring save request.")
          (false, false)
        } else {
          (true, false)
        }
    }

    if (doSave) {
      val updatedParams = parameters.updated("overwrite", dropExisting.toString)
      new SnowflakeWriter(jdbcWrapper)
        .save(
          sqlContext,
          data,
          saveMode,
          Parameters.mergeParameters(updatedParams)
        )
    }

    createRelation(sqlContext, parameters)
  }

  override def createSink(sqlContext: SQLContext,
                          parameters: Map[String, String],
                          partitionColumns: Seq[String],
                          outputMode: OutputMode): Sink =
    new SnowflakeSink(sqlContext, parameters, partitionColumns, outputMode)
}
Example 2
Source File: JdbcRelationProvider.scala From drizzle-spark with Apache License 2.0 | 7 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val partitionColumn = jdbcOptions.partitionColumn
    val lowerBound = jdbcOptions.lowerBound
    val upperBound = jdbcOptions.upperBound
    val numPartitions = jdbcOptions.numPartitions

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt)
    }
    val parts = JDBCRelation.columnPartition(partitionInfo)
    JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val url = jdbcOptions.url
    val table = jdbcOptions.table
    val createTableOptions = jdbcOptions.createTableOptions
    val isTruncate = jdbcOptions.isTruncate

    val conn = JdbcUtils.createConnectionFactory(jdbcOptions)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, url, table)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (isTruncate && isCascadingTruncateTable(url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, table)
              saveTable(df, url, table, jdbcOptions)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, table)
              createTable(df.schema, url, table, createTableOptions, conn)
              saveTable(df, url, table, jdbcOptions)
            }

          case SaveMode.Append =>
            saveTable(df, url, table, jdbcOptions)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '$table' already exists. SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(df.schema, url, table, createTableOptions, conn)
        saveTable(df, url, table, jdbcOptions)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 3
Source File: OnErrorSuite.scala From spark-snowflake with Apache License 2.0 | 6 votes |
package net.snowflake.spark.snowflake

import net.snowflake.client.jdbc.SnowflakeSQLException
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_NAME
import org.apache.spark.sql.{DataFrame, Row, SaveMode}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class OnErrorSuite extends IntegrationSuiteBase {
  lazy val table = s"spark_test_table_$randomSuffix"

  lazy val schema = new StructType(
    Array(StructField("var", StringType, nullable = false))
  )

  lazy val df: DataFrame = sparkSession.createDataFrame(
    sc.parallelize(
      Seq(Row("{\"dsadas\nadsa\":12311}"), Row("{\"abc\":334}")) // invalid json key
    ),
    schema
  )

  override def beforeAll(): Unit = {
    super.beforeAll()
    jdbcUpdate(s"create or replace table $table(var variant)")
  }

  override def afterAll(): Unit = {
    jdbcUpdate(s"drop table $table")
    super.afterAll()
  }

  test("continue_on_error off") {
    assertThrows[SnowflakeSQLException] {
      df.write
        .format(SNOWFLAKE_SOURCE_NAME)
        .options(connectorOptionsNoTable)
        .option("dbtable", table)
        .mode(SaveMode.Append)
        .save()
    }
  }

  test("continue_on_error on") {
    df.write
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("continue_on_error", "on")
      .option("dbtable", table)
      .mode(SaveMode.Append)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_NAME)
      .options(connectorOptionsNoTable)
      .option("dbtable", table)
      .load()

    assert(result.collect().length == 1)
  }
}
Example 4
Source File: SFTableNameSuite.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake

import net.snowflake.spark.snowflake.DefaultJDBCWrapper.DataBaseOperations
import net.snowflake.spark.snowflake.Utils.SNOWFLAKE_SOURCE_SHORT_NAME
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.types.{
  IntegerType,
  StringType,
  StructField,
  StructType
}

class SFTableNameSuite extends IntegrationSuiteBase {
  lazy val tableName = s""""spark_test_table_$randomSuffix""""

  override def afterAll(): Unit = {
    conn.dropTable(tableName)
    super.afterAll()
  }

  test("table name include \"") {
    val schema = StructType(
      List(StructField("num", IntegerType), StructField("str", StringType))
    )

    val data = sc.parallelize(Seq(Row(1, "a"), Row(2, "b")))

    val df = sparkSession.createDataFrame(data, schema)

    df.write
      .format(SNOWFLAKE_SOURCE_SHORT_NAME)
      .options(connectorOptions)
      .option("dbtable", tableName)
      .mode(SaveMode.Overwrite)
      .save()

    val result = sparkSession.read
      .format(SNOWFLAKE_SOURCE_SHORT_NAME)
      .options(connectorOptions)
      .option("dbtable", tableName)
      .load()
      .count()

    assert(result == 2)
  }
}
Example 5
Source File: BigQueryImporter.scala From pg2bq with MIT License | 5 votes |
package com.powerspace.pg2bq

import com.google.cloud.bigquery.JobInfo.WriteDisposition
import com.google.cloud.bigquery._
import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

class BigQueryImporter(spark: SparkSession, tmpBucket: String, dataset: String) extends LazyLogging with DataImporter {

  val bigquery: BigQuery = BigQueryOptions.getDefaultInstance.getService

  // ensure the dataset exists or create it
  getOrCreateDataset(dataset)

  override def createOrOverride(df: DataFrame, tableName: String): Unit = {
    saveIntoGcs(df, tableName)
    loadFromGcsToBq(tableName)
  }

  private def loadFromGcsToBq(tableName: String): Unit = {
    val configuration = LoadJobConfiguration
      .builder(TableId.of(dataset, tableName), s"gs://$tmpBucket/$tableName/*.avro")
      .setFormatOptions(FormatOptions.avro())
      .setWriteDisposition(WriteDisposition.WRITE_TRUNCATE)
      .build()

    val job = bigquery.create(JobInfo.newBuilder(configuration).build())

    logger.info(s"Importing $tableName from bucket $tmpBucket to dataset $dataset...")
    job.waitFor()
    logger.info(s"$tableName import done!")
  }

  private def saveIntoGcs(df: DataFrame, tableName: String): Unit = {
    df.write
      .mode(SaveMode.Overwrite)
      .format("com.databricks.spark.avro")
      .save(s"gs://$tmpBucket/$tableName")
  }

  def getOrCreateDataset(datasetName: String): Dataset = {
    scala.Option(bigquery.getDataset(datasetName)) match {
      case Some(ds) =>
        logger.info(s"Dataset $datasetName already exists.")
        ds
      case None =>
        logger.info(s"Dataset $datasetName does not exist, creating...")
        val ds = bigquery.create(DatasetInfo.of(datasetName))
        logger.info(s"Dataset $datasetName created!")
        ds
    }
  }
}
Example 6
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
}
Example 7
Source File: CarbonDataFrameExample.scala From CarbonDataLearning with GNU General Public License v3.0 | 5 votes |
package org.github.xubo245.carbonDataLearning.example

import org.apache.carbondata.examples.util.ExampleUtils
import org.apache.spark.sql.{SaveMode, SparkSession}

object CarbonDataFrameExample {

  def main(args: Array[String]) {
    val spark = ExampleUtils.createCarbonSession("CarbonDataFrameExample")
    exampleBody(spark)
    spark.close()
  }

  def exampleBody(spark : SparkSession): Unit = {
    // Writes Dataframe to CarbonData file:
    import spark.implicits._
    val df = spark.sparkContext.parallelize(1 to 100)
      .map(x => ("a" + x % 10, "b", x))
      .toDF("c1", "c2", "number")

    // Saves dataframe to carbondata file
    df.write
      .format("carbondata")
      .option("tableName", "carbon_df_table")
      .option("partitionColumns", "c1") // a list of column names
      .mode(SaveMode.Overwrite)
      .save()

    spark.sql(""" SELECT * FROM carbon_df_table """).show()

    spark.sql("SHOW PARTITIONS carbon_df_table").show()

    // Specify schema
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
    val customSchema = StructType(Array(
      StructField("c1", StringType),
      StructField("c2", StringType),
      StructField("number", IntegerType)))

    // Reads carbondata to dataframe
    val carbondf = spark.read
      .format("carbondata")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "carbon_df_table")
      .load()

    df.write
      .format("csv")
      .option("tableName", "csv_df_table")
      .option("partitionColumns", "c1") // a list of column names
      // .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .mode(SaveMode.Overwrite)
      .csv("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv/1.csv")

    // Reads csv to dataframe
    val carbondf2 = spark.read
      .format("csv")
      .schema(customSchema)
      // .option("dbname", "db_name") the system will use "default" as dbname if not set this option
      .option("tableName", "csv_df_table")
      // .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ")
      .load("/Users/xubo/Desktop/xubo/git/carbondata3/examples/spark2/target/csv")
    carbondf2.show()

    // Dataframe operations
    carbondf.printSchema()
    carbondf.select($"c1", $"number" + 10).show()
    carbondf.filter($"number" > 31).show()

    spark.sql("DROP TABLE IF EXISTS carbon_df_table")
  }
}
Example 8
Source File: SFObjectWriter.scala From spark-salesforce with Apache License 2.0 | 5 votes |
package com.springml.spark.salesforce

import org.apache.log4j.Logger
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SaveMode}
import com.springml.salesforce.wave.api.APIFactory
import com.springml.salesforce.wave.api.BulkAPI
import com.springml.salesforce.wave.util.WaveAPIConstants
import com.springml.salesforce.wave.model.JobInfo

class SFObjectWriter (
    val username: String,
    val password: String,
    val login: String,
    val version: String,
    val sfObject: String,
    val mode: SaveMode,
    val upsert: Boolean,
    val externalIdFieldName: String,
    val csvHeader: String
    ) extends Serializable {

  @transient val logger = Logger.getLogger(classOf[SFObjectWriter])

  def writeData(rdd: RDD[Row]): Boolean = {
    val csvRDD = rdd.map(row => row.toSeq.map(value => Utils.rowValue(value)).mkString(","))

    val jobInfo = new JobInfo(WaveAPIConstants.STR_CSV, sfObject, operation(mode, upsert))
    jobInfo.setExternalIdFieldName(externalIdFieldName)

    val jobId = bulkAPI.createJob(jobInfo).getId

    csvRDD.mapPartitionsWithIndex {
      case (index, iterator) => {
        val records = iterator.toArray.mkString("\n")
        var batchInfoId : String = null
        if (records != null && !records.isEmpty()) {
          val data = csvHeader + "\n" + records
          val batchInfo = bulkAPI.addBatch(jobId, data)
          batchInfoId = batchInfo.getId
        }

        val success = (batchInfoId != null)
        // Job status will be checked after completing all batches
        List(success).iterator
      }
    }.reduce((a, b) => a & b)

    bulkAPI.closeJob(jobId)

    var i = 1
    while (i < 999999) {
      if (bulkAPI.isCompleted(jobId)) {
        logger.info("Job completed")
        return true
      }

      logger.info("Job not completed, waiting...")
      Thread.sleep(200)
      i = i + 1
    }

    print("Returning false...")
    logger.info("Job not completed. Timeout...")
    false
  }

  // Create new instance of BulkAPI every time because Spark workers cannot serialize the object
  private def bulkAPI(): BulkAPI = {
    APIFactory.getInstance().bulkAPI(username, password, login, version)
  }

  private def operation(mode: SaveMode, upsert: Boolean): String = {
    if (upsert) {
      "upsert"
    } else if (mode != null && SaveMode.Overwrite.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_UPDATE
    } else if (mode != null && SaveMode.Append.name().equalsIgnoreCase(mode.name())) {
      WaveAPIConstants.STR_INSERT
    } else {
      logger.warn("SaveMode " + mode + " Not supported. Using 'insert' operation")
      WaveAPIConstants.STR_INSERT
    }
  }
}
Example 9
Source File: DefaultSource.scala From spark-cdm with MIT License | 5 votes |
package com.microsoft.cdm

import java.util.Optional

import com.microsoft.cdm.read.CDMDataSourceReader
import com.microsoft.cdm.utils.{AADProvider, ADLGen2Provider, Constants, DataConverter}
import com.microsoft.cdm.write.CDMDataSourceWriter
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.sources.v2._
import org.apache.spark.sql.sources.v2.writer.DataSourceWriter
import org.apache.spark.sql.types.StructType

// Note: the enclosing class declaration and helper methods (e.g. getDataStorage) are elided in this excerpt.
  def createWriter(jobId: String,
                   schema: StructType,
                   mode: SaveMode,
                   options: DataSourceOptions): Optional[DataSourceWriter] = {
    val modelDirectory = options.get("cdmFolder").get()
    val modelName = options.get("cdmModelName").get()
    val entity = options.get("entity").get()

    Optional.of(new CDMDataSourceWriter(jobId, schema, mode, getDataStorage(options),
      modelDirectory, modelName, entity, new DataConverter()))
  }
}
Example 10
Source File: CDMDataSourceWriter.scala From spark-cdm with MIT License | 5 votes |
package com.microsoft.cdm.write

import com.microsoft.cdm.utils._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, WriterCommitMessage}
import org.apache.spark.sql.types.StructType

class CDMDataSourceWriter(val jobId: String,
                          val schema: StructType,
                          val mode: SaveMode,
                          val adlProvider: ADLGen2Provider,
                          val modelDirectory: String,
                          val modelName: String,
                          val entityName: String,
                          val dataConvert: DataConverter) extends DataSourceWriter {

  def createWriterFactory: DataWriterFactory[InternalRow] = {
    new CDMDataWriterFactory(adlProvider, schema, jobId, modelDirectory, entityName)
  }

  private val createNewModel = (modelUri: String, attributes: Seq[CDMAttribute], partitions: Seq[CDMPartition]) => {
    CDMModel.createNewModel(modelName, entityName, attributes, partitions).toJson
  }

  private val appendExistingModel = (modelUri: String, attributes: Seq[CDMAttribute], partitions: Seq[CDMPartition]) => {
    val existingModel = new CDMModel(adlProvider.getFullFile(modelUri))
    existingModel.appendOrReplaceEntity(entityName, attributes, partitions)
    existingModel.toJson
  }

  def commit(messages: Array[WriterCommitMessage]): Unit = {
    val partitions = messages.map{ message =>
      val csvMsg = message.asInstanceOf[CSVCommitMessage]
      new CDMPartition(name=csvMsg.name, location=csvMsg.csvLocation)
    }

    val attributes = schema.map{ col =>
      new CDMAttribute(col.name, dataConvert.toCdmType(col.dataType))
    }

    // Check if there's an existing model in this directory to append to
    val modelUri = adlProvider.getModelJsonInDirectory(modelDirectory)
    val modelJson = (if(adlProvider.fileExists(modelUri)) appendExistingModel else createNewModel)(modelUri, attributes, partitions)

    adlProvider.uploadData(modelJson, modelUri)
  }

  // TODO: error handling
  def abort(messages: Array[WriterCommitMessage]): Unit = {}
}
Example 11
Source File: PutHiveMode.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.hive

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}

class PutHiveMode extends ConfigurableStop {

  val authorEmail: String = "[email protected]"
  val description: String = "Modes for saving data to hive"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var database:String = _
  var table:String = _
  var saveMode:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val inDF = in.read()

    inDF.write.format("hive").mode(saveMode).saveAsTable(database + "." + table)
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  def setProperties(map : Map[String, Any]) = {
    database = MapUtil.get(map,"database").asInstanceOf[String]
    table = MapUtil.get(map,"table").asInstanceOf[String]
    saveMode = MapUtil.get(map,"saveMode").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    val saveModeOption = Set("append","overwrite","error","ignore")

    var descriptor : List[PropertyDescriptor] = List()

    val database=new PropertyDescriptor()
      .name("database")
      .displayName("DataBase")
      .description("The database name")
      .defaultValue("")
      .required(true)
      .example("test")
    descriptor = database :: descriptor

    val table = new PropertyDescriptor()
      .name("table")
      .displayName("Table")
      .description("The table name")
      .defaultValue("")
      .required(true)
      .example("student")
    descriptor = table :: descriptor

    val saveMode = new PropertyDescriptor()
      .name("saveMode")
      .displayName("SaveMode")
      .description("The save mode for table")
      .allowableValues(saveModeOption)
      .defaultValue("append")
      .required(true)
      .example("append")
    descriptor = saveMode :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/hive/PutHiveMode.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.HiveGroup)
  }
}
Example 12
Source File: MysqlWrite.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.jdbc

import java.util.Properties

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.beans.BeanProperty

class MysqlWrite extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Write data to mysql database with jdbc"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var url:String = _
  var user:String = _
  var password:String = _
  var dbtable:String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val spark = pec.get[SparkSession]()
    val jdbcDF = in.read()
    val properties = new Properties()
    properties.put("user", user)
    properties.put("password", password)
    jdbcDF.write.mode(SaveMode.Append).jdbc(url,dbtable,properties)

    out.write(jdbcDF)
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  override def setProperties(map: Map[String, Any]): Unit = {
    url = MapUtil.get(map,"url").asInstanceOf[String]
    user = MapUtil.get(map,"user").asInstanceOf[String]
    password = MapUtil.get(map,"password").asInstanceOf[String]
    dbtable = MapUtil.get(map,"dbtable").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    val url=new PropertyDescriptor()
      .name("url")
      .displayName("Url")
      .description("The Url, for example jdbc:mysql://127.0.0.1/dbname")
      .defaultValue("")
      .required(true)
      .example("jdbc:mysql://127.0.0.1/dbname")
    descriptor = url :: descriptor

    val user=new PropertyDescriptor()
      .name("user")
      .displayName("User")
      .description("The user name of database")
      .defaultValue("")
      .required(true)
      .example("root")
    descriptor = user :: descriptor

    val password=new PropertyDescriptor()
      .name("password")
      .displayName("Password")
      .description("The password of database")
      .defaultValue("")
      .required(true)
      .example("123456")
      .sensitive(true)
    descriptor = password :: descriptor

    val dbtable=new PropertyDescriptor()
      .name("dbtable")
      .displayName("DBTable")
      .description("The table you want to write")
      .defaultValue("")
      .required(true)
      .example("test")
    descriptor = dbtable :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/jdbc/MysqlWrite.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.JdbcGroup)
  }
}
Example 13
Source File: JsonSave.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.json

import cn.piflow._
import cn.piflow.conf._
import cn.piflow.conf.bean.PropertyDescriptor
import cn.piflow.conf.util.{ImageUtil, MapUtil}
import org.apache.spark.sql.SaveMode

import scala.beans.BeanProperty

class JsonSave extends ConfigurableStop{

  val authorEmail: String = "[email protected]"
  val description: String = "Save data into json file"
  val inportList: List[String] = List(Port.DefaultPort)
  val outportList: List[String] = List(Port.DefaultPort)

  var jsonSavePath: String = _

  def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {
    val jsonDF = in.read()
    jsonDF.write.format("json").mode(SaveMode.Overwrite).save(jsonSavePath)
  }

  def initialize(ctx: ProcessContext): Unit = {
  }

  override def setProperties(map: Map[String, Any]): Unit = {
    jsonSavePath = MapUtil.get(map,"jsonSavePath").asInstanceOf[String]
  }

  override def getPropertyDescriptor(): List[PropertyDescriptor] = {
    var descriptor : List[PropertyDescriptor] = List()

    val jsonSavePath = new PropertyDescriptor()
      .name("jsonSavePath")
      .displayName("JsonSavePath")
      .description("The save path of the json file")
      .defaultValue("")
      .required(true)
      .example("hdfs://192.168.3.138:8020/work/testJson/test/")
    descriptor = jsonSavePath :: descriptor

    descriptor
  }

  override def getIcon(): Array[Byte] = {
    ImageUtil.getImage("icon/json/JsonSave.png")
  }

  override def getGroup(): List[String] = {
    List(StopGroup.JsonGroup)
  }
}
Example 14
Source File: ReadFromRedisTest.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.redis

import java.net.InetAddress

import cn.piflow.Runner
import cn.piflow.conf.bean.FlowBean
import cn.piflow.conf.util.{FileUtil, OptionUtil}
import cn.piflow.util.{PropertyUtil, ServerIpUtil}
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.h2.tools.Server
import org.junit.Test

import scala.util.parsing.json.JSON

class ReadFromRedisTest {

  @Test
  def testFlow(): Unit ={

    //parse flow json
    val file = "src/main/resources/flow/redis/ReadFromRedis.json"
    val flowJsonStr = FileUtil.fileReader(file)
    val map = OptionUtil.getAny(JSON.parseFull(flowJsonStr)).asInstanceOf[Map[String, Any]]
    println(map)

    //create flow
    val flowBean = FlowBean(map)
    val flow = flowBean.constructFlow()

    val ip = InetAddress.getLocalHost.getHostAddress
    cn.piflow.util.FileUtil.writeFile("server.ip=" + ip, ServerIpUtil.getServerIpFile())
    val h2Server = Server.createTcpServer("-tcp", "-tcpAllowOthers", "-tcpPort", "50001").start()

    //execute flow
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("CsvParserTest")
      .config("spark.driver.memory", "1g")
      .config("spark.executor.memory", "2g")
      .config("spark.cores.max", "2")
      .config("hive.metastore.uris",PropertyUtil.getPropertyValue("hive.metastore.uris"))
      .enableHiveSupport()
      .getOrCreate()

    val process = Runner.create()
      .bind(classOf[SparkSession].getName, spark)
      .bind("checkpoint.path", "")
      .bind("debug.path","")
      .start(flow);

    process.awaitTermination();
    val pid = process.pid();
    println(pid + "!!!!!!!!!!!!!!!!!!!!!")
    spark.close();
  }
}
Example 15
Source File: ScalaRiakParquetExample.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.examples.parquet

import org.apache.spark.sql.{SaveMode, SQLContext}
import org.apache.spark.{SparkContext, SparkConf}

object ScalaRiakParquetExample {

  case class TSData(site: String, species: String, measurementDate: Long,
                    latitude: Double, longitude: Double, value: Double)

  val startDate = System.currentTimeMillis()
  val endDate = startDate + 100
  val tableName = "parquet_demo"
  val parquetFileName = "riak-ts-data.parquet"

  val testData = Seq(
    TSData("MY7", "PM10", startDate, 51.52254, -0.15459, 41.4),
    TSData("MY7", "PM10", startDate + 10, 51.52254, -0.15459, 41.2),
    TSData("MY7", "PM10", startDate + 20, 51.52254, -0.15459, 39.1),
    TSData("MY7", "PM10", startDate + 30, 51.52254, -0.15459, 39.5),
    TSData("MY7", "PM10", startDate + 40, 51.52254, -0.15459, 29.9),
    TSData("MY7", "PM10", startDate + 50, 51.52254, -0.15459, 34.2),
    TSData("MY7", "PM10", startDate + 60, 51.52254, -0.15459, 28.5),
    TSData("MY7", "PM10", startDate + 70, 51.52254, -0.15459, 39.6),
    TSData("MY7", "PM10", startDate + 80, 51.52254, -0.15459, 29.2),
    TSData("MY7", "PM10", startDate + 90, 51.52254, -0.15459, 31.3)
  )

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf()
      .setAppName("Simple Scala Riak TS Demo")

    setSparkOpt(sparkConf, "spark.master", "local")
    setSparkOpt(sparkConf, "spark.riak.connection.host", "127.0.0.1:8087")

    println(s"Test data start time: $startDate")

    val sc = new SparkContext(sparkConf)
    val sqlCtx = SQLContext.getOrCreate(sc)

    import sqlCtx.implicits._

    val rdd = sc.parallelize(testData)
    rdd.toDF().write.format("org.apache.spark.sql.riak")
      .mode(SaveMode.Append).save(tableName)

    val df = sqlCtx.read.format("org.apache.spark.sql.riak")
      .load(tableName).registerTempTable(tableName)

    val from = (startDate / 1000).toInt
    val query = s"select * from $tableName where measurementDate >= CAST($from AS TIMESTAMP) " +
      s"AND measurementDate <= CAST(${from + 1} AS TIMESTAMP) AND site = 'MY7' AND species = 'PM10'"

    println(s"Query: $query")

    val rows = sqlCtx.sql(query)
    rows.show()
    val schema = rows.schema

    rows.write.mode("overwrite").parquet(parquetFileName)
    println(s"Data was successfully saved to Parquet file: $parquetFileName")

    val parquetFile = sqlCtx.read.parquet(parquetFileName)
    parquetFile.registerTempTable("parquetFile")
    val data = sqlCtx.sql("SELECT MAX(value) max_value FROM parquetFile ")
    println("Maximum value retrieved from Parquet file:")
    data.show()
  }

  private def setSparkOpt(sparkConf: SparkConf, option: String, defaultOptVal: String): SparkConf = {
    val optval = sparkConf.getOption(option).getOrElse(defaultOptVal)
    sparkConf.set(option, optval)
  }
}
Example 16
Source File: SolrDataFrameImplicits.scala From spark-solr with Apache License 2.0 | 5 votes |
package com.lucidworks.spark.util

import org.apache.spark.sql.{DataFrameReader, DataFrameWriter, Row, SaveMode}

object SolrDataFrameImplicits {

  implicit class SolrReader(reader: DataFrameReader) {
    def solr(collection: String, query: String = "*:*") =
      reader.format("solr").option("collection", collection).option("query", query).load()
    def solr(collection: String, options: Map[String, String]) =
      reader.format("solr").option("collection", collection).options(options).load()
  }

  implicit class SolrWriter(writer: DataFrameWriter[Row]) {
    def solr(collectionName: String,
             softCommitSecs: Int = 10,
             overwrite: Boolean = false,
             format: String = "solr") = {
      writer
        .format(format)
        .option("collection", collectionName)
        .option("soft_commit_secs", softCommitSecs.toString)
        .mode(if(overwrite) SaveMode.Overwrite else SaveMode.Append)
        .save()
    }
  }
}
Example 17
Source File: DefaultSource.scala From spark-solr with Apache License 2.0 | 5 votes |
package solr

import com.lucidworks.spark.{SolrRelation, SolrStreamWriter}
import com.lucidworks.spark.util.Constants
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.streaming.OutputMode

class DefaultSource extends RelationProvider with CreatableRelationProvider with StreamSinkProvider with DataSourceRegister {

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = {
    try {
      new SolrRelation(parameters, sqlContext.sparkSession)
    } catch {
      case re: RuntimeException => throw re
      case e: Exception => throw new RuntimeException(e)
    }
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    try {
      // TODO: What to do with the saveMode?
      val solrRelation: SolrRelation = new SolrRelation(parameters, Some(df), sqlContext.sparkSession)
      solrRelation.insert(df, overwrite = true)
      solrRelation
    } catch {
      case re: RuntimeException => throw re
      case e: Exception => throw new RuntimeException(e)
    }
  }

  override def shortName(): String = Constants.SOLR_FORMAT

  override def createSink(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      partitionColumns: Seq[String],
      outputMode: OutputMode): Sink = {
    new SolrStreamWriter(sqlContext.sparkSession, parameters, partitionColumns, outputMode)
  }
}
Example 18
Source File: HDFSBase.scala From daf with BSD 3-Clause "New" or "Revised" License | 5 votes |
package daf.util

import better.files.{ File, _ }
import daf.util.DataFrameClasses.{ Address, Person }
import org.apache.hadoop.fs.FileSystem
import org.apache.hadoop.hdfs.{ HdfsConfiguration, MiniDFSCluster }
import org.apache.hadoop.test.PathUtils
import org.apache.spark.sql.{ SaveMode, SparkSession }
import org.scalatest.{ BeforeAndAfterAll, FlatSpec, Matchers }
import org.slf4j.LoggerFactory

import scala.util.{ Failure, Random, Try }

abstract class HDFSBase extends FlatSpec with Matchers with BeforeAndAfterAll {

  var miniCluster: Try[MiniDFSCluster] = Failure[MiniDFSCluster](new Exception)

  var fileSystem: Try[FileSystem] = Failure[FileSystem](new Exception)

  val sparkSession: SparkSession = SparkSession.builder().master("local").getOrCreate()

  val alogger = LoggerFactory.getLogger(this.getClass)

  val (testDataPath, confPath) = {
    val testDataPath = s"${PathUtils.getTestDir(this.getClass).getCanonicalPath}/MiniCluster"
    val confPath = s"$testDataPath/conf"
    (
      testDataPath.toFile.createIfNotExists(asDirectory = true, createParents = false),
      confPath.toFile.createIfNotExists(asDirectory = true, createParents = false)
    )
  }

  def pathAvro = "opendata/test.avro"
  def pathParquet = "opendata/test.parquet"
  def pathCsv = "opendata/test.csv"

  def getSparkSession = sparkSession

  override def beforeAll(): Unit = {

    val conf = new HdfsConfiguration()
    conf.setBoolean("dfs.permissions", true)
    System.clearProperty(MiniDFSCluster.PROP_TEST_BUILD_DATA)

    conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, testDataPath.pathAsString)
    //FileUtil.fullyDelete(testDataPath.toJava)

    conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.groups", "*")
    conf.set(s"hadoop.proxyuser.${System.getProperties.get("user.name")}.hosts", "*")

    val builder = new MiniDFSCluster.Builder(conf)
    miniCluster = Try(builder.build())
    fileSystem = miniCluster.map(_.getFileSystem)
    fileSystem.foreach(fs => {
      val confFile: File = confPath / "hdfs-site.xml"
      for { os <- confFile.newOutputStream.autoClosed } fs.getConf.writeXml(os)
    })

    writeDf()
  }

  override def afterAll(): Unit = {
    miniCluster.foreach(_.shutdown(true))
    val _ = testDataPath.parent.parent.delete(true)
    sparkSession.stop()
  }

  private def writeDf(): Unit = {
    import sparkSession.implicits._

    alogger.info(s"TestDataPath ${testDataPath.toJava.getAbsolutePath}")
    alogger.info(s"ConfPath ${confPath.toJava.getAbsolutePath}")

    val persons = (1 to 10).map(i => Person(s"Andy$i", Random.nextInt(85), Address("Via Ciccio Cappuccio")))
    val caseClassDS = persons.toDS()
    caseClassDS.write.format("parquet").mode(SaveMode.Overwrite).save(pathParquet)
    caseClassDS.write.format("com.databricks.spark.avro").mode(SaveMode.Overwrite).save(pathAvro)
    //writing directly the Person dataframe generates an exception
    caseClassDS.toDF.select("name", "age").write.format("csv").mode(SaveMode.Overwrite).option("header", "true").save(pathCsv)
  }
}

object DataFrameClasses {

  final case class Address(street: String)

  final case class Person(name: String, age: Int, address: Address)
}
Example 19
Source File: Target.scala From almaren-framework with Apache License 2.0 | 5 votes |
package com.github.music.of.the.ainur.almaren.state.core

import com.github.music.of.the.ainur.almaren.State
import com.github.music.of.the.ainur.almaren.util.Constants
import org.apache.spark.sql.{DataFrame, SaveMode}

private[almaren] abstract class Target extends State {
  override def executor(df: DataFrame): DataFrame = target(df)
  def target(df: DataFrame): DataFrame
}

case class TargetSql(sql: String) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"sql:{$sql}")
    df.createOrReplaceTempView(Constants.TempTableName)
    val sqlDf = df.sqlContext.sql(sql)
    df
  }
}

case class TargetJdbc(url: String, driver: String, dbtable: String, user: Option[String], password: Option[String], saveMode: SaveMode, params: Map[String, String]) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"url:{$url}, driver:{$driver}, dbtable:{$dbtable}, user:{$user}, params:{$params}")

    val options = (user, password) match {
      case (Some(user), None) => params + ("user" -> user)
      case (Some(user), Some(password)) => params + ("user" -> user, "password" -> password)
      case (_, _) => params
    }

    df.write.format("jdbc")
      .option("url", url)
      .option("driver", driver)
      .option("dbtable", dbtable)
      .options(options)
      .mode(saveMode)
      .save()
    df
  }
}

case class TargetKafka(servers: String, options: Map[String, String]) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"options: $options")

    df.write
      .format("kafka")
      .option("kafka.bootstrap.servers", servers)
      .options(options)
      .save()
    df
  }
}

case class TargetFile(
    format: String,
    path: String,
    params: Map[String, String],
    saveMode: SaveMode) extends Target {
  override def target(df: DataFrame): DataFrame = {
    logger.info(s"format:{$format}, path:{$path}, params:{$params}")

    df.write
      .format(format)
      .options(params)
      .mode(saveMode) // apply the configured SaveMode
      .save(path)     // write to the configured path
    df
  }
}
Example 20
Source File: DefaultSource.scala From Linkis with Apache License 2.0 | 5 votes |
package com.webank.wedatasphere.spark.excel

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {

  override def createRelation(
    sqlContext: SQLContext,
    parameters: Map[String, String],
    schema: StructType
  ): ExcelRelation = {
    ExcelRelation(
      location = checkParameter(parameters, "path"),
      sheetName = parameters.get("sheetName"),
      useHeader = checkParameter(parameters, "useHeader").toBoolean,
      treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(true)(_.toBoolean),
      userSchema = Option(schema),
      inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
      addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
      startColumn = parameters.get("startColumn").fold(0)(_.toInt),
      endColumn = parameters.get("endColumn").fold(Int.MaxValue)(_.toInt),
      timestampFormat = parameters.get("timestampFormat"),
      maxRowsInMemory = parameters.get("maxRowsInMemory").map(_.toInt),
      excerptSize = parameters.get("excerptSize").fold(10)(_.toInt),
      parameters = parameters,
      dateFormat = parameters.get("dateFormats").getOrElse("yyyy-MM-dd").split(";").toList
    )(sqlContext)
  }

  override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame
  ): BaseRelation = {
    val path = checkParameter(parameters, "path")
    val sheetName = parameters.getOrElse("sheetName", "Sheet1")
    val useHeader = checkParameter(parameters, "useHeader").toBoolean
    val dateFormat = parameters.getOrElse("dateFormat", ExcelFileSaver.DEFAULT_DATE_FORMAT)
    val timestampFormat = parameters.getOrElse("timestampFormat", ExcelFileSaver.DEFAULT_TIMESTAMP_FORMAT)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    fs.setWriteChecksum(false)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          sys.error(s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          sys.error(s"path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }
    if (doSave) {
      // Only save data when the save mode is not ignore.
      (new ExcelFileSaver(fs)).save(
        filesystemPath,
        data,
        sheetName = sheetName,
        useHeader = useHeader,
        dateFormat = dateFormat,
        timestampFormat = timestampFormat
      )
    }

    createRelation(sqlContext, parameters, data.schema)
  }

  // Forces a Parameter to exist, otherwise an exception is thrown.
  private def checkParameter(map: Map[String, String], param: String): String = {
    if (!map.contains(param)) {
      throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.")
    } else {
      map.apply(param)
    }
  }

  // Gets the Parameter if it exists, otherwise returns the default argument
  private def parameterOrDefault(map: Map[String, String], param: String, default: String) =
    map.getOrElse(param, default)
}
Example 21
Source File: DefaultSource.scala From spark-redis with BSD 3-Clause "New" or "Revised" License | 5 votes |
package org.apache.spark.sql.redis

import org.apache.spark.sql.SaveMode.{Append, ErrorIfExists, Ignore, Overwrite}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

class DefaultSource extends RelationProvider with SchemaRelationProvider
  with CreatableRelationProvider {

  override def createRelation(sqlContext: SQLContext,
                              parameters: Map[String, String]): BaseRelation = {
    new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None)
  }

  override def createRelation(sqlContext: SQLContext, mode: SaveMode,
                              parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val relation = new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = None)
    mode match {
      case Append => relation.insert(data, overwrite = false)
      case Overwrite => relation.insert(data, overwrite = true)
      case ErrorIfExists =>
        if (relation.nonEmpty) {
          throw new IllegalStateException("SaveMode is set to ErrorIfExists and dataframe " +
            "already exists in Redis and contains data.")
        }
        relation.insert(data, overwrite = false)
      case Ignore =>
        if (relation.isEmpty) {
          relation.insert(data, overwrite = false)
        }
    }

    relation
  }

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String],
                              schema: StructType): BaseRelation =
    new RedisSourceRelation(sqlContext, parameters, userSpecifiedSchema = Some(schema))
}
Example 22
Source File: DefaultSource.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml

import org.apache.hadoop.fs.Path
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import com.databricks.spark.xml.util.XmlFile

// Note: the enclosing DefaultSource class declaration and the checkPath helper are elided in this excerpt.
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): XmlRelation = {
    val path = checkPath(parameters)
    // We need the `charset` and `rowTag` before creating the relation.
    val (charset, rowTag) = {
      val options = XmlOptions(parameters)
      (options.charset, options.rowTag)
    }

    XmlRelation(
      () => XmlFile.withCharset(sqlContext.sparkContext, path, charset, rowTag),
      Some(path),
      parameters,
      schema)(sqlContext)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    val path = checkPath(parameters)
    val filesystemPath = new Path(path)
    val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
    val doSave = if (fs.exists(filesystemPath)) {
      mode match {
        case SaveMode.Append =>
          throw new IllegalArgumentException(
            s"Append mode is not supported by ${this.getClass.getCanonicalName}")
        case SaveMode.Overwrite =>
          fs.delete(filesystemPath, true)
          true
        case SaveMode.ErrorIfExists =>
          throw new IllegalArgumentException(s"path $path already exists.")
        case SaveMode.Ignore => false
      }
    } else {
      true
    }
    if (doSave) {
      // Only save data when the save mode is not ignore.
      XmlFile.saveAsXmlFile(data, filesystemPath.toString, parameters)
    }

    createRelation(sqlContext, parameters, data.schema)
  }
}
Example 23
Source File: DataGenerator.scala From iterative-broadcast-join with Apache License 2.0 | 5 votes |
package com.godatadriven.generator

import com.godatadriven.common.Config
import com.godatadriven.generator.UniformDataGenerator.KeyLabel
import org.apache.spark.sql.{SaveMode, SparkSession}

import scala.util.Random

trait DataGenerator {

  def numberOfRows(numberOfKeys: Int = Config.numberOfKeys,
                   keysMultiplier: Int = Config.keysMultiplier): Long =
    generateSkewedSequence(numberOfKeys).map(_._2).sum * keysMultiplier.toLong

  def generateSkewedSequence(numberOfKeys: Int): List[(Int, Int)] =
    (0 to numberOfKeys).par.map(i =>
      (i, Math.ceil(
        (numberOfKeys.toDouble - i.toDouble) / (i.toDouble + 1.0)
      ).toInt)
    ).toList

  def createMediumTable(spark: SparkSession, tableName: String, numberOfPartitions: Int): Unit = {
    import spark.implicits._

    val df = spark
      .read
      .parquet("table_large.parquet")
      .as[Int]
      .distinct()
      .mapPartitions(rows => {
        val r = new Random()
        rows.map(key =>
          KeyLabel(
            key,
            s"Description for entry $key, that can be anything",
            // Already preallocate the pass of the broadcast iteration here
            Math.floor(r.nextDouble() * Config.numberOfBroadcastPasses).toInt
          )
        )
      })
      .repartition(numberOfPartitions)

    assert(df.count() == Config.numberOfKeys)

    df
      .write
      .mode(SaveMode.Overwrite)
      .parquet(tableName)
  }

  def buildTestset(spark: SparkSession,
                   numberOfKeys: Int = Config.numberOfKeys,
                   keysMultiplier: Int = Config.keysMultiplier,
                   numberOfPartitions: Int = Config.numberOfPartitions): Unit

  def getName: String

  def getMediumTableName: String

  def getLargeTableName: String
}
Example 24
Source File: SparkUtil.scala From iterative-broadcast-join with Apache License 2.0 | 5 votes |
package com.godatadriven

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

object SparkUtil {

  def dfWrite(df: DataFrame, name: String): Unit =
    df
      .write
      .mode(SaveMode.Overwrite)
      .parquet(name)

  def dfRead(spark: SparkSession, name: String): DataFrame =
    spark
      .read
      .load(name)
}
Example 25
Source File: CreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand

case class CreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  private val tableIdentifier = tableDesc.identifier

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog
    if (catalog.tableExists(tableIdentifier)) {
      assert(mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      // For CTAS, there is no static partition values to insert.
      val partition = tableDesc.partitionColumnNames.map(_ -> None).toMap
      InsertIntoHiveTable(
        tableDesc,
        partition,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(tableDesc.schema.isEmpty)
      catalog.createTable(
        tableDesc.copy(schema = outputColumns.toStructType), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        InsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[Database:${tableDesc.database}, " +
    s"TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 26
Source File: XSQLCreateHiveTableAsSelectCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.xsql.execution.command

import scala.util.control.NonFatal

import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.xsql.XSQLSessionCatalog

case class XSQLCreateHiveTableAsSelectCommand(
    tableDesc: CatalogTable,
    query: LogicalPlan,
    outputColumnNames: Seq[String],
    mode: SaveMode)
  extends DataWritingCommand {

  override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
    val catalog = sparkSession.sessionState.catalog.asInstanceOf[XSQLSessionCatalog]
    val tableIdentifier = catalog.getUsedTableIdentifier(tableDesc.identifier)
    val newTableDesc = tableDesc.copy(identifier = tableIdentifier)
    if (catalog.tableExists(tableIdentifier)) {
      assert(
        mode != SaveMode.Overwrite,
        s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")

      if (mode == SaveMode.ErrorIfExists) {
        throw new AnalysisException(s"$tableIdentifier already exists.")
      }
      if (mode == SaveMode.Ignore) {
        // Since the table already exists and the save mode is Ignore, we will just return.
        return Seq.empty
      }

      XSQLInsertIntoHiveTable(
        newTableDesc,
        Map.empty,
        query,
        overwrite = false,
        ifPartitionNotExists = false,
        outputColumnNames = outputColumnNames).run(sparkSession, child)
    } else {
      // TODO ideally, we should get the output data ready first and then
      // add the relation into catalog, just in case of failure occurs while data
      // processing.
      assert(newTableDesc.schema.isEmpty)
      catalog.createTable(newTableDesc.copy(schema = query.schema), ignoreIfExists = false)

      try {
        // Read back the metadata of the table which was created just now.
        val createdTableMeta = catalog.getTableMetadata(newTableDesc.identifier)
        // For CTAS, there is no static partition values to insert.
        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
        XSQLInsertIntoHiveTable(
          createdTableMeta,
          partition,
          query,
          overwrite = true,
          ifPartitionNotExists = false,
          outputColumnNames = outputColumnNames).run(sparkSession, child)
      } catch {
        case NonFatal(e) =>
          // drop the created table.
          catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false)
          throw e
      }
    }

    Seq.empty[Row]
  }

  override def argString: String = {
    s"[TableName: ${tableDesc.identifier.table}, " +
    s"InsertIntoHiveTable]"
  }
}
Example 27
Source File: JdbcRelationProvider.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc

import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider}

class JdbcRelationProvider extends CreatableRelationProvider
  with RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val jdbcOptions = new JDBCOptions(parameters)
    val resolver = sqlContext.conf.resolver
    val timeZoneId = sqlContext.conf.sessionLocalTimeZone
    val schema = JDBCRelation.getSchema(resolver, jdbcOptions)
    val parts = JDBCRelation.columnPartition(schema, resolver, timeZoneId, jdbcOptions)
    JDBCRelation(schema, parts, jdbcOptions)(sqlContext.sparkSession)
  }

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      df: DataFrame): BaseRelation = {
    val options = new JdbcOptionsInWrite(parameters)
    val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis

    val conn = JdbcUtils.createConnectionFactory(options)()
    try {
      val tableExists = JdbcUtils.tableExists(conn, options)
      if (tableExists) {
        mode match {
          case SaveMode.Overwrite =>
            if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) {
              // In this case, we should truncate table and then load.
              truncateTable(conn, options)
              val tableSchema = JdbcUtils.getSchemaOption(conn, options)
              saveTable(df, tableSchema, isCaseSensitive, options)
            } else {
              // Otherwise, do not truncate the table, instead drop and recreate it
              dropTable(conn, options.table, options)
              createTable(conn, df, options)
              saveTable(df, Some(df.schema), isCaseSensitive, options)
            }

          case SaveMode.Append =>
            val tableSchema = JdbcUtils.getSchemaOption(conn, options)
            saveTable(df, tableSchema, isCaseSensitive, options)

          case SaveMode.ErrorIfExists =>
            throw new AnalysisException(
              s"Table or view '${options.table}' already exists. " +
              s"SaveMode: ErrorIfExists.")

          case SaveMode.Ignore =>
            // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected
            // to not save the contents of the DataFrame and to not change the existing data.
            // Therefore, it is okay to do nothing here and then just return the relation below.
        }
      } else {
        createTable(conn, df, options)
        saveTable(df, Some(df.schema), isCaseSensitive, options)
      }
    } finally {
      conn.close()
    }

    createRelation(sqlContext, parameters)
  }
}
Example 28
Source File: SaveIntoDataSourceCommand.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.sources.CreatableRelationProvider

case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }

  override def simpleString: String = {
    val redacted = SQLConf.get.redactOptions(options)
    s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
  }
}
Example 29
Source File: SaveIntoDataSourceCommandSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources

import org.apache.spark.SparkConf
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.test.SharedSQLContext

class SaveIntoDataSourceCommandSuite extends SharedSQLContext {

  test("simpleString is redacted") {
    val URL = "connection.url"
    val PASS = "mypassword"
    val DRIVER = "mydriver"

    val dataSource = DataSource(
      sparkSession = spark,
      className = "jdbc",
      partitionColumns = Nil,
      options = Map("password" -> PASS, "url" -> URL, "driver" -> DRIVER))

    val logicalPlanString = dataSource
      .planForWriting(SaveMode.ErrorIfExists, spark.range(1).logicalPlan)
      .treeString(true)

    assert(!logicalPlanString.contains(URL))
    assert(!logicalPlanString.contains(PASS))
    assert(logicalPlanString.contains(DRIVER))
  }
}
Example 30
Source File: DefaultSource.scala From spark-power-bi with Apache License 2.0 | 5 votes |
package com.granturing.spark.powerbi

import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider}

import scala.concurrent._
import scala.concurrent.ExecutionContext.Implicits._
import scala.concurrent.duration.Duration

class DefaultSource extends CreatableRelationProvider with PowerBISink {

  override def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {

    val conf = ClientConf.fromSparkConf(sqlContext.sparkContext.getConf)
    implicit val client = new Client(conf)

    val dataset = parameters.getOrElse("dataset", sys.error("'dataset' must be specified"))
    val table = parameters.getOrElse("table", sys.error("'table' must be specified"))
    val batchSize = parameters.getOrElse("batchSize", conf.batchSize.toString).toInt
    val group = parameters.get("group")

    val step = for {
      groupId <- getGroupId(group)
      ds <- getOrCreateDataset(mode, groupId, dataset, table, data.schema)
    } yield (groupId, ds)

    val result = step map { case (groupId, ds) =>
      val fields = data.schema.fieldNames.zipWithIndex
      val _conf = conf
      val _token = Some(client.currentToken)
      val _table = table
      val _batchSize = batchSize

      val coalesced = data.rdd.partitions.size > _conf.maxPartitions match {
        case true => data.coalesce(_conf.maxPartitions)
        case false => data
      }

      coalesced foreachPartition { p =>
        val rows = p map { r =>
          fields map { case(name, index) => (name -> r(index)) } toMap
        } toSeq

        val _client = new Client(_conf, _token)

        val submit = rows.
          sliding(_batchSize, _batchSize).
          foldLeft(future()) { (fAccum, batch) =>
            fAccum flatMap { _ => _client.addRows(ds.id, _table, batch, groupId) }
          }

        submit.onComplete { _ => _client.shutdown() }

        Await.result(submit, _conf.timeout)
      }
    }

    result.onComplete { _ => client.shutdown() }

    Await.result(result, Duration.Inf)

    new BaseRelation {
      val sqlContext = data.sqlContext
      val schema = data.schema
    }
  }
}
Example 31
Source File: WriteRead.scala From SparkBuildExamples with Apache License 2.0 | 5 votes |
package com.datastax.spark.example import com.datastax.spark.connector._ import com.datastax.spark.connector.cql.CassandraConnector import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.cassandra._ // For DSE it is not necessary to set connection parameters for spark.master (since it will be done // automatically) object WriteRead extends App { val spark = SparkSession.builder .appName("Datastax Scala example") .enableHiveSupport() .getOrCreate() import spark.implicits._ // Create keyspace and table CassandraConnector(spark.sparkContext).withSessionDo { session => session.execute( """CREATE KEYSPACE IF NOT EXISTS ks WITH | replication = {'class': 'SimpleStrategy', 'replication_factor': 1 }""".stripMargin) session.execute("""CREATE TABLE IF NOT EXISTS ks.kv (k int, v int, PRIMARY KEY (k))""") } // Write some data spark.range(1, 10) .map(x => (x, x)) .rdd .saveToCassandra("ks", "kv") // Read data as RDD val rdd = spark.sparkContext .cassandraTable(keyspace = "ks", table = "kv") // Read data as DataSet (DataFrame) val dataset = spark.read .cassandraFormat(keyspace = "ks", table = "kv") .load() println("Data read as RDD") rdd.collect() .foreach(println) println("Data read as DataSet (DataFrame)") dataset.collect() .foreach(println) spark.stop() sys.exit(0) }
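The example above writes through the RDD API (saveToCassandra), which does not take a SaveMode. A sketch of the equivalent DataFrame write, where the mode does apply, assuming the connector's DataFrameWriter helpers from org.apache.spark.sql.cassandra._ (already imported above) are on the classpath:

import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.sql.cassandra._

object CassandraDataFrameWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("CassandraSaveModeSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    import spark.implicits._
    // Same keyspace/table as the example above (ks.kv with int columns k and v).
    val df = (1 to 9).map(i => (i, i)).toDF("k", "v")
    df.write
      .cassandraFormat(table = "kv", keyspace = "ks")
      .mode(SaveMode.Append) // append rows; with this connector Overwrite typically needs confirm.truncate
      .save()
    spark.stop()
  }
}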
Example 37
Source File: T8-5-L8-30-34DataFrameExamplesActions.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions.desc import org.apache.spark.sql.hive.HiveContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apress.prospark.CdrDataframeExamplesActionsApp.Cdr import org.json4s.DefaultFormats object CdrDataframeExamplesActionsApp { case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int, smsInActivity: Float, smsOutActivity: Float, callInActivity: Float, callOutActivity: Float, internetTrafficActivity: Float) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CdrDataframeExamplesActionsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val cl = Thread.currentThread().getContextClassLoader() val hiveC = new HiveContext(ssc.sparkContext) Thread.currentThread().setContextClassLoader(cl) import hiveC.implicits._ implicit val formats = DefaultFormats val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = seqToCdr(rdd).toDF() val counts = cdrs.groupBy("countryCode").count().orderBy(desc("count")) counts.show(5) counts.show() println("head(5): " + counts.head(5)) println("take(5): " + counts.take(5)) println("head(): " + counts.head()) println("first(): " + counts.first()) println("count(): " + counts.count()) println("collect(): " + counts.collect()) println("collectAsList(): " + counts.collectAsList()) println("describe(): " + cdrs.describe("smsInActivity", "smsOutActivity", "callInActivity", "callOutActivity", "internetTrafficActivity").show()) counts.write.format("parquet").save("/tmp/parquet" + rdd.id) counts.write.format("json").save("/tmp/json" + rdd.id) counts.write.parquet("/tmp/parquet2" + rdd.id) counts.write.json("/tmp/json2" + rdd.id) counts.write.saveAsTable("count_table") cdrs.groupBy("countryCode").count().orderBy(desc("count")).write.mode(SaveMode.Append).save("/tmp/counts") val prop: java.util.Properties = new java.util.Properties() counts.write.jdbc("jdbc:mysql://hostname:port/cdrsdb", "count_table", prop) }) ssc.start() ssc.awaitTermination() } def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = { rdd.map(c => c.map(f => f match { case x if x.isEmpty() => "0" case x => x })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat, c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat)) } }
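The streaming example above passes SaveMode.Append as an enum value; DataFrameWriter.mode also accepts the equivalent string names, which is convenient when the mode comes from configuration. A small standalone sketch (the output paths are illustrative):

import org.apache.spark.sql.{SaveMode, SparkSession}

object SaveModeByNameSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("SaveModeByNameSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    val counts = spark.range(100).toDF("n")
    // "append", "overwrite", "ignore" and "error"/"errorifexists" map to the
    // corresponding SaveMode values, so these two writes behave identically.
    counts.write.mode(SaveMode.Append).parquet("/tmp/counts_enum") // illustrative path
    counts.write.mode("append").parquet("/tmp/counts_string")      // illustrative path
    spark.stop()
  }
}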
Example 38
Source File: DefaultSource.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import com.crealytics.spark.excel.Utils._ import org.apache.hadoop.fs.Path import org.apache.poi.ss.util.{CellRangeAddress, CellReference} import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} import scala.util.Try class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType ): ExcelRelation = { val wbReader = WorkbookReader(parameters, sqlContext.sparkContext.hadoopConfiguration) val dataLocator = DataLocator(parameters) ExcelRelation( header = checkParameter(parameters, "header").toBoolean, treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean), userSchema = Option(schema), inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean), addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean), timestampFormat = parameters.get("timestampFormat"), excerptSize = parameters.get("excerptSize").fold(10)(_.toInt), dataLocator = dataLocator, workbookReader = wbReader )(sqlContext) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame ): BaseRelation = { val path = checkParameter(parameters, "path") val header = checkParameter(parameters, "header").toBoolean val filesystemPath = new Path(path) val fs = filesystemPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration) new ExcelFileSaver( fs, filesystemPath, data, saveMode = mode, header = header, dataLocator = DataLocator(parameters) ).save() createRelation(sqlContext, parameters, data.schema) } // Forces a Parameter to exist, otherwise an exception is thrown. private def checkParameter(map: Map[String, String], param: String): String = { if (!map.contains(param)) { throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") } else { map.apply(param) } } }
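A usage sketch for the relation provider above; per checkParameter, both 'path' and 'header' must be supplied on write (the file location is an illustrative assumption):

import org.apache.spark.sql.{SaveMode, SparkSession}

object ExcelWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("ExcelSaveModeSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    import spark.implicits._
    val df = Seq(("a", 1), ("b", 2)).toDF("name", "value")
    // The SaveMode given here is forwarded to ExcelFileSaver as saveMode = mode.
    df.write
      .format("com.crealytics.spark.excel")
      .option("header", "true")
      .mode(SaveMode.Overwrite)
      .save("/tmp/example.xlsx") // illustrative path; save() supplies the 'path' parameter
    spark.stop()
  }
}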
Example 39
Source File: ExcelFileSaver.scala From spark-excel with Apache License 2.0 | 5 votes |
package com.crealytics.spark.excel import java.io.BufferedOutputStream import com.crealytics.spark.excel.ExcelFileSaver.{DEFAULT_DATE_FORMAT, DEFAULT_SHEET_NAME, DEFAULT_TIMESTAMP_FORMAT} import com.norbitltd.spoiwo.model._ import com.norbitltd.spoiwo.natures.xlsx.Model2XlsxConversions._ import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} import org.apache.poi.ss.util.CellRangeAddress import org.apache.poi.xssf.usermodel.XSSFWorkbook import org.apache.spark.sql.{DataFrame, SaveMode} import scala.collection.JavaConverters._ object ExcelFileSaver { final val DEFAULT_SHEET_NAME = "Sheet1" final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm" final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000" } class ExcelFileSaver( fs: FileSystem, location: Path, dataFrame: DataFrame, saveMode: SaveMode, dataLocator: DataLocator, header: Boolean = true ) { def save(): Unit = { def sheet(workbook: XSSFWorkbook) = { val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None val dataRows = dataFrame .toLocalIterator() .asScala .map(_.toSeq) dataLocator.toSheet(headerRow, dataRows, workbook) } val fileAlreadyExists = fs.exists(location) def writeToWorkbook(workbook: XSSFWorkbook): Unit = { Workbook(sheet(workbook)).writeToExisting(workbook) autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write) } (fileAlreadyExists, saveMode) match { case (false, _) | (_, SaveMode.Overwrite) => if (fileAlreadyExists) { fs.delete(location, true) } writeToWorkbook(new XSSFWorkbook()) case (true, SaveMode.ErrorIfExists) => sys.error(s"path $location already exists.") case (true, SaveMode.Ignore) => () case (true, SaveMode.Append) => val inputStream: FSDataInputStream = fs.open(location) val workbook = new XSSFWorkbook(inputStream) inputStream.close() writeToWorkbook(workbook) } } def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = { try { fun(closeable) } finally { closeable.close() } } }
Example 40
Source File: Output.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.sdk.pipeline.output import java.io.{Serializable => JSerializable} import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._ import com.stratio.sparta.sdk.properties.{CustomProperties, Parameterizable} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, DataFrameWriter, Row, SaveMode} abstract class Output(val name: String, properties: Map[String, JSerializable]) extends Parameterizable(properties) with SLF4JLogging with CustomProperties { val customKey = "saveOptions" val customPropertyKey = "saveOptionsKey" val customPropertyValue = "saveOptionsValue" val propertiesWithCustom = properties ++ getCustomProperties def setUp(options: Map[String, String] = Map.empty[String, String]): Unit = {} def cleanUp(options: Map[String, String] = Map.empty[String, String]): Unit = {} def save(dataFrame: DataFrame, saveMode: SaveModeEnum.Value, options: Map[String, String]): Unit def supportedSaveModes: Seq[SaveModeEnum.Value] = SaveModeEnum.allSaveModes def validateSaveMode(saveMode: SaveModeEnum.Value): Unit = { if (!supportedSaveModes.contains(saveMode)) log.info(s"Save mode $saveMode is not supported by the output $name." + s" Using the default mode ${SaveModeEnum.Append}" ) } } object Output extends SLF4JLogging { final val ClassSuffix = "Output" final val SparkConfigurationMethod = "getSparkConfiguration" final val Separator = "_" final val FieldsSeparator = "," final val PrimaryKey = "primaryKey" final val TableNameKey = "tableName" final val PartitionByKey = "partitionBy" final val TimeDimensionKey = "timeDimension" final val MeasureMetadataKey = "measure" final val PrimaryKeyMetadataKey = "pk" def getSparkSaveMode(saveModeEnum: SaveModeEnum.Value): SaveMode = saveModeEnum match { case SaveModeEnum.Append => SaveMode.Append case SaveModeEnum.ErrorIfExists => SaveMode.ErrorIfExists case SaveModeEnum.Overwrite => SaveMode.Overwrite case SaveModeEnum.Ignore => SaveMode.Ignore case SaveModeEnum.Upsert => SaveMode.Append case _ => log.warn(s"Save Mode $saveModeEnum not supported, using default save mode ${SaveModeEnum.Append}") SaveMode.Append } def getTimeFromOptions(options: Map[String, String]): Option[String] = options.get(TimeDimensionKey).notBlank def getPrimaryKeyOptions(options: Map[String, String]): Option[String] = options.get(PrimaryKey).notBlank def getTableNameFromOptions(options: Map[String, String]): String = options.getOrElse(TableNameKey, { log.error("Table name not defined") throw new NoSuchElementException("tableName not found in options") }) def applyPartitionBy(options: Map[String, String], dataFrame: DataFrameWriter[Row], schemaFields: Array[StructField]): DataFrameWriter[Row] = { options.get(PartitionByKey).notBlank.fold(dataFrame)(partitions => { val fieldsInDataFrame = schemaFields.map(field => field.name) val partitionFields = partitions.split(",") if (partitionFields.forall(field => fieldsInDataFrame.contains(field))) dataFrame.partitionBy(partitionFields: _*) else { log.warn(s"Impossible to execute partition by fields: $partitionFields because the dataFrame does not contain all" + s" fields. The dataFrame only contains: ${fieldsInDataFrame.mkString(",")}") dataFrame } }) } def defaultTimeStampField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField = StructField(fieldName, TimestampType, nullable, metadata) def defaultDateField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField = StructField(fieldName, DateType, nullable, metadata) def defaultStringField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField = StructField(fieldName, StringType, nullable, metadata) def defaultGeoField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField = StructField(fieldName, ArrayType(DoubleType), nullable, metadata) def defaultLongField(fieldName: String, nullable: Boolean, metadata: Metadata = Metadata.empty): StructField = StructField(fieldName, LongType, nullable, metadata) }
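A short sketch of how the getSparkSaveMode helper above behaves; note that SaveModeEnum.Upsert intentionally falls back to SaveMode.Append because Spark has no upsert mode. The import of SaveModeEnum assumes it lives in the same package as Output, as its unqualified use above suggests:

import org.apache.spark.sql.SaveMode
import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum}

object SaveModeMappingSketch extends App {
  // One-to-one mappings.
  assert(Output.getSparkSaveMode(SaveModeEnum.Overwrite) == SaveMode.Overwrite)
  assert(Output.getSparkSaveMode(SaveModeEnum.Ignore) == SaveMode.Ignore)
  // Upsert has no Spark counterpart, so it degrades to Append.
  assert(Output.getSparkSaveMode(SaveModeEnum.Upsert) == SaveMode.Append)
}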
Example 41
Source File: CreateDataSourceTableAsSelectHarvesterSuite.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.sql import com.hortonworks.spark.atlas.types.metadata import scala.util.Random import com.hortonworks.spark.atlas.{SACAtlasEntityWithDependencies, WithHiveSupport} import com.hortonworks.spark.atlas.utils.SparkUtils import org.apache.atlas.model.instance.AtlasEntity import org.apache.spark.sql.SaveMode import org.apache.spark.sql.catalyst.catalog.{CatalogTable, CatalogTableType} import org.apache.spark.sql.execution.command.CreateDataSourceTableAsSelectCommand import org.apache.spark.sql.execution.datasources.DataSource import org.apache.spark.sql.types.StructType import org.scalatest.{FunSuite, Matchers} // This is not leveraging BaseHarvesterSuite, as it doesn't need to be tested with // both non-remote HMS and remote HMS cases. class CreateDataSourceTableAsSelectHarvesterSuite extends FunSuite with Matchers with WithHiveSupport { private val sourceTblName = "source_" + Random.nextInt(100000) override protected def beforeAll(): Unit = { super.beforeAll() sparkSession.sql(s"CREATE TABLE $sourceTblName (name string, age int)") } test("saveAsTable should have output entity having table details - parquet") { testWithProvider("parquet") } test("saveAsTable should have output entity having table details - hive") { val entity = testWithProvider("hive") assert(entity.getAttribute("partitionProvider") == "Catalog") } def testWithProvider(provider: String): AtlasEntity = { val destTblName = "dest1_" + Random.nextInt(100000) val df = sparkSession.sql(s"SELECT * FROM $sourceTblName") // The codes below look after DataFrameWriter.saveAsTable codes as of Spark 2.4. // It uses internal APIs for this test. If the compatibility is broken, we should better // just remove this test. val tableIdent = df.sparkSession.sessionState.sqlParser.parseTableIdentifier(destTblName) val storage = DataSource.buildStorageFormatFromOptions(Map("path" -> "/tmp/foo")) val tableDesc = CatalogTable( identifier = tableIdent, tableType = CatalogTableType.EXTERNAL, storage = storage, schema = new StructType, provider = Some(provider), partitionColumnNames = Nil, bucketSpec = None) val cmd = CreateDataSourceTableAsSelectCommand( tableDesc, SaveMode.ErrorIfExists, df.queryExecution.logical, Seq("name", "age")) val newTable = tableDesc.copy( storage = tableDesc.storage.copy(), schema = df.schema) sparkSession.sessionState.catalog.createTable( newTable, ignoreIfExists = false, validateLocation = false) val qd = QueryDetail(df.queryExecution, 0L) val entities = CommandsHarvester.CreateDataSourceTableAsSelectHarvester.harvest(cmd, qd) val processDeps = entities.head.asInstanceOf[SACAtlasEntityWithDependencies].dependencies val maybeEntity = processDeps.find(_.typeName == metadata.TABLE_TYPE_STRING) .map(_.asInstanceOf[SACAtlasEntityWithDependencies].entity) assert(maybeEntity.isDefined, s"Output entity for table [$destTblName] was not found.") assert(maybeEntity.get.getAttribute("name") == destTblName) assert(maybeEntity.get.getAttribute("owner") == SparkUtils.currUser()) assert(maybeEntity.get.getAttribute("schemaDesc") == "struct<name:string,age:int>") assert(maybeEntity.get.getAttribute("provider") == provider) maybeEntity.get } }
Example 42
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
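A condensed, standalone version of the round trip exercised by the "write libsvm data and read it again" test above (paths are illustrative):

import org.apache.spark.sql.{SaveMode, SparkSession}

object LibSVMRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("LibSVMSaveModeSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    val inputPath  = "/tmp/sample_libsvm_data.txt" // assumed existing LIBSVM file
    val outputPath = "/tmp/libsvm_roundtrip"       // illustrative output directory
    val df = spark.read.format("libsvm").load(inputPath)
    // coalesce(1) mirrors the test above (the source writes a single file);
    // Overwrite clears the output of any previous run.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(outputPath)
    spark.read.format("libsvm").load(outputPath).show(3)
    spark.stop()
  }
}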
Example 43
Source File: JdbcRelationProvider.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
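The mode match above is the behaviour reached from an ordinary JDBC write. A minimal sketch of a call site that hits the SaveMode.Overwrite branch with truncation enabled; the connection details are illustrative assumptions:

import org.apache.spark.sql.{SaveMode, SparkSession}

object JdbcOverwriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("JdbcSaveModeSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    val df = spark.range(100).toDF("id")
    // With SaveMode.Overwrite the provider either truncates the table (when the
    // 'truncate' option is set and the dialect allows it) or drops and recreates
    // it before calling saveTable.
    df.write
      .format("jdbc")
      .option("url", "jdbc:postgresql://localhost/testdb") // hypothetical URL
      .option("dbtable", "ids")                            // hypothetical table
      .option("truncate", "true")
      .mode(SaveMode.Overwrite)
      .save()
    spark.stop()
  }
}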
Example 44
Source File: DoubleDataTypeTestCase.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.integration.spark.testsuite.primitiveTypes import java.util.Random import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.scalatest.BeforeAndAfterAll class DoubleDataTypeTestCase extends QueryTest with BeforeAndAfterAll { lazy val df: DataFrame = generateDataFrame private def generateDataFrame(): DataFrame = { val r = new Random() val rdd = sqlContext.sparkContext .parallelize(1 to 10, 2) .map { x => Row(x, "London" + (x % 2), x.toDouble / 13, x.toDouble / 11) } val schema = StructType( Seq( StructField("id", IntegerType, nullable = false), StructField("city", StringType, nullable = false), StructField("m1", DoubleType, nullable = false), StructField("m2", DoubleType, nullable = false) ) ) sqlContext.createDataFrame(rdd, schema) } override def beforeAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") df.write .format("carbondata") .option("tableName", "doubleTypeCarbonTable") .option("tempCSV", "false") .option("table_blocksize", "32") .mode(SaveMode.Overwrite) .save() df.write .mode(SaveMode.Overwrite) .saveAsTable("doubleTypeHiveTable") } test("detail query") { checkAnswer(sql("select * from doubleTypeCarbonTable order by id"), sql("select * from doubleTypeHiveTable order by id")) } test("duplicate values") { sql("create table uniq_carbon(name string, double_column double) STORED AS carbondata ") sql(s"load data inpath '$resourcesPath/uniq.csv' into table uniq_carbon") sql("create table uniq_hive(name string, double_column double) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','") sql(s"load data local inpath '$resourcesPath/uniqwithoutheader.csv' into table uniq_hive") checkAnswer(sql("select * from uniq_carbon where double_column>=11"), sql("select * from uniq_hive where double_column>=11")) } // test("agg query") { // checkAnswer(sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m1), avg(m1), count(m1), max(m1), min(m1) from doubleTypeHiveTable group by city")) // // checkAnswer(sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeCarbonTable group by city"), // sql("select city, sum(m2), avg(m2), count(m2), max(m2), min(m2) from doubleTypeHiveTable group by city")) // } override def afterAll { sql("drop table if exists uniq_carbon") sql("drop table if exists uniq_hive") sql("drop table if exists doubleTypeCarbonTable") sql("drop table if exists doubleTypeHiveTable") } }
Example 45
Source File: UpdateCarbonTableTestCaseWithBadRecord.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.iud import org.apache.spark.sql.{Row, SaveMode} import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.common.constants.LoggerAction import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties import org.apache.spark.sql.test.util.QueryTest class UpdateCarbonTableTestCaseWithBadRecord extends QueryTest with BeforeAndAfterAll { override def beforeAll { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FORCE.name()) } test("test update operation with Badrecords action as force.") { sql("""drop table if exists badtable""").show sql("""create table badtable (c1 string,c2 int,c3 string,c5 string) STORED AS carbondata""") sql(s"""LOAD DATA LOCAL INPATH '$resourcesPath/IUD/badrecord.csv' INTO table badtable""") sql("""update badtable d set (d.c2) = (d.c2 / 1)""").show() checkAnswer( sql("""select c1,c2,c3,c5 from badtable"""), Seq(Row("ravi",null,"kiran","huawei"),Row("manohar",null,"vanam","huawei")) ) sql("""drop table badtable""").show } test("test update operation with Badrecords action as FAIL.") { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FAIL.name()) sql("""drop table if exists badtable""").show sql("""create table badtable (c1 string,c2 int,c3 string,c5 string) STORED AS carbondata""") sql(s"""LOAD DATA LOCAL INPATH '$resourcesPath/IUD/badrecord.csv' INTO table badtable""") val exec = intercept[Exception] { sql("""update badtable d set (d.c2) = (d.c2 / 1)""").show() } checkAnswer( sql("""select c1,c2,c3,c5 from badtable"""), Seq(Row("ravi",2,"kiran","huawei"),Row("manohar",4,"vanam","huawei")) ) sql("""drop table badtable""").show } override def afterAll { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION , LoggerAction.FORCE.name()) } }
Example 46
Source File: TestUpdateAndDeleteWithLargeData.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.spark.testsuite.iud import java.text.SimpleDateFormat import org.apache.spark.sql.test.util.QueryTest import org.apache.spark.sql.{DataFrame, Row, SaveMode} import org.scalatest.BeforeAndAfterAll import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.util.CarbonProperties class TestUpdateAndDeleteWithLargeData extends QueryTest with BeforeAndAfterAll { var df: DataFrame = _ override def beforeAll { dropTable() buildTestData() } private def buildTestData(): Unit = { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy-MM-dd") // Simulate data and write to table orders import sqlContext.implicits._ val sdf = new SimpleDateFormat("yyyy-MM-dd") df = sqlContext.sparkSession.sparkContext.parallelize(1 to 1500000) .map(value => (value, new java.sql.Date(sdf.parse("2015-07-" + (value % 10 + 10)).getTime), "china", "aaa" + value, "phone" + 555 * value, "ASD" + (60000 + value), 14999 + value, "ordersTable" + value)) .toDF("o_id", "o_date", "o_country", "o_name", "o_phonetype", "o_serialname", "o_salary", "o_comment") createTable() } private def createTable(): Unit = { df.write .format("carbondata") .option("tableName", "orders") .option("tempCSV", "true") .option("compress", "true") .mode(SaveMode.Overwrite) .save() } private def dropTable() = { sql("DROP TABLE IF EXISTS orders") } test("test the update and delete functionality for large data") { sql( """ update ORDERS set (o_comment) = ('yyy')""").show() checkAnswer(sql( """select o_comment from orders limit 2 """), Seq(Row("yyy"), Row("yyy"))) sql("delete from orders where exists (select 1 from orders)") checkAnswer(sql( """ SELECT count(*) FROM orders """), Row(0)) } }
Example 47
Source File: CaseClassDataFrameAPIExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.apache.carbondata.examples.util.ExampleUtils case class People(name: String, occupation: String, id: Int) object CaseClassDataFrameAPIExample { def main(args: Array[String]) { val spark = ExampleUtils.createSparkSession("CaseClassDataFrameAPIExample") exampleBody(spark) spark.close() } def exampleBody(spark : SparkSession): Unit = { val people = List(People("sangeeta", "engineer", 1), People("pallavi", "consultant", 2)) val peopleRDD: RDD[People] = spark.sparkContext.parallelize(people) import spark.implicits._ val peopleDF: DataFrame = peopleRDD.toDF("name", "occupation", "id") // writing data to carbon table peopleDF.write .format("carbondata") .option("tableName", "caseclass_table") .option("compress", "true") .mode(SaveMode.Overwrite) .save() spark.sql("SELECT * FROM caseclass_table").show() spark.sql("DROP TABLE IF EXISTS caseclass_table") } }
Example 48
Source File: ExternalTableExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import java.io.File import org.apache.spark.sql.{CarbonEnv, SaveMode, SparkSession} import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.core.metadata.CarbonTableIdentifier import org.apache.carbondata.core.util.CarbonProperties import org.apache.carbondata.examples.util.ExampleUtils object ExternalTableExample { def main(args: Array[String]) { val spark = ExampleUtils.createSparkSession("ExternalTableExample") exampleBody(spark) spark.close() } def exampleBody(spark : SparkSession): Unit = { CarbonProperties.getInstance() .addProperty(CarbonCommonConstants.CARBON_DATE_FORMAT, "yyyy/MM/dd") // Create origin_table spark.sql("DROP TABLE IF EXISTS origin_table") spark.sql( s""" | CREATE TABLE origin_table( | shortField SHORT, | intField INT, | bigintField LONG, | doubleField DOUBLE, | stringField STRING, | timestampField TIMESTAMP, | decimalField DECIMAL(18,2), | dateField DATE, | charField CHAR(5), | floatField FLOAT | ) | STORED AS carbondata """.stripMargin) val rootPath = new File(this.getClass.getResource("/").getPath + "../../../..").getCanonicalPath val path = s"$rootPath/examples/spark/src/main/resources/data.csv" // load 4 times, each load has 10 rows data // scalastyle:off (1 to 4).foreach(_ => spark.sql( s""" | LOAD DATA LOCAL INPATH '$path' | INTO TABLE origin_table | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') """.stripMargin)) // scalastyle:on // 40 rows spark.sql("SELECT count(*) FROM origin_table").show() val origin_table_path = CarbonEnv.getTablePath(Some("default"), "origin_table")(spark) // Create external_table spark.sql("DROP TABLE IF EXISTS external_table") spark.sql("CREATE EXTERNAL TABLE external_table STORED AS carbondata" + s" LOCATION '$origin_table_path'") spark.sql("SELECT count(*) FROM external_table").show() // Load 2 times again (1 to 2).foreach(_ => spark.sql( s""" | LOAD DATA LOCAL INPATH '$path' | INTO TABLE origin_table | OPTIONS('HEADER'='true', 'COMPLEX_DELIMITER_LEVEL_1'='#') """.stripMargin)) spark.sql("SELECT count(*) FROM external_table").show() // Drop tables spark.sql("DROP TABLE IF EXISTS origin_table") spark.sql("DROP TABLE IF EXISTS external_table") } }
Example 49
Source File: HadoopFileExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import java.io.File import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.carbondata.examples.util.ExampleUtils import org.apache.carbondata.hadoop.api.{CarbonInputFormat, CarbonTableInputFormat} import org.apache.carbondata.hadoop.CarbonProjection // scalastyle:off println object HadoopFileExample { def main(args: Array[String]): Unit = { val spark = ExampleUtils.createSparkSession("HadoopFileExample") val rootPath = new File(this.getClass.getResource("/").getPath + "../../../..").getCanonicalPath val storeLocation: String = rootPath + "/examples/spark/target/store/default" exampleBody(spark, storeLocation) spark.close() } def exampleBody(spark : SparkSession, storeLocation : String): Unit = { import spark.implicits._ val df = spark.sparkContext.parallelize(1 to 1000) .map(x => ("a", "b", x)) .toDF("c1", "c2", "c3") df.write.format("carbondata") .option("tableName", "Hadoopfile_table") .option("compress", "true") .mode(SaveMode.Overwrite).save() // read two columns val projection = new CarbonProjection projection.addColumn("c1") // column c1 projection.addColumn("c3") // column c3 val conf = new Configuration() CarbonInputFormat.setColumnProjection(conf, projection) CarbonInputFormat.setDatabaseName(conf, "default") CarbonInputFormat.setTableName(conf, "Hadoopfile_table") val input = spark.sparkContext.newAPIHadoopFile(s"${storeLocation}/Hadoopfile_table", classOf[CarbonTableInputFormat[Array[Object]]], classOf[Void], classOf[Array[Object]], conf) val result = input.map(x => x._2.toList).collect result.foreach(x => println(x.mkString(", "))) // delete carbondata file ExampleUtils.cleanSampleCarbonFile(spark, "Hadoopfile_table") } } // scalastyle:on println
Example 50
Source File: LuceneIndexExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.carbondata.examples.util.ExampleUtils object LuceneIndexExample { def main(args: Array[String]) { val spark = ExampleUtils.createSparkSession("LuceneIndexExample") exampleBody(spark) spark.close() } def exampleBody(spark : SparkSession): Unit = { // build the test data, please increase the data for more obvious comparison. // if set the data is larger than 100M, it will take 10+ mins. import scala.util.Random import spark.implicits._ val r = new Random() val df = spark.sparkContext.parallelize(1 to 10 * 10 * 1000) .map(x => ("which test" + r.nextInt(10000) + " good" + r.nextInt(10), "who and name" + x % 8, "city" + x % 50, x % 60)) .toDF("id", "name", "city", "age") spark.sql("DROP TABLE IF EXISTS personTable") df.write.format("carbondata") .option("tableName", "personTable") .option("compress", "true") .mode(SaveMode.Overwrite).save() // create lucene index on personTable spark.sql( s""" | CREATE INDEX IF NOT EXISTS dm ON TABLE personTable (id, name) | AS 'lucene' """.stripMargin) // 1. Compare the performance: def time(code: => Unit): Double = { val start = System.currentTimeMillis() code // return time in second (System.currentTimeMillis() - start).toDouble / 1000 } val timeWithoutLuceneIndex = time { spark.sql( s""" | SELECT count(*) | FROM personTable where id like '% test1 %' """.stripMargin).show() } val timeWithLuceneIndex = time { spark.sql( s""" | SELECT count(*) | FROM personTable where TEXT_MATCH('id:test1') """.stripMargin).show() } // scalastyle:off println("time for query on table with lucene index table:" + timeWithLuceneIndex.toString) println("time for query on table without lucene index table:" + timeWithoutLuceneIndex.toString) // scalastyle:on // 2. Search for word "test1" and not "good" in the id field spark.sql( s""" | SELECT id,name | FROM personTable where TEXT_MATCH('id:test1 -id:good1') """.stripMargin).show(100) // 3. TEXT_MATCH_WITH_LIMIT usage: spark.sql( s""" | SELECT id,name | FROM personTable where TEXT_MATCH_WITH_LIMIT('id:test1',10) """.stripMargin).show() spark.sql("DROP TABLE IF EXISTS personTable") } }
Example 51
Source File: CarbonDataFrameExample.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.examples import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.carbondata.examples.util.ExampleUtils object CarbonDataFrameExample { def main(args: Array[String]) { val spark = ExampleUtils.createSparkSession("CarbonDataFrameExample") exampleBody(spark) spark.close() } def exampleBody(spark : SparkSession): Unit = { // Writes Dataframe to CarbonData file: import spark.implicits._ val df = spark.sparkContext.parallelize(1 to 100) .map(x => ("a" + x % 10, "b", x)) .toDF("c1", "c2", "number") // Saves dataframe to carbondata file df.write .format("carbondata") .option("tableName", "carbon_df_table") .option("partitionColumns", "c1") // a list of column names .mode(SaveMode.Overwrite) .save() spark.sql(""" SELECT * FROM carbon_df_table """).show() spark.sql("SHOW PARTITIONS carbon_df_table").show() // Specify schema import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType} val customSchema = StructType(Array( StructField("c1", StringType), StructField("c2", StringType), StructField("number", IntegerType))) // Reads carbondata to dataframe val carbondf = spark.read .format("carbondata") .schema(customSchema) // .option("dbname", "db_name") the system will use "default" as dbname if not set this option .option("tableName", "carbon_df_table") .load() // Dataframe operations carbondf.printSchema() carbondf.select($"c1", $"number" + 10).show() carbondf.filter($"number" > 31).show() spark.sql("DROP TABLE IF EXISTS carbon_df_table") } }
Example 52
Source File: SortMergeJoin_7_6.scala From LearningSparkV2 with Apache License 2.0 | 5 votes |
package main.scala.chapter7 import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.SaveMode import scala.util.Random object SortMergeJoin_7_6 { // curried function to benchmark any code or function def benchmark(name: String)(f: => Unit) { val startTime = System.nanoTime f val endTime = System.nanoTime println(s"Time taken in $name: " + (endTime - startTime).toDouble / 1000000000 + " seconds") } // main class setting the configs def main (args: Array[String] ) { val spark = SparkSession.builder .appName("SortMergeJoin") .config("spark.sql.codegen.wholeStage", true) .config("spark.sql.join.preferSortMergeJoin", true) .config("spark.sql.autoBroadcastJoinThreshold", -1) .config("spark.sql.defaultSizeInBytes", 100000) .config("spark.sql.shuffle.partitions", 16) .getOrCreate () import spark.implicits._ var states = scala.collection.mutable.Map[Int, String]() var items = scala.collection.mutable.Map[Int, String]() val rnd = new scala.util.Random(42) // initialize states and items purchased states += (0 -> "AZ", 1 -> "CO", 2-> "CA", 3-> "TX", 4 -> "NY", 5-> "MI") items += (0 -> "SKU-0", 1 -> "SKU-1", 2-> "SKU-2", 3-> "SKU-3", 4 -> "SKU-4", 5-> "SKU-5") // create dataframes val usersDF = (0 to 100000).map(id => (id, s"user_${id}", s"user_${id}@databricks.com", states(rnd.nextInt(5)))) .toDF("uid", "login", "email", "user_state") val ordersDF = (0 to 100000).map(r => (r, r, rnd.nextInt(100000), 10 * r* 0.2d, states(rnd.nextInt(5)), items(rnd.nextInt(5)))) .toDF("transaction_id", "quantity", "users_id", "amount", "state", "items") usersDF.show(10) ordersDF.show(10) // do a Join val usersOrdersDF = ordersDF.join(usersDF, $"users_id" === $"uid") usersOrdersDF.show(10, false) usersOrdersDF.cache() usersOrdersDF.explain() // usersOrdersDF.explain("formatted") // uncomment to view the Spark UI, otherwise the program terminates and shuts down the UI // Thread.sleep(200000000) } }
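The benchmark above imports SaveMode but never uses it; a small self-contained sketch of persisting a joined result with an explicit mode so repeated runs stay idempotent (the output path is an assumption):

import org.apache.spark.sql.{SaveMode, SparkSession}

object PersistJoinSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("PersistJoinSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .getOrCreate()
    import spark.implicits._
    val users  = Seq((1, "user_1"), (2, "user_2")).toDF("uid", "login")
    val orders = Seq((10, 1, 99.5), (11, 2, 10.0)).toDF("transaction_id", "users_id", "amount")
    val joined = orders.join(users, $"users_id" === $"uid")
    // Overwrite replaces the previous output instead of failing on rerun.
    joined.write.mode(SaveMode.Overwrite).parquet("/tmp/users_orders") // illustrative path
    spark.stop()
  }
}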
Example 53
Source File: DefaultSource.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.metrics.source.MetricsHandler import org.apache.spark.sql.sources.{ BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider } import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} object DefaultSource { val MEMSQL_SOURCE_NAME = "com.memsql.spark" val MEMSQL_SOURCE_NAME_SHORT = "memsql" val MEMSQL_GLOBAL_OPTION_PREFIX = "spark.datasource.memsql." } class DefaultSource extends RelationProvider with DataSourceRegister with CreatableRelationProvider with LazyLogging { override def shortName: String = DefaultSource.MEMSQL_SOURCE_NAME_SHORT private def includeGlobalParams(sqlContext: SQLContext, params: Map[String, String]): Map[String, String] = sqlContext.getAllConfs.foldLeft(params)({ case (params, (k, v)) if k.startsWith(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) => params + (k.stripPrefix(DefaultSource.MEMSQL_GLOBAL_OPTION_PREFIX) -> v) case (params, _) => params }) override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val params = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters)) val options = MemsqlOptions(params) if (options.disablePushdown) { SQLPushdownRule.ensureRemoved(sqlContext.sparkSession) MemsqlReaderNoPushdown(MemsqlOptions.getQuery(params), options, sqlContext) } else { SQLPushdownRule.ensureInjected(sqlContext.sparkSession) MemsqlReader(MemsqlOptions.getQuery(params), Nil, options, sqlContext) } } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val opts = CaseInsensitiveMap(includeGlobalParams(sqlContext, parameters)) val conf = MemsqlOptions(opts) val table = MemsqlOptions .getTable(opts) .getOrElse( throw new IllegalArgumentException( s"To write a dataframe to MemSQL you must specify a table name via the '${MemsqlOptions.TABLE_NAME}' parameter" ) ) JdbcHelpers.prepareTableForWrite(conf, table, mode, data.schema) val isReferenceTable = JdbcHelpers.isReferenceTable(conf, table) val partitionWriterFactory = if (conf.onDuplicateKeySQL.isEmpty) { new LoadDataWriterFactory(table, conf) } else { new BatchInsertWriterFactory(table, conf) } val schema = data.schema var totalRowCount = 0L data.foreachPartition(partition => { val writer = partitionWriterFactory.createDataWriter(schema, TaskContext.getPartitionId(), 0, isReferenceTable, mode) try { partition.foreach(record => { writer.write(record) totalRowCount += 1 }) writer.commit() MetricsHandler.setRecordsWritten(totalRowCount) } catch { case e: Exception => { writer.abort() throw e } } }) createRelation(sqlContext, parameters) } }
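A usage sketch for the provider above, following the same write pattern the benchmarks later on this page use; the endpoint, database, and table names are illustrative:

import org.apache.spark.sql.{SaveMode, SparkSession}

object MemsqlWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder
      .appName("MemsqlSaveModeSketch")
      .master("local[*]") // local master so the sketch runs standalone
      .config("spark.datasource.memsql.ddlEndpoint", "localhost:5506") // assumed endpoint
      .getOrCreate()
    val df = spark.range(10).toDF("id")
    // The 'database.table' string passed to save() becomes the target table;
    // prepareTableForWrite then sets the table up according to the SaveMode.
    df.write
      .format("memsql")
      .mode(SaveMode.Overwrite)
      .save("testdb.sketch_table") // illustrative database.table
    spark.stop()
  }
}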
Example 54
Source File: BinaryTypeBenchmark.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, DriverManager} import java.util.Properties import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import com.memsql.spark.BatchInsertBenchmark.{df, executeQuery} import org.apache.spark.sql.types.{BinaryType, IntegerType} import org.apache.spark.sql.{SaveMode, SparkSession} import scala.util.Random // BinaryTypeBenchmark is written to writing of the BinaryType with CPU profiler // this feature is accessible in Ultimate version of IntelliJ IDEA // see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details object BinaryTypeBenchmark extends App { final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost") final val masterPort: String = sys.props.getOrElse("memsql.port", "5506") val spark: SparkSession = SparkSession .builder() .master("local") .config("spark.sql.shuffle.partitions", "1") .config("spark.driver.bindAddress", "localhost") .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}") .config("spark.datasource.memsql.database", "testdb") .getOrCreate() def jdbcConnection: Loan[Connection] = { val connProperties = new Properties() connProperties.put("user", "root") Loan( DriverManager.getConnection( s"jdbc:mysql://$masterHost:$masterPort", connProperties )) } def executeQuery(sql: String): Unit = { jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql))) } executeQuery("set global default_partitions_per_leaf = 2") executeQuery("drop database if exists testdb") executeQuery("create database testdb") def genRandomByte(): Byte = (Random.nextInt(256) - 128).toByte def genRandomRow(): Array[Byte] = Array.fill(1000)(genRandomByte()) val df = spark.createDF( List.fill(100000)(genRandomRow()).zipWithIndex, List(("data", BinaryType, true), ("id", IntegerType, true)) ) val start1 = System.nanoTime() df.write .format("memsql") .mode(SaveMode.Overwrite) .save("testdb.LoadData") println("Elapsed time: " + (System.nanoTime() - start1) + "ns [LoadData CSV]") val start2 = System.nanoTime() df.write .format("memsql") .option("tableKey.primary", "id") .option("onDuplicateKeySQL", "id = id") .mode(SaveMode.Overwrite) .save("testdb.BatchInsert") println("Elapsed time: " + (System.nanoTime() - start2) + "ns [BatchInsert]") val avroStart = System.nanoTime() df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Overwrite) .option(MemsqlOptions.LOAD_DATA_FORMAT, "Avro") .save("testdb.AvroSerialization") println("Elapsed time: " + (System.nanoTime() - avroStart) + "ns [LoadData Avro] ") }
Example 55
Source File: LoadDataBenchmark.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, Date, DriverManager} import java.time.{Instant, LocalDate} import java.util.Properties import org.apache.spark.sql.types._ import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import org.apache.spark.sql.{SaveMode, SparkSession} import scala.util.Random // LoadDataBenchmark is written to test load data with CPU profiler // this feature is accessible in Ultimate version of IntelliJ IDEA // see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details object LoadDataBenchmark extends App { final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost") final val masterPort: String = sys.props.getOrElse("memsql.port", "5506") val spark: SparkSession = SparkSession .builder() .master("local") .config("spark.sql.shuffle.partitions", "1") .config("spark.driver.bindAddress", "localhost") .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}") .config("spark.datasource.memsql.database", "testdb") .getOrCreate() def jdbcConnection: Loan[Connection] = { val connProperties = new Properties() connProperties.put("user", "root") Loan( DriverManager.getConnection( s"jdbc:mysql://$masterHost:$masterPort", connProperties )) } def executeQuery(sql: String): Unit = { jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql))) } executeQuery("set global default_partitions_per_leaf = 2") executeQuery("drop database if exists testdb") executeQuery("create database testdb") def genRow(): (Long, Int, Double, String) = (Random.nextLong(), Random.nextInt(), Random.nextDouble(), Random.nextString(20)) val df = spark.createDF( List.fill(1000000)(genRow()), List(("LongType", LongType, true), ("IntType", IntegerType, true), ("DoubleType", DoubleType, true), ("StringType", StringType, true)) ) val start = System.nanoTime() df.write .format("memsql") .mode(SaveMode.Append) .save("testdb.batchinsert") val diff = System.nanoTime() - start println("Elapsed time: " + diff + "ns [CSV serialization] ") executeQuery("truncate testdb.batchinsert") val avroStart = System.nanoTime() df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Append) .option(MemsqlOptions.LOAD_DATA_FORMAT, "Avro") .save("testdb.batchinsert") val avroDiff = System.nanoTime() - avroStart println("Elapsed time: " + avroDiff + "ns [Avro serialization] ") }
Example 56
Source File: ReferenceTableTest.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import org.apache.spark.sql.types.IntegerType import org.apache.spark.sql.{DataFrame, SaveMode} import scala.util.Try class ReferenceTableTest extends IntegrationSuiteBase { val childAggregatorHost = "localhost" val childAggregatorPort = "5508" val dbName = "testdb" val commonCollectionName = "test_table" val referenceCollectionName = "reference_table" override def beforeEach(): Unit = { super.beforeEach() // Set child aggregator as a dmlEndpoint spark.conf .set("spark.datasource.memsql.dmlEndpoints", s"${childAggregatorHost}:${childAggregatorPort}") } def writeToTable(tableName: String): Unit = { val df = spark.createDF( List(4, 5, 6), List(("id", IntegerType, true)) ) df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Append) .save(s"${dbName}.${tableName}") } def readFromTable(tableName: String): DataFrame = { spark.read .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .load(s"${dbName}.${tableName}") } def writeAndReadFromTable(tableName: String): Unit = { writeToTable(tableName) val dataFrame = readFromTable(tableName) val sqlRows = dataFrame.collect(); assert(sqlRows.length == 3) } def dropTable(tableName: String): Unit = executeQuery(s"drop table if exists $dbName.$tableName") describe("Success during write operations") { it("to common table") { dropTable(commonCollectionName) executeQuery( s"create table if not exists $dbName.$commonCollectionName (id INT NOT NULL, PRIMARY KEY (id))") writeAndReadFromTable(commonCollectionName) } it("to reference table") { dropTable(referenceCollectionName) executeQuery( s"create reference table if not exists $dbName.$referenceCollectionName (id INT NOT NULL, PRIMARY KEY (id))") writeAndReadFromTable(referenceCollectionName) } } describe("Success during creating") { it("common table") { dropTable(commonCollectionName) writeAndReadFromTable(commonCollectionName) } } describe("Failure because of") { it("database name not specified") { spark.conf.set("spark.datasource.memsql.database", "") val df = spark.createDF( List(4, 5, 6), List(("id", IntegerType, true)) ) val result = Try { df.write .format(DefaultSource.MEMSQL_SOURCE_NAME_SHORT) .mode(SaveMode.Append) .save(s"${commonCollectionName}") } assert(SQLHelper.isSQLExceptionWithCode(result.failed.get, List(1046))) } } }
Example 57
Source File: BatchInsertBenchmark.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import java.sql.{Connection, Date, DriverManager} import java.time.LocalDate import java.util.Properties import org.apache.spark.sql.types._ import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import org.apache.spark.sql.{SaveMode, SparkSession} import scala.util.Random // BatchInsertBenchmark is written to test batch insert with CPU profiler // this feature is accessible in Ultimate version of IntelliJ IDEA // see https://www.jetbrains.com/help/idea/async-profiler.html#profile for more details object BatchInsertBenchmark extends App { final val masterHost: String = sys.props.getOrElse("memsql.host", "localhost") final val masterPort: String = sys.props.getOrElse("memsql.port", "5506") val spark: SparkSession = SparkSession .builder() .master("local") .config("spark.sql.shuffle.partitions", "1") .config("spark.driver.bindAddress", "localhost") .config("spark.datasource.memsql.ddlEndpoint", s"${masterHost}:${masterPort}") .config("spark.datasource.memsql.database", "testdb") .getOrCreate() def jdbcConnection: Loan[Connection] = { val connProperties = new Properties() connProperties.put("user", "root") Loan( DriverManager.getConnection( s"jdbc:mysql://$masterHost:$masterPort", connProperties )) } def executeQuery(sql: String): Unit = { jdbcConnection.to(conn => Loan(conn.createStatement).to(_.execute(sql))) } executeQuery("set global default_partitions_per_leaf = 2") executeQuery("drop database if exists testdb") executeQuery("create database testdb") def genDate() = Date.valueOf(LocalDate.ofEpochDay(LocalDate.of(2001, 4, 11).toEpochDay + Random.nextInt(10000))) def genRow(): (Long, Int, Double, String, Date) = (Random.nextLong(), Random.nextInt(), Random.nextDouble(), Random.nextString(20), genDate()) val df = spark.createDF( List.fill(1000000)(genRow()), List(("LongType", LongType, true), ("IntType", IntegerType, true), ("DoubleType", DoubleType, true), ("StringType", StringType, true), ("DateType", DateType, true)) ) val start = System.nanoTime() df.write .format("memsql") .option("tableKey.primary", "IntType") .option("onDuplicateKeySQL", "IntType = IntType") .mode(SaveMode.Append) .save("testdb.batchinsert") val diff = System.nanoTime() - start println("Elapsed time: " + diff + "ns") }
Example 58
Source File: IssuesTest.scala From memsql-spark-connector with Apache License 2.0 | 5 votes |
package com.memsql.spark import com.github.mrpowers.spark.daria.sql.SparkSessionExt._ import org.apache.spark.sql.SaveMode import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class IssuesTest extends IntegrationSuiteBase { it("https://github.com/memsql/memsql-spark-connector/issues/41") { executeQuery(""" | create table if not exists testdb.issue41 ( | start_video_pos smallint(5) unsigned DEFAULT NULL | ) |""".stripMargin) val df = spark.createDF( List(1.toShort, 2.toShort, 3.toShort, 4.toShort), List(("start_video_pos", ShortType, true)) ) df.write.format("memsql").mode(SaveMode.Append).save("issue41") val df2 = spark.read.format("memsql").load("issue41") assertSmallDataFrameEquality(df2, spark.createDF( List(1, 2, 3, 4), List(("start_video_pos", IntegerType, true)) )) } it("https://memsql.zendesk.com/agent/tickets/10451") { // parallel read should support columnar scan with filter executeQuery(""" | create table if not exists testdb.ticket10451 ( | t text, | h bigint(20) DEFAULT NULL, | KEY h (h) USING CLUSTERED COLUMNSTORE | ) | """.stripMargin) val df = spark.createDF( List(("hi", 2L), ("hi", 3L), ("foo", 4L)), List(("t", StringType, true), ("h", LongType, true)) ) df.write.format("memsql").mode(SaveMode.Append).save("ticket10451") val df2 = spark.read .format("memsql") .load("ticket10451") .where(col("t") === "hi") .where(col("h") === 3L) assert(df2.rdd.getNumPartitions > 1) assertSmallDataFrameEquality(df2, spark.createDF( List(("hi", 3L)), List(("t", StringType, true), ("h", LongType, true)) )) } it("supports reading count from query") { val df = spark.createDF( List((1, "Albert"), (5, "Ronny"), (7, "Ben"), (9, "David")), List(("id", IntegerType, true), ("name", StringType, true)) ) writeTable("testdb.testcount", df) val data = spark.read .format("memsql") .option("query", "select count(1) from testcount where id > 1 ") .option("database", "testdb") .load() .collect() val count = data.head.getLong(0) assert(count == 3) } it("handles exceptions raised by asCode") { // in certain cases asCode will raise NullPointerException due to this bug // https://issues.apache.org/jira/browse/SPARK-31403 writeTable("testdb.nulltest", spark.createDF( List(1, null), List(("i", IntegerType, true)) )) spark.sql(s"create table nulltest using memsql options ('dbtable'='testdb.nulltest')") val df2 = spark.sql("select if(isnull(i), null, 2) as x from nulltest order by i") assertSmallDataFrameEquality(df2, spark.createDF( List(null, 2), List(("x", IntegerType, true)) )) } }
Example 59
Source File: LoadMode.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.util import org.apache.spark.sql.SaveMode trait LoadMode { def sparkMode: SaveMode } object LoadMode { case object OverwriteTable extends LoadMode { override def sparkMode: SaveMode = SaveMode.Overwrite } case object OverwritePartitions extends LoadMode { override val sparkMode: SaveMode = SaveMode.Overwrite } case object OverwritePartitionsWithAddedColumns extends LoadMode { override val sparkMode: SaveMode = SaveMode.Overwrite } case object AppendJoinPartitions extends LoadMode { override def sparkMode: SaveMode = SaveMode.Append } case object AppendUnionPartitions extends LoadMode { override def sparkMode: SaveMode = SaveMode.Append } }
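Each LoadMode above simply pins the Spark SaveMode a writer should use. A minimal sketch of how such a mapping might be consumed follows; the helper name and output path are illustrative, not part of the m3d-engine code base:

import com.adidas.analytics.util.LoadMode
import org.apache.spark.sql.{DataFrame, SaveMode}

// Hypothetical helper: pick the SaveMode from a LoadMode and write Parquet with it.
def writeWith(df: DataFrame, loadMode: LoadMode, outputPath: String): Unit = {
  // e.g. LoadMode.OverwriteTable.sparkMode == SaveMode.Overwrite
  val mode: SaveMode = loadMode.sparkMode
  df.write.mode(mode).parquet(outputPath)
}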
Example 60
Source File: DefaultSource.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.datasource import java.util.Optional import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.sources.v2.reader.DataSourceReader import org.apache.spark.sql.sources.v2.writer.DataSourceWriter import org.apache.spark.sql.sources.v2.{DataSourceOptions, ReadSupport, WriteSupport} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{SaveMode, SparkSession} import org.slf4j.LoggerFactory import scala.collection.JavaConverters._ class DefaultSource extends ReadSupport with WriteSupport with DataSourceRegister { private val logger = LoggerFactory.getLogger(this.getClass) override def createReader(schema: StructType, options: DataSourceOptions): DataSourceReader = { val optionsMap = options.asMap().asScala val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism) new DynamoDataSourceReader(defaultParallelism, Map(optionsMap.toSeq: _*), Some(schema)) } override def createReader(options: DataSourceOptions): DataSourceReader = { val optionsMap = options.asMap().asScala val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism) new DynamoDataSourceReader(defaultParallelism, Map(optionsMap.toSeq: _*)) } override def createWriter(writeUUID: String, schema: StructType, mode: SaveMode, options: DataSourceOptions): Optional[DataSourceWriter] = { if (mode == SaveMode.Append || mode == SaveMode.Overwrite) throw new IllegalArgumentException(s"DynamoDB data source does not support save modes ($mode)." + " Please use option 'update' (true | false) to differentiate between append/overwrite and append/update behavior.") val optionsMap = options.asMap().asScala val defaultParallelism = optionsMap.get("defaultparallelism").map(_.toInt).getOrElse(getDefaultParallelism) val writer = new DynamoDataSourceWriter(defaultParallelism, Map(optionsMap.toSeq: _*), schema) Optional.of(writer) } override def shortName(): String = "dynamodb" private def getDefaultParallelism: Int = SparkSession.getActiveSession match { case Some(spark) => spark.sparkContext.defaultParallelism case None => logger.warn("Unable to read defaultParallelism from SparkSession." + " Parallelism will be 1 unless overwritten with option `defaultParallelism`") 1 } }
Example 61
Source File: BigQuerySinkLog.scala From spark-bigquery with Apache License 2.0 | 5 votes |
package com.samelamin.spark.bigquery.streaming import org.apache.hadoop.fs.Path import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.functions._ class BigQuerySinkLog(sparkSession: SparkSession, path: String) { def getLatest(): Option[Long] = { try { import sparkSession.implicits._ val df = sparkSession.read.json(path).as[Long] df.show() val latest: Long = df .sort(desc("inserted_batches")) .first() return Some(latest) } catch { case e: Exception => None } } def writeBatch(batchId: Long):Unit = { import sparkSession.implicits._ val df = Seq(batchId).toDF("inserted_batches").as[Long] df.write.mode(SaveMode.Overwrite).json(path) } }
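A rough usage sketch for the sink log above: it persists the last committed batch id as JSON with SaveMode.Overwrite and reads it back. The log path is a made-up placeholder:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()

// Any filesystem location Spark can write JSON to would do; this path is illustrative only.
val sinkLog = new BigQuerySinkLog(spark, "/tmp/bq_sink_log")

sinkLog.writeBatch(42L)                           // overwrites the log with the latest batch id
val lastBatch: Option[Long] = sinkLog.getLatest() // Some(42) if the log is readable, None otherwise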
Example 62
Source File: MQTTStreamSink.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.bahir.sql.streaming.mqtt import scala.collection.JavaConverters._ import scala.collection.mutable import org.eclipse.paho.client.mqttv3.MqttException import org.apache.spark.SparkEnv import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister} import org.apache.spark.sql.sources.v2.{DataSourceOptions, DataSourceV2, StreamWriteSupport} import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage} import org.apache.spark.sql.sources.v2.writer.streaming.StreamWriter import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.StructType import org.apache.bahir.utils.Logging import org.apache.bahir.utils.Retry class MQTTStreamWriter (schema: StructType, parameters: DataSourceOptions) extends StreamWriter with Logging { override def createWriterFactory(): DataWriterFactory[InternalRow] = { // Skipping client identifier as single batch can be distributed to multiple // Spark worker process. MQTT server does not support two connections // declaring same client ID at given point in time. val params = parameters.asMap().asScala.filterNot( _._1.equalsIgnoreCase("clientId") ) MQTTDataWriterFactory(params) } override def commit(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} override def abort(epochId: Long, messages: Array[WriterCommitMessage]): Unit = {} } case class MQTTDataWriterFactory(config: mutable.Map[String, String]) extends DataWriterFactory[InternalRow] { override def createDataWriter( partitionId: Int, taskId: Long, epochId: Long ): DataWriter[InternalRow] = new MQTTDataWriter(config) } case object MQTTWriterCommitMessage extends WriterCommitMessage class MQTTDataWriter(config: mutable.Map[String, String]) extends DataWriter[InternalRow] { private lazy val publishAttempts: Int = SparkEnv.get.conf.getInt("spark.mqtt.client.publish.attempts", -1) private lazy val publishBackoff: Long = SparkEnv.get.conf.getTimeAsMs("spark.mqtt.client.publish.backoff", "5s") private lazy val (_, _, topic, _, _, qos, _, _, _) = MQTTUtils.parseConfigParams(config.toMap) override def write(record: InternalRow): Unit = { val client = CachedMQTTClient.getOrCreate(config.toMap) val message = record.getBinary(0) Retry(publishAttempts, publishBackoff, classOf[MqttException]) { // In case of errors, retry sending the message. client.publish(topic, message, qos, false) } } override def commit(): WriterCommitMessage = MQTTWriterCommitMessage override def abort(): Unit = {} } case class MQTTRelation(override val sqlContext: SQLContext, data: DataFrame) extends BaseRelation { override def schema: StructType = data.schema } class MQTTStreamSinkProvider extends DataSourceV2 with StreamWriteSupport with DataSourceRegister with CreatableRelationProvider { override def createStreamWriter(queryId: String, schema: StructType, mode: OutputMode, options: DataSourceOptions): StreamWriter = { new MQTTStreamWriter(schema, options) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { MQTTRelation(sqlContext, data) } override def shortName(): String = "mqtt" }
Example 63
Source File: JdbcLoadJob.scala From comet-data-pipeline with Apache License 2.0 | 5 votes |
package com.ebiznext.comet.job.index.jdbcload import com.ebiznext.comet.config.Settings import com.ebiznext.comet.utils.{SparkJob, SparkJobResult, Utils} import com.google.cloud.bigquery.JobInfo.WriteDisposition import org.apache.spark.sql.SaveMode import scala.util.Try class JdbcLoadJob( cliConfig: JdbcLoadConfig )(implicit val settings: Settings) extends SparkJob { override def name: String = s"jdbcload-JDBC-${cliConfig.outputTable}" val conf = session.sparkContext.hadoopConfiguration logger.info(s"JDBC Config $cliConfig") val driver = cliConfig.driver val url = cliConfig.url val user = cliConfig.user val password = cliConfig.password Class.forName(driver) def runJDBC(): Try[SparkJobResult] = { val inputPath = cliConfig.sourceFile logger.info(s"Input path $inputPath") Try { val sourceDF = inputPath match { case Left(path) => session.read.parquet(path) case Right(df) => df } sourceDF.write .format("jdbc") .option("numPartitions", cliConfig.partitions) .option("batchsize", cliConfig.batchSize) .option("truncate", cliConfig.writeDisposition == WriteDisposition.WRITE_TRUNCATE) .option("driver", driver) .option("url", url) .option("dbtable", cliConfig.outputTable) .option("user", user) .option("password", password) .mode(SaveMode.Append) .save() SparkJobResult(session) } } override def run(): Try[SparkJobResult] = { val res = runJDBC() Utils.logFailure(res, logger) } }
Example 64
Source File: WriteMode.scala From comet-data-pipeline with Apache License 2.0 | 5 votes |
package com.ebiznext.comet.schema.model import com.ebiznext.comet.schema.model.WriteMode.{APPEND, ERROR_IF_EXISTS, IGNORE, OVERWRITE} import com.fasterxml.jackson.core.JsonParser import com.fasterxml.jackson.databind.annotation.{JsonDeserialize, JsonSerialize} import com.fasterxml.jackson.databind.ser.std.ToStringSerializer import com.fasterxml.jackson.databind.{DeserializationContext, JsonDeserializer} import org.apache.spark.sql.SaveMode @JsonSerialize(using = classOf[ToStringSerializer]) @JsonDeserialize(using = classOf[WriteDeserializer]) sealed case class WriteMode(value: String) { override def toString: String = value def toSaveMode: SaveMode = { this match { case OVERWRITE => SaveMode.Overwrite case APPEND => SaveMode.Append case ERROR_IF_EXISTS => SaveMode.ErrorIfExists case IGNORE => SaveMode.Ignore case _ => throw new Exception("Should never happen") } } } object WriteMode { def fromString(value: String): WriteMode = { value.toUpperCase() match { case "OVERWRITE" => WriteMode.OVERWRITE case "APPEND" => WriteMode.APPEND case "ERROR_IF_EXISTS" => WriteMode.ERROR_IF_EXISTS case "IGNORE" => WriteMode.IGNORE case _ => throw new Exception(s"Invalid Write Mode try one of ${writes}") } } object OVERWRITE extends WriteMode("OVERWRITE") object APPEND extends WriteMode("APPEND") object ERROR_IF_EXISTS extends WriteMode("ERROR_IF_EXISTS") object IGNORE extends WriteMode("IGNORE") val writes: Set[WriteMode] = Set(OVERWRITE, APPEND, ERROR_IF_EXISTS, IGNORE) } class WriteDeserializer extends JsonDeserializer[WriteMode] { override def deserialize(jp: JsonParser, ctx: DeserializationContext): WriteMode = { val value = jp.readValueAs[String](classOf[String]) WriteMode.fromString(value) } }
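The round trip from a configuration string to a Spark SaveMode is the main use of WriteMode; a small sketch based only on the code above:

import com.ebiznext.comet.schema.model.WriteMode
import org.apache.spark.sql.SaveMode

val mode: WriteMode = WriteMode.fromString("overwrite") // lookup is case-insensitive
assert(mode == WriteMode.OVERWRITE)
assert(mode.toSaveMode == SaveMode.Overwrite)

// Unknown values are rejected with an exception that lists the valid modes:
// WriteMode.fromString("UPSERT")  // throws Exception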
Example 65
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 66
Source File: JdbcRelationProvider.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn == null) { null } else { JDBCPartitioningInfo( partitionColumn, lowerBound.toLong, upperBound.toLong, numPartitions.toInt) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val jdbcOptions = new JDBCOptions(parameters) val url = jdbcOptions.url val table = jdbcOptions.table val createTableOptions = jdbcOptions.createTableOptions val isTruncate = jdbcOptions.isTruncate val conn = JdbcUtils.createConnectionFactory(jdbcOptions)() try { val tableExists = JdbcUtils.tableExists(conn, url, table) if (tableExists) { mode match { case SaveMode.Overwrite => if (isTruncate && isCascadingTruncateTable(url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, table) saveTable(df, url, table, jdbcOptions) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, table) createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } case SaveMode.Append => saveTable(df, url, table, jdbcOptions) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '$table' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(df.schema, url, table, createTableOptions, conn) saveTable(df, url, table, jdbcOptions) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
Example 67
Source File: ParquetTest.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.parquet import java.io.File import scala.reflect.ClassTag import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.test.SQLTestUtils import org.apache.spark.sql.{DataFrame, SaveMode} protected def withParquetTable[T <: Product: ClassTag: TypeTag] (data: Seq[T], tableName: String) (f: => Unit): Unit = { withParquetDataFrame(data) { df => sqlContext.registerDataFrameAsTable(df, tableName) withTempTable(tableName)(f) } } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( data: Seq[T], path: File): Unit = { data.toDF().write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) } protected def makeParquetFile[T <: Product: ClassTag: TypeTag]( df: DataFrame, path: File): Unit = { df.write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath) } protected def makePartitionDir( basePath: File, defaultPartitionName: String, partitionCols: (String, Any)*): File = { val partNames = partitionCols.map { case (k, v) => val valueString = if (v == null || v == "") defaultPartitionName else v.toString s"$k=$valueString" } val partDir = partNames.foldLeft(basePath) { (parent, child) => new File(parent, child) } assert(partDir.mkdirs(), s"Couldn't create directory $partDir") partDir } }
Example 68
Source File: ModelOutput.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.strategy.data import org.apache.spark.rdd.RDD import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.types._ import org.apache.spark.sql.{Row, SaveMode} import com.airbnb.common.ml.strategy.config.TrainingOptions import com.airbnb.common.ml.strategy.eval.BinaryMetrics import com.airbnb.common.ml.strategy.params.StrategyParams import com.airbnb.common.ml.util.HiveManageTable case class ModelOutput[T]( id: String, params: StrategyParams[T], loss: Double, evalMetrics: BinaryMetrics, holdoutMetrics: BinaryMetrics, options: TrainingOptions ) extends HiveManageTable { override def toRow(partition: String): Row = { Row( id.toLong, holdoutMetrics.posCount, holdoutMetrics.negCount, holdoutMetrics.posSugHigher, holdoutMetrics.posSugLower, holdoutMetrics.negSugHigher, holdoutMetrics.negSugLower, holdoutMetrics.increasePrecision, holdoutMetrics.increaseRecall, holdoutMetrics.decreasePrecision, holdoutMetrics.decreaseRecall, holdoutMetrics.trueRegret, holdoutMetrics.trueRegretMedian, holdoutMetrics.trueRegret75Percentile, holdoutMetrics.falseRegret, holdoutMetrics.trueIncreaseMagnitude, holdoutMetrics.trueDecreaseMagnitude, holdoutMetrics.falseDecreaseMagnitude, holdoutMetrics.falseIncreaseMagnitude, params.params, loss, options.toPartialArray, partition ) } } object ModelOutput { lazy val schema = StructType( Seq( StructField("id", LongType), StructField("posCount", IntegerType), StructField("negCount", IntegerType), StructField("posSugHigher", IntegerType), StructField("posSugLower", IntegerType), StructField("negSugHigher", IntegerType), StructField("negSugLower", IntegerType), StructField("increasePrecision", DoubleType), StructField("increaseRecall", DoubleType), StructField("decreasePrecision", DoubleType), StructField("decreaseRecall", DoubleType), StructField("trueRegret", DoubleType), StructField("trueRegretMedian", DoubleType), StructField("trueRegret75Percentile", DoubleType), StructField("falseRegret", DoubleType), StructField("trueIncreaseMagnitude", DoubleType), StructField("trueDecreaseMagnitude", DoubleType), StructField("falseDecreaseMagnitude", DoubleType), StructField("falseIncreaseMagnitude", DoubleType), StructField("params", ArrayType(DoubleType)), StructField("loss", DoubleType), StructField("options", ArrayType(DoubleType)), StructField("model", StringType) ) ) def save[T]( hiveContext: HiveContext, data: RDD[ModelOutput[T]], table: String, partition: String ): Unit = { HiveManageTable.saveRDDToHive( hiveContext, data, table, ModelOutput.schema, SaveMode.Overwrite, "model", partition) } }
Example 69
Source File: HiveManageTable.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.common.ml.util import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.SaveMode import org.apache.spark.sql.hive.HiveContext import org.apache.spark.sql.types.StructType trait HiveManageTable { def toRow(partition: String): Row } object HiveManageTable { def saveRDDToHive[T <: HiveManageTable](hiveContext: HiveContext, data: RDD[T], table: String, schema: StructType, mode: SaveMode, partition: String, partitionValue: String, hiveConfig: Map[String, String] = dynamicPartitions):Unit = { hiveConfig.foreach { case (key, value) => hiveContext.setConf(key, value) } hiveContext.createDataFrame(data.map(_.toRow(partitionValue)), schema) .write .mode(mode) .partitionBy(partition) .insertInto(table) } lazy val dynamicPartitions = Map( "hive.exec.dynamic.partition" -> "true", "hive.exec.dynamic.partition.mode" -> "nonstrict" ) }
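To use the trait above, a row type only has to supply toRow and a matching schema. A minimal sketch with a hypothetical case class, table name, and partition column (none of these come from the aerosolve code base):

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructField, StructType}

// Hypothetical row type implementing HiveManageTable.
case class ScoreRow(id: Long, score: Double) extends HiveManageTable {
  override def toRow(partition: String): Row = Row(id, score, partition)
}

object ScoreRow {
  val schema = StructType(Seq(
    StructField("id", LongType),
    StructField("score", DoubleType),
    StructField("ds", StringType)))

  // Writes the rows into the "ds" partition of an (assumed pre-existing) Hive table.
  def save(hiveContext: HiveContext, data: RDD[ScoreRow], ds: String): Unit =
    HiveManageTable.saveRDDToHive(
      hiveContext, data, "tmp.scores", schema, SaveMode.Overwrite, "ds", ds)
}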
Example 70
Source File: DataFrameToFileWriter.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import ai.deepsense.commons.utils.LoggerForCallerClass import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.WriteFileException import ai.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import ai.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import ai.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import ai.deepsense.deeplang.exceptions.DeepLangException import ai.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
Example 71
Source File: WriteDataFrame.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperations import java.io.IOException import java.util.Properties import scala.reflect.runtime.{universe => ru} import ai.deepsense.commons.utils.Version import ai.deepsense.deeplang.DOperation.Id import ai.deepsense.deeplang._ import ai.deepsense.deeplang.documentation.OperationDocumentation import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import ai.deepsense.deeplang.doperations.inout._ import ai.deepsense.deeplang.doperations.readwritedataframe.filestorage.DataFrameToFileWriter import ai.deepsense.deeplang.doperations.readwritedataframe.googlestorage.DataFrameToGoogleSheetWriter import ai.deepsense.deeplang.doperations.readwritedataframe.validators.{FilePathHasValidFileScheme, ParquetSupportedOnClusterOnly} import ai.deepsense.deeplang.inference.{InferContext, InferenceWarnings} import ai.deepsense.deeplang.params.choice.ChoiceParam import ai.deepsense.deeplang.params.{Param, Params} import org.apache.spark.sql.SaveMode class WriteDataFrame() extends DOperation1To0[DataFrame] with Params with OperationDocumentation { override val id: Id = "9e460036-95cc-42c5-ba64-5bc767a40e4e" override val name: String = "Write DataFrame" override val description: String = "Writes a DataFrame to a file or database" override val since: Version = Version(0, 4, 0) @transient override lazy val tTagTI_0: ru.TypeTag[DataFrame] = ru.typeTag[DataFrame] val storageType = ChoiceParam[OutputStorageTypeChoice]( name = "data storage type", description = Some("Storage type.")) def getStorageType(): OutputStorageTypeChoice = $(storageType) def setStorageType(value: OutputStorageTypeChoice): this.type = set(storageType, value) val specificParams: Array[Param[_]] = Array(storageType) setDefault(storageType, new OutputStorageTypeChoice.File()) override def execute(dataFrame: DataFrame)(context: ExecutionContext): Unit = { import OutputStorageTypeChoice._ try { getStorageType() match { case jdbcChoice: Jdbc => writeToJdbc(jdbcChoice, context, dataFrame) case googleSheetChoice: GoogleSheet => DataFrameToGoogleSheetWriter.writeToGoogleSheet( googleSheetChoice, context, dataFrame ) case fileChoice: File => DataFrameToFileWriter.writeToFile(fileChoice, context, dataFrame) } } catch { case e: IOException => logger.error(s"WriteDataFrame error. Could not write file to designated storage", e) throw DeepSenseIOException(e) } } private def writeToJdbc( jdbcChoice: OutputStorageTypeChoice.Jdbc, context: ExecutionContext, dataFrame: DataFrame): Unit = { val properties = new Properties() properties.setProperty("driver", jdbcChoice.getJdbcDriverClassName) val jdbcUrl = jdbcChoice.getJdbcUrl val jdbcTableName = jdbcChoice.getJdbcTableName val saveMode = if (jdbcChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists dataFrame.sparkDataFrame.write.mode(saveMode).jdbc(jdbcUrl, jdbcTableName, properties) } override def inferKnowledge(k0: DKnowledge[DataFrame])(context: InferContext): (Unit, InferenceWarnings) = { FilePathHasValidFileScheme.validate(this) ParquetSupportedOnClusterOnly.validate(this) super.inferKnowledge(k0)(context) } }
Example 72
Source File: Main.scala From example-spark-scala-read-and-write-from-hdfs with Apache License 2.0 | 5 votes |
package io.saagie.example.hdfs import org.apache.log4j.LogManager import org.apache.spark.sql.{SaveMode, SparkSession} object Main{ case class HelloWorld(message: String) def main(args: Array[String]): Unit = { val log = LogManager.getRootLogger // Creation of Spark Session val sparkSession = SparkSession.builder().appName("example-spark-scala-read-and-write-from-hdfs").getOrCreate() import sparkSession.implicits._ val hdfs_master = args(0) // ====== Creating a dataframe with 1 partition val df = Seq(HelloWorld("helloworld")).toDF().coalesce(1) // ======= Writing files // Writing file as parquet df.write.mode(SaveMode.Overwrite).parquet(hdfs_master + "user/hdfs/wiki/testwiki") // Writing file as csv df.write.mode(SaveMode.Overwrite).csv(hdfs_master + "user/hdfs/wiki/testwiki.csv") // ======= Reading files // Reading parquet files val df_parquet = sparkSession.read.parquet(hdfs_master + "user/hdfs/wiki/testwiki") log.info(df_parquet.show()) // Reading csv files val df_csv = sparkSession.read.option("inferSchema", "true").csv(hdfs_master + "user/hdfs/wiki/testwiki.csv") log.info(df_csv.show()) } }
Example 73
Source File: JDBCRelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = { if (partitioning == null) return Array[Partition](JDBCPartition(null, 0)) val numPartitions = partitioning.numPartitions val column = partitioning.column if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0)) // Overflow and silliness can happen if you subtract then divide. // Here we get a little roundoff, but that's (hopefully) OK. val stride: Long = (partitioning.upperBound / numPartitions - partitioning.lowerBound / numPartitions) var i: Int = 0 var currentValue: Long = partitioning.lowerBound var ans = new ArrayBuffer[Partition]() while (i < numPartitions) { val lowerBound = if (i != 0) s"$column >= $currentValue" else null currentValue += stride val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null val whereClause = if (upperBound == null) { lowerBound } else if (lowerBound == null) { upperBound } else { s"$lowerBound AND $upperBound" } ans += JDBCPartition(whereClause, i) i = i + 1 } ans.toArray } } private[sql] case class JDBCRelation( url: String, table: String, parts: Array[Partition], properties: Properties = new Properties())(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedFilteredScan with InsertableRelation { override val needConversion: Boolean = false override val schema: StructType = JDBCRDD.resolveTable(url, table, properties) override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { val driver: String = DriverRegistry.getDriverClassName(url) // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] JDBCRDD.scanTable( sqlContext.sparkContext, schema, driver, url, properties, table, requiredColumns, filters, parts).asInstanceOf[RDD[Row]] } override def insert(data: DataFrame, overwrite: Boolean): Unit = { data.write .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) .jdbc(url, table, properties) } }
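columnPartition above turns the (column, lowerBound, upperBound, numPartitions) hint into one WHERE clause per partition. A small worked sketch of what it produces, assuming the method sits on a JDBCRelation companion object and that JDBCPartitioningInfo / JDBCPartition are defined elsewhere in the same package, as in the Spark source this file comes from:

// stride = 100/4 - 0/4 = 25, so the generated predicates are:
//   partition 0: "id < 25"
//   partition 1: "id >= 25 AND id < 50"
//   partition 2: "id >= 50 AND id < 75"
//   partition 3: "id >= 75"
val info = JDBCPartitioningInfo("id", 0L, 100L, 4)
val parts = JDBCRelation.columnPartition(info)
parts.map(_.asInstanceOf[JDBCPartition].whereClause).foreach(println)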
Example 74
Source File: ParquetTest.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.parquet

import java.io.File

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.test.SQLTestUtils
import org.apache.spark.sql.{DataFrame, SaveMode, SQLContext}

  protected def withParquetTable[T <: Product: ClassTag: TypeTag]
      (data: Seq[T], tableName: String)
      (f: => Unit): Unit = {
    withParquetDataFrame(data) { df =>
      // register the Seq dataset as a temporary table under the given name
      _sqlContext.registerDataFrameAsTable(df, tableName)
      withTempTable(tableName)(f)
    }
  }

  // produce a Parquet file from a sequence of rows
  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      data: Seq[T], path: File): Unit = {
    // overwrite if the output location already exists
    _sqlContext.createDataFrame(data).write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }

  // produce a Parquet file from a DataFrame
  protected def makeParquetFile[T <: Product: ClassTag: TypeTag](
      df: DataFrame, path: File): Unit = {
    // overwrite if the output location already exists
    df.write.mode(SaveMode.Overwrite).parquet(path.getCanonicalPath)
  }

  // create a partition directory under basePath
  protected def makePartitionDir(
      basePath: File,
      defaultPartitionName: String,
      partitionCols: (String, Any)*): File = {
    val partNames = partitionCols.map { case (k, v) =>
      val valueString = if (v == null || v == "") defaultPartitionName else v.toString
      s"$k=$valueString"
    }

    val partDir = partNames.foldLeft(basePath) { (parent, child) =>
      new File(parent, child)
    }

    assert(partDir.mkdirs(), s"Couldn't create directory $partDir")
    partDir
  }
}

Example 75
Source File: IOUtilities.scala From spark-practice with MIT License | 5 votes |
package probelms.customerInsights

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
import utilities.SparkUtilities


object IOUtilities {

  def readProductsDF(spark: SparkSession, path: String): DataFrame = {
    import spark.implicits._
    spark.read.textFile(path)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Product(fields(0).toInt, fields(1), fields(2), fields(3), SparkUtilities.convertCurrencyToDouble(fields(4))))
      .toDF()
  }

  def readSalesDF(spark: SparkSession, path: String): DataFrame = {
    import spark.implicits._
    spark.read.textFile(path)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Sales(fields(0).toInt, fields(1).toInt, fields(2).toInt, SparkUtilities.getDate(fields(3)), SparkUtilities.convertCurrencyToDouble(fields(4)), fields(5).toInt))
      .toDF()
  }

  def readRefundDF(spark: SparkSession, path: String): DataFrame = {
    import spark.implicits._
    spark.read.textFile(path)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Refund(fields(0).toInt, fields(1).toInt, fields(2).toInt, fields(3).toInt, SparkUtilities.getDate(fields(4)), SparkUtilities.convertCurrencyToDouble(fields(5)), fields(6).toInt))
      .toDF()
  }

  def readCustomerDF(spark: SparkSession, path: String): DataFrame = {
    import spark.implicits._
    spark.read.textFile(path)
      .map(line => line.split(CIConstants.DELIMITER))
      .map(fields => new Customer(fields(0).toInt, fields(1), fields(2), fields(3).toLong))
      .toDF()
  }

  def writeDF(df: DataFrame, path: String): Unit = {
    df.repartition(1)
      .write
      .format("csv")
      .option("header", "true")
      .mode(SaveMode.Overwrite)
      .save(path)
  }
}
Example 76
Source File: ContextsSpec.scala From mist with Apache License 2.0 | 5 votes |
package mist.api import org.apache.spark.SparkContext import org.apache.spark.sql.{SaveMode, SparkSession} import org.scalatest.{FunSpec, Matchers} import scala.util._ class ContextsSpec extends FunSpec with Matchers with TestSparkContext { import mist.api.ArgsInstances._ import mist.api.encoding.defaults._ import mist.api.MistFnSyntax._ import mist.api.data._ import mist.api.encoding.JsSyntax._ it("for spark context") { val spJob = arg[Seq[Int]]("nums").onSparkContext( (nums: Seq[Int], sp: SparkContext) => { sp.parallelize(nums).map(_ * 2).collect() "2" }) val res = spJob.invoke(testCtx(JsMap("nums" -> (1 to 10).to[Seq].js))) res shouldBe Success("2") } it("for only sc") { val spJob = onSparkContext((sc: SparkContext) => { 5 }) val res = spJob.invoke(testCtx(JsMap.empty)) res shouldBe Success(5) } def pathToResource(path: String): String = { this.getClass.getClassLoader.getResource(path).getPath } it("session with hive") { System.setSecurityManager(null) val spJob = onSparkSessionWithHive((spark: SparkSession) => { val df = spark.read.json(pathToResource("hive_job_data.json")) df.createOrReplaceTempView("temp") df.cache() spark.sql("DROP TABLE IF EXISTS temp_hive") spark.table("temp").write.mode(SaveMode.Overwrite).saveAsTable("temp_hive") spark.sql("SELECT MAX(age) AS avg_age FROM temp_hive") .take(1)(0).getLong(0) }) spJob.invoke(testCtx(JsMap.empty)) val res = spJob.invoke(testCtx(JsMap.empty)) res shouldBe Success(30) } def testCtx(params: JsMap): FnContext = { FnContext(spark, params) } }
Example 77
Source File: CreateHiveTableAsSelectCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.execution import scala.util.control.NonFatal import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.catalog.CatalogTable import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.command.DataWritingCommand case class CreateHiveTableAsSelectCommand( tableDesc: CatalogTable, query: LogicalPlan, outputColumns: Seq[Attribute], mode: SaveMode) extends DataWritingCommand { private val tableIdentifier = tableDesc.identifier override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = { val catalog = sparkSession.sessionState.catalog if (catalog.tableExists(tableIdentifier)) { assert(mode != SaveMode.Overwrite, s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite") if (mode == SaveMode.ErrorIfExists) { throw new AnalysisException(s"$tableIdentifier already exists.") } if (mode == SaveMode.Ignore) { // Since the table already exists and the save mode is Ignore, we will just return. return Seq.empty } InsertIntoHiveTable( tableDesc, Map.empty, query, overwrite = false, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } else { // TODO ideally, we should get the output data ready first and then // add the relation into catalog, just in case of failure occurs while data // processing. assert(tableDesc.schema.isEmpty) catalog.createTable(tableDesc.copy(schema = query.schema), ignoreIfExists = false) try { // Read back the metadata of the table which was created just now. val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier) // For CTAS, there is no static partition values to insert. val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap InsertIntoHiveTable( createdTableMeta, partition, query, overwrite = true, ifPartitionNotExists = false, outputColumns = outputColumns).run(sparkSession, child) } catch { case NonFatal(e) => // drop the created table. catalog.dropTable(tableIdentifier, ignoreIfNotExists = true, purge = false) throw e } } Seq.empty[Row] } override def argString: String = { s"[Database:${tableDesc.database}}, " + s"TableName: ${tableDesc.identifier.table}, " + s"InsertIntoHiveTable]" } }
Example 78
Source File: JdbcRelationProvider.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import org.apache.spark.sql.{AnalysisException, DataFrame, SaveMode, SQLContext} import org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider} class JdbcRelationProvider extends CreatableRelationProvider with RelationProvider with DataSourceRegister { override def shortName(): String = "jdbc" override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { import JDBCOptions._ val jdbcOptions = new JDBCOptions(parameters) val partitionColumn = jdbcOptions.partitionColumn val lowerBound = jdbcOptions.lowerBound val upperBound = jdbcOptions.upperBound val numPartitions = jdbcOptions.numPartitions val partitionInfo = if (partitionColumn.isEmpty) { assert(lowerBound.isEmpty && upperBound.isEmpty, "When 'partitionColumn' is not specified, " + s"'$JDBC_LOWER_BOUND' and '$JDBC_UPPER_BOUND' are expected to be empty") null } else { assert(lowerBound.nonEmpty && upperBound.nonEmpty && numPartitions.nonEmpty, s"When 'partitionColumn' is specified, '$JDBC_LOWER_BOUND', '$JDBC_UPPER_BOUND', and " + s"'$JDBC_NUM_PARTITIONS' are also required") JDBCPartitioningInfo( partitionColumn.get, lowerBound.get, upperBound.get, numPartitions.get) } val parts = JDBCRelation.columnPartition(partitionInfo) JDBCRelation(parts, jdbcOptions)(sqlContext.sparkSession) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], df: DataFrame): BaseRelation = { val options = new JDBCOptions(parameters) val isCaseSensitive = sqlContext.conf.caseSensitiveAnalysis val conn = JdbcUtils.createConnectionFactory(options)() try { val tableExists = JdbcUtils.tableExists(conn, options) if (tableExists) { mode match { case SaveMode.Overwrite => if (options.isTruncate && isCascadingTruncateTable(options.url) == Some(false)) { // In this case, we should truncate table and then load. truncateTable(conn, options) val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) } else { // Otherwise, do not truncate the table, instead drop and recreate it dropTable(conn, options.table) createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } case SaveMode.Append => val tableSchema = JdbcUtils.getSchemaOption(conn, options) saveTable(df, tableSchema, isCaseSensitive, options) case SaveMode.ErrorIfExists => throw new AnalysisException( s"Table or view '${options.table}' already exists. SaveMode: ErrorIfExists.") case SaveMode.Ignore => // With `SaveMode.Ignore` mode, if table already exists, the save operation is expected // to not save the contents of the DataFrame and to not change the existing data. // Therefore, it is okay to do nothing here and then just return the relation below. } } else { createTable(conn, df, options) saveTable(df, Some(df.schema), isCaseSensitive, options) } } finally { conn.close() } createRelation(sqlContext, parameters) } }
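From the user side, the branches above are selected purely by the SaveMode (plus the truncate option) passed to a JDBC write. A typical sketch; the connection settings and table name are placeholders:

import java.util.Properties
import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(10).toDF("id")

// Placeholder connection settings.
val url = "jdbc:postgresql://localhost:5432/testdb"
val props = new Properties()
props.setProperty("user", "test")
props.setProperty("password", "test")

// Append: reuses the existing table, or creates it if missing.
df.write.mode(SaveMode.Append).jdbc(url, "public.numbers", props)

// Overwrite with truncate: keeps the table definition and only replaces the rows.
df.write
  .mode(SaveMode.Overwrite)
  .option("truncate", "true")
  .jdbc(url, "public.numbers", props)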
Example 79
Source File: SaveIntoDataSourceCommand.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.command.RunnableCommand import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.sources.CreatableRelationProvider case class SaveIntoDataSourceCommand( query: LogicalPlan, dataSource: CreatableRelationProvider, options: Map[String, String], mode: SaveMode) extends RunnableCommand { override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query) override def run(sparkSession: SparkSession): Seq[Row] = { dataSource.createRelation( sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query)) Seq.empty[Row] } override def simpleString: String = { val redacted = SQLConf.get.redactOptions(options) s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}" } }
Example 80
Source File: SaveIntoDataSourceCommandSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkConf import org.apache.spark.sql.SaveMode import org.apache.spark.sql.test.SharedSQLContext class SaveIntoDataSourceCommandSuite extends SharedSQLContext { test("simpleString is redacted") { val URL = "connection.url" val PASS = "123" val DRIVER = "mydriver" val dataSource = DataSource( sparkSession = spark, className = "jdbc", partitionColumns = Nil, options = Map("password" -> PASS, "url" -> URL, "driver" -> DRIVER)) val logicalPlanString = dataSource .planForWriting(SaveMode.ErrorIfExists, spark.range(1).logicalPlan) .treeString(true) assert(!logicalPlanString.contains(URL)) assert(!logicalPlanString.contains(PASS)) assert(logicalPlanString.contains(DRIVER)) } }
Example 81
Source File: DefaultSource.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.datasource.google.spreadsheet import java.io.File import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { createRelation(sqlContext, parameters, null) } private[spreadsheet] def pathToSheetNames(parameters: Map[String, String]): (String, String) = { val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets.")) val elems = path.split('/') if (elems.length < 2) throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'") (elems(0), elems(1)) } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) val context = createSpreadsheetContext(parameters) createRelation(sqlContext, context, spreadsheetName, worksheetName, schema) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) implicit val context = createSpreadsheetContext(parameters) val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName) if(!spreadsheet.isDefined) throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName") spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData) createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema) } private[spreadsheet] def createSpreadsheetContext(parameters: Map[String, String]) = { val serviceAccountIdOption = parameters.get("serviceAccountId") val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH) SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath)) } private[spreadsheet] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: StructType): SpreadsheetRelation = if (schema == null) { createRelation(sqlContext, context, spreadsheetName, worksheetName, None) } else { createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema)) } private[spreadsheet] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: Option[StructType]): SpreadsheetRelation = SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext) }
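Because the provider above resolves 'path' as '<spreadsheet>/<worksheet>', a write might look like the sketch below. The sheet names, service account, and credential location are placeholders, and the format string simply points at the package containing this DefaultSource; note that the SaveMode is accepted but not inspected by the provider as written:

import org.apache.spark.sql.{SaveMode, SparkSession}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(3).toDF("id")

df.write
  .format("mimir.exec.spark.datasource.google.spreadsheet")
  .option("serviceAccountId", "my-service-account@example.iam.gserviceaccount.com") // placeholder
  .option("credentialPath", "/etc/gdata/credential.p12")
  .mode(SaveMode.Append)
  .save("MySpreadsheet/Sheet1") // "<spreadsheet>/<worksheet>"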
Example 82
Source File: SparkSchemaProvider.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.data

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{ DataFrame, SaveMode }
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.analysis.{ UnresolvedRelation, NoSuchDatabaseException }
import org.apache.spark.sql.execution.command.{ DropTableCommand, CreateDatabaseCommand }

import mimir.Database
import mimir.algebra._
import mimir.exec.spark.{MimirSpark, RAToSpark, RowIndexPlan}

class SparkSchemaProvider(db: Database)
  extends LogicalPlanSchemaProvider
  with MaterializedTableProvider
  with LazyLogging
{

  def listTables(): Seq[ID] =
  {
    try {
      MimirSpark.get.sparkSession
        .catalog
        .listTables()
        .collect()
        .map { table => ID(table.name) }
        .toSeq
    } catch {
      case _:NoSuchDatabaseException => {
        logger.warn("Couldn't find database!!! ($sparkDBName)")
        Seq.empty
      }
    }
  }

  def tableSchema(table: ID): Option[Seq[(ID, Type)]] =
  {
    try {
      if (MimirSpark.get.sparkSession.catalog.tableExists(table.id)) {
        Some(
          MimirSpark.get.sparkSession
            .catalog
            .listColumns(table.id)
            .collect()
            .map { col => (
              ID(col.name),
              RAToSpark.getMimirType(
                RAToSpark.dataTypeFromHiveDataTypeString(col.dataType))
            )}
            .toSeq
        )
      } else {
        logger.trace(s"$table doesn't exist")
        None
      }
    } catch {
      case _:NoSuchDatabaseException => {
        logger.warn("Couldn't find database!!! ($sparkDBName)")
        None
      }
    }
  }

  def logicalplan(table: ID): LogicalPlan =
  {
    RowIndexPlan(
      UnresolvedRelation(TableIdentifier(table.id)),
      tableSchema(table).get
    ).getPlan(db)
  }

  def createStoredTableAs(data: DataFrame, name: ID)
  {
    data.persist()
        .createOrReplaceTempView(name.id)
    data.write
        .mode(SaveMode.Overwrite)
        .saveAsTable(name.id)
  }

  def dropStoredTable(name: ID)
  {
    DropTableCommand(
      TableIdentifier(name.id, None), //Option(sparkDBName)),
      true, false, true
    ).run(MimirSpark.get.sparkSession)
  }
}
Example 83
Source File: LogisticRegressionDemo.scala From s4ds with Apache License 2.0 | 5 votes |
import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer, StringIndexer} import org.apache.spark.ml.tuning.{ParamGridBuilder, CrossValidator} import org.apache.spark.mllib.linalg.Vector import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.SaveMode case class LabelledDocument(fileName:String, text:String, category:String) object LogisticRegressionDemo extends App { val conf = new SparkConf().setAppName("LrTest") val sc = new SparkContext(conf) val sqlContext = new SQLContext(sc) import sqlContext._ import sqlContext.implicits._ val spamText = sc.wholeTextFiles("spam/*") val hamText = sc.wholeTextFiles("ham/*") val spamDocuments = spamText.map { case (fileName, text) => LabelledDocument(fileName, text, "spam") } val hamDocuments = hamText.map { case (fileName, text) => LabelledDocument(fileName, text, "ham") } val documentsDF = spamDocuments.union(hamDocuments).toDF documentsDF.persist val Array(trainDF, testDF) = documentsDF.randomSplit(Array(0.7, 0.3)) val indexer = new StringIndexer().setInputCol("category").setOutputCol("label") val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") val hasher = new HashingTF().setInputCol("words").setOutputCol("features") val lr = new LogisticRegression().setMaxIter(50).setRegParam(0.0) val pipeline = new Pipeline().setStages(Array(indexer, tokenizer, hasher, lr)) val model = pipeline.fit(trainDF) val transformedTrain = model.transform(trainDF) transformedTrain.persist val transformedTest = model.transform(testDF) transformedTest.persist println("in sample misclassified:", transformedTrain.filter($"prediction" !== $"label").count, " / ",transformedTrain.count) println("out sample misclassified:", transformedTest.filter($"prediction" !== $"label").count, " / ",transformedTest.count) transformedTrain.select("fileName", "label", "prediction", "probability") .write.mode(SaveMode.Overwrite).parquet("transformedTrain.parquet") transformedTest.select("fileName", "label", "prediction", "probability") .write.mode(SaveMode.Overwrite).parquet("transformedTest.parquet") }
Example 84
Source File: JDBCRelation.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.jdbc import java.util.Properties import scala.collection.mutable.ArrayBuffer import org.apache.spark.Partition import org.apache.spark.rdd.RDD import org.apache.spark.sql.sources._ import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode} def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = { if (partitioning == null) return Array[Partition](JDBCPartition(null, 0)) val numPartitions = partitioning.numPartitions val column = partitioning.column if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0)) // Overflow and silliness can happen if you subtract then divide. // Here we get a little roundoff, but that's (hopefully) OK. val stride: Long = (partitioning.upperBound / numPartitions - partitioning.lowerBound / numPartitions) var i: Int = 0 var currentValue: Long = partitioning.lowerBound var ans = new ArrayBuffer[Partition]() while (i < numPartitions) { val lowerBound = if (i != 0) s"$column >= $currentValue" else null currentValue += stride val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null val whereClause = if (upperBound == null) { lowerBound } else if (lowerBound == null) { upperBound } else { s"$lowerBound AND $upperBound" } ans += JDBCPartition(whereClause, i) i = i + 1 } ans.toArray } } private[sql] case class JDBCRelation( url: String, table: String, parts: Array[Partition], properties: Properties = new Properties())(@transient val sqlContext: SQLContext) extends BaseRelation with PrunedFilteredScan with InsertableRelation { override val needConversion: Boolean = false override val schema: StructType = JDBCRDD.resolveTable(url, table, properties) override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = { // Rely on a type erasure hack to pass RDD[InternalRow] back as RDD[Row] JDBCRDD.scanTable( sqlContext.sparkContext, schema, url, properties, table, requiredColumns, filters, parts).asInstanceOf[RDD[Row]] } override def insert(data: DataFrame, overwrite: Boolean): Unit = { data.write .mode(if (overwrite) SaveMode.Overwrite else SaveMode.Append) .jdbc(url, table, properties) } }
Example 85
Source File: DefaultSource.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb import com.stratio.datasource.mongodb.config.MongodbConfigBuilder import com.stratio.datasource.mongodb.config.MongodbConfig._ import org.apache.spark.sql.SaveMode._ import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider{ override def createRelation( sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation = { new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build())(sqlContext) } override def createRelation( sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = { new MongodbRelation(MongodbConfigBuilder(parseParameters(parameters)).build(), Some(schema))(sqlContext) } override def createRelation( sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val mongodbRelation = new MongodbRelation( MongodbConfigBuilder(parseParameters(parameters)).build(), Some(data.schema))(sqlContext) mode match{ case Append => mongodbRelation.insert(data, overwrite = false) case Overwrite => mongodbRelation.insert(data, overwrite = true) case ErrorIfExists => if(mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false) else throw new UnsupportedOperationException("Writing in a non-empty collection.") case Ignore => if(mongodbRelation.isEmptyCollection) mongodbRelation.insert(data, overwrite = false) } mongodbRelation } }
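From the caller's side it is the SaveMode passed to the DataFrameWriter that selects the branches above. A hedged usage sketch for some existing DataFrame df; the option keys (host, database, collection) are assumptions about the connector's configuration, not taken from the excerpt:

import org.apache.spark.sql.SaveMode

df.write
  .format("com.stratio.datasource.mongodb")
  .mode(SaveMode.Append)             // Overwrite / ErrorIfExists / Ignore exercise the other branches
  .option("host", "localhost:27017") // assumed option key
  .option("database", "test")        // assumed option key
  .option("collection", "events")    // assumed option key
  .save()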
Example 86
Source File: PileupTestBase.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.tests.pileup import com.holdenkarau.spark.testing.{DataFrameSuiteBase, SharedSparkContext} import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession} import org.apache.spark.sql.types.{IntegerType, ShortType, StringType, StructField, StructType} import org.scalatest.{BeforeAndAfter, FunSuite} class PileupTestBase extends FunSuite with DataFrameSuiteBase with BeforeAndAfter with SharedSparkContext{ val sampleId = "NA12878.multichrom.md" val samResPath: String = getClass.getResource("/multichrom/mdbam/samtools.pileup").getPath val referencePath: String = getClass.getResource("/reference/Homo_sapiens_assembly18_chr1_chrM.small.fasta").getPath val bamPath: String = getClass.getResource(s"/multichrom/mdbam/${sampleId}.bam").getPath val cramPath : String = getClass.getResource(s"/multichrom/mdcram/${sampleId}.cram").getPath val tableName = "reads_bam" val tableNameCRAM = "reads_cram" val schema: StructType = StructType( List( StructField("contig", StringType, nullable = true), StructField("position", IntegerType, nullable = true), StructField("reference", StringType, nullable = true), StructField("coverage", ShortType, nullable = true), StructField("pileup", StringType, nullable = true), StructField("quality", StringType, nullable = true) ) ) before { System.setProperty("spark.kryo.registrator", "org.biodatageeks.sequila.pileup.serializers.CustomKryoRegistrator") spark .conf.set("spark.sql.shuffle.partitions",1) //FIXME: In order to get orderBy in Samtools tests working - related to exchange partitions stage spark.sql(s"DROP TABLE IF EXISTS $tableName") spark.sql( s""" |CREATE TABLE $tableName |USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource |OPTIONS(path "$bamPath") | """.stripMargin) spark.sql(s"DROP TABLE IF EXISTS $tableNameCRAM") spark.sql( s""" |CREATE TABLE $tableNameCRAM |USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource |OPTIONS(path "$cramPath", refPath "$referencePath" ) | """.stripMargin) val mapToString = (map: Map[Byte, Short]) => { if (map == null) "null" else map.map({ case (k, v) => k.toChar -> v}).mkString.replace(" -> ", ":") } val byteToString = ((byte: Byte) => byte.toString) spark.udf.register("mapToString", mapToString) spark.udf.register("byteToString", byteToString) } }
Example 87
Source File: Writer.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.tests.pileup

import org.apache.spark.sql.{Dataset, Row, SaveMode, SparkSession}

object Writer {

  val mapToString = (map: Map[Byte, Short]) => {
    if (map == null) "null"
    else map.map({ case (k, v) => k.toChar -> v }).toSeq.sortBy(_._1).mkString.replace(" -> ", ":")
  }

  def saveToFile(spark: SparkSession, res: Dataset[Row], path: String): Unit = {
    spark.udf.register("mapToString", mapToString)
    res
      .selectExpr("contig", "pos_start", "pos_end", "ref", "cast(coverage as int)", "mapToString(alts)")
      .coalesce(1)
      .write
      .mode(SaveMode.Overwrite)
      .csv(path)
  }
}
Example 88
Source File: AirplaneDatalake.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.demo

import be.dataminded.lighthouse.datalake._
import be.dataminded.lighthouse.spark.Csv
import better.files._
import org.apache.spark.sql.SaveMode

object AirplaneDatalake extends Datalake {

  environment("test") { refs =>
    refs += DataUID("raw", "airplane") -> new FileSystemDataLink(
      resource("data/airplane"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true")
    )

    refs += DataUID("raw.weather", "daily") -> new FileSystemDataLink(
      resource("data/weather/daily"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true")
    )

    refs += DataUID("raw.weather", "station") -> new FileSystemDataLink(
      resource("data/weather/station"),
      Csv,
      SaveMode.ErrorIfExists,
      options = Map("header" -> "true", "inferSchema" -> "true", "delimiter" -> "|")
    )

    refs += DataUID("clean", "airplane") -> new FileSystemDataLink(file"target/clean/airplane".pathAsString)
    refs += DataUID("clean", "weather") -> new FileSystemDataLink(file"target/clean/weather/daily".pathAsString)
    refs += DataUID("clean", "stations") -> new FileSystemDataLink(file"target/clean/weather/stations".pathAsString)

    refs += DataUID("master", "view") -> new HiveDataLink(
      file"target/master/airplane/view".pathAsString,
      "default",
      "airplane_view"
    )
  }

  private def resource(path: String): String = Resource.getUrl(path).getPath()
}
Example 89
Source File: FileSystemDataLink.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.datalake

import be.dataminded.lighthouse.spark.{Orc, SparkFileFormat}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SaveMode}

class FileSystemDataLink(
    val path: LazyConfig[String],
    format: SparkFileFormat = Orc,
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty,
    schema: Option[StructType] = None
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    schema match {
      case Some(s) => spark.read.format(format.toString).options(options).schema(s).load(path)
      case None    => spark.read.format(format.toString).options(options).load(path)
    }
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format(format.toString)
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
}
Example 90
Source File: AvroDataLink.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.datalake

import org.apache.spark.sql.{DataFrame, Dataset, SaveMode}

class AvroDataLink(
    val path: LazyConfig[String],
    saveMode: SaveMode = SaveMode.Overwrite,
    partitionedBy: List[String] = List.empty,
    options: Map[String, String] = Map.empty
) extends PathBasedDataLink {

  override def doRead(path: String): DataFrame = {
    spark.read
      .format("com.databricks.spark.avro")
      .options(options)
      .load(path)
  }

  override def doWrite[T](dataset: Dataset[T], path: String): Unit = {
    dataset.write
      .format("com.databricks.spark.avro")
      .partitionBy(partitionedBy: _*)
      .options(options)
      .mode(saveMode)
      .save(path)
  }
}
Example 91
Source File: JdbcDataLinkTest.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.datalake import be.dataminded.lighthouse.common.Database import be.dataminded.lighthouse.testing.SparkFunSuite import org.apache.spark.sql.SaveMode import org.scalatest.BeforeAndAfterAll import org.scalatest.matchers.should.Matchers case class test_table(ID: java.lang.Integer, STR: String) class JdbcDataLinkTest extends SparkFunSuite with Matchers with BeforeAndAfterAll { import spark.implicits._ private val extraOptions = Map("MODE" -> "MYSQL") override protected def beforeAll(): Unit = { Database.inMemory("test", Map("MODE" -> "MYSQL", "user" -> "sa", "DB_CLOSE_DELAY" -> "-1")).withConnection { con => // Create user and password con.createStatement().execute("CREATE USER IF NOT EXISTS TEST PASSWORD 'testpw'") con.createStatement().execute("ALTER USER TEST ADMIN TRUE") // Create table con.createStatement().execute("CREATE TABLE IF NOT EXISTS TEST_TABLE(ID INTEGER PRIMARY KEY, STR VARCHAR(50))") // Add initial data to table val statement = con.createStatement() (0 until 100).foreach(x => statement.addBatch(s"INSERT INTO TEST_TABLE(ID, STR) VALUES($x, '$x')")) statement.executeBatch() } } test("Reading JDBC datalink") { val jdbcDataLink = new JdbcDataLink( url = "jdbc:h2:mem:test", username = "TEST", password = "testpw", driver = "org.h2.Driver", extraProperties = extraOptions, table = "TEST_TABLE", partitionColumn = "ID" ) val data = jdbcDataLink.readAs[test_table]() data.collect() should contain theSameElementsAs (0 until 100).map(x => test_table(x, s"$x")) } test("Append JDBC datalink") { val jdbcDataLink = new JdbcDataLink( url = "jdbc:h2:mem:test", username = "TEST", password = "testpw", driver = "org.h2.Driver", extraProperties = extraOptions, table = "TEST_TABLE", partitionColumn = "ID", saveMode = SaveMode.Append ) val saveData = (100 until 200).map(x => test_table(x, s"$x")).toDS() jdbcDataLink.write(saveData) val data = jdbcDataLink.readAs[test_table]() data.collect() should contain theSameElementsAs (0 until 100).map(x => test_table(x, s"$x")) ++ (100 until 200).map(x => test_table(x, s"$x")) } test("Overwrite JDBC datalink") { import spark.implicits._ val jdbcDataLink = new JdbcDataLink( url = "jdbc:h2:mem:test", username = "TEST", password = "testpw", driver = "org.h2.Driver", extraProperties = extraOptions, table = "TEST_TABLE", partitionColumn = "ID", saveMode = SaveMode.Overwrite ) val saveData = (100 until 200).map(x => test_table(x, s"$x")).toDS() jdbcDataLink.write(saveData) val data = jdbcDataLink.readAs[test_table]() data.collect() should contain theSameElementsAs (100 until 200).map(x => test_table(x, s"$x")) } }
Example 92
Source File: S3Writer.scala From gimel with Apache License 2.0 | 5 votes |
package com.paypal.gimel.s3.writer

import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import com.paypal.gimel.logger.Logger
import com.paypal.gimel.s3.conf.{S3ClientConfiguration, S3Configs, S3Constants}

object S3Writer {

  val logger: Logger = Logger()

  def write(sparkSession: SparkSession, dataFrame: DataFrame, conf: S3ClientConfiguration): org.apache.spark.sql.DataFrame = {
    def MethodName: String = new Exception().getStackTrace().apply(1).getMethodName()
    logger.info(" @Begin --> " + MethodName)

    System.setProperty(S3Configs.awsServicesEnableV4, "true")
    sparkSession.conf.set(S3Configs.accessId, conf.accessId)
    sparkSession.conf.set(S3Configs.secretKey, conf.secretKey)
    sparkSession.conf.set(S3Configs.s3aClientImpl, conf.s3aImpl)
    sparkSession.conf.set(S3Configs.sslEnabled, conf.sslEnabled)
    sparkSession.conf.set(S3Configs.endPoint, conf.endPoint)
    sparkSession.conf.set(S3Configs.pathStyleAccess, conf.pathStyleAccess)

    // Map the configured save-mode string onto Spark's SaveMode; unknown values fall back to ErrorIfExists.
    val saveMode = conf.saveMode.toLowerCase() match {
      case S3Constants.appendSaveMode => SaveMode.Append
      case S3Constants.overwriteSaveMode => SaveMode.Overwrite
      case S3Constants.ignoreSaveMode => SaveMode.Ignore
      case _ => SaveMode.ErrorIfExists
    }

    conf.objectFormat match {
      case S3Constants.csvFileFormat =>
        dataFrame.write.mode(saveMode)
          .option(S3Constants.delimiter, conf.delimiter)
          .option(S3Constants.inferschema, conf.inferSchema)
          .option(S3Constants.header, conf.header)
          .csv(conf.objectPath)
      case S3Constants.jsonFileformat =>
        dataFrame.write.mode(saveMode).json(conf.objectPath)
      case S3Constants.parquetFileFormat =>
        dataFrame.write.mode(saveMode).parquet(conf.objectPath)
      case _ =>
        dataFrame.write.mode(saveMode).text(conf.objectPath)
    }
    dataFrame
  }
}
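For reference, the string-to-SaveMode mapping above can be exercised in isolation; this sketch assumes the S3Constants values are simply the lowercase mode names:

import org.apache.spark.sql.SaveMode

def toSaveMode(mode: String): SaveMode = mode.toLowerCase match {
  case "append"    => SaveMode.Append
  case "overwrite" => SaveMode.Overwrite
  case "ignore"    => SaveMode.Ignore
  case _           => SaveMode.ErrorIfExists // unknown values never overwrite data silently
}

toSaveMode("OVERWRITE") // SaveMode.Overwrite
toSaveMode("upsert")    // SaveMode.ErrorIfExists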
Example 93
Source File: Main.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.app

import com.esri.core.geometry.Polyline
import com.esri.udt.{PointType, PolylineType}
import org.apache.spark.sql.{SQLContext, SaveMode}
import org.apache.spark.{Logging, SparkConf, SparkContext}

object Main extends App with Logging {
  // The enclosing object, the `path`/`name` values and the opening `try` are omitted by the example
  // page; they are reconstructed here so the excerpt compiles (argument handling is illustrative).
  val Array(path, name) = args
  val sc = new SparkContext(new SparkConf().setAppName("spark-gdb"))
  try {
    val sqlContext = new SQLContext(sc)
    val df = sqlContext.read.format("com.esri.gdb")
      .option("path", path)
      .option("name", name)
      .option("numPartitions", "1")
      .load()
    df.printSchema()
    df.registerTempTable(name)
    sqlContext.udf.register("getX", (point: PointType) => point.x)
    sqlContext.udf.register("getY", (point: PointType) => point.y)
    sqlContext.udf.register("line", (point: PointType) => PolylineType({
      val polyline = new Polyline()
      polyline.startPath(point.x - 2, point.y - 2)
      polyline.lineTo(point.x + 2, point.y + 2)
      polyline
    }))
    sqlContext.sql(s"select line(Shape),getX(Shape)-2 as x from $name")
      .write
      .mode(SaveMode.Overwrite)
      .format("json")
      .save(s"/tmp/$name.json")
  } finally {
    sc.stop()
  }
}
Example 94
Source File: SparkBatch.scala From smart-meter with MIT License | 5 votes |
package com.logimethods.nats.connector.spark.app

import java.util.Properties
import java.io.File
import java.io.Serializable

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.log4j.{Level, LogManager, PropertyConfigurator}
import org.apache.log4j.Logger
import org.apache.spark.sql.SparkSession

//import com.datastax.spark.connector._
//import com.datastax.spark.connector.cql.CassandraConnector

// @see http://stackoverflow.com/questions/39423131/how-to-use-cassandra-context-in-spark-2-0
// @see https://databricks.com/blog/2016/08/15/how-to-use-sparksession-in-apache-spark-2-0.html
// @see https://dzone.com/articles/cassandra-with-spark-20-building-rest-api
object SparkBatch extends App {
  val logLevel = System.getenv("APP_BATCH_LOG_LEVEL")
  println("APP_BATCH_LOG_LEVEL = " + logLevel)
  if ("DEBUG" != logLevel) {
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  val cassandraUrl = System.getenv("CASSANDRA_URL")
  println("CASSANDRA_URL = " + cassandraUrl)

  val sparkMasterUrl = System.getenv("SPARK_MASTER_URL")
  println("SPARK_MASTER_URL = " + sparkMasterUrl)

  val spark = SparkSession
    .builder()
    .master(sparkMasterUrl)
    .appName("Smartmeter Batch")
    .config("spark.cassandra.connection.host", cassandraUrl)
    // .config("spark.sql.warehouse.dir", warehouseLocation)
    //.enableHiveSupport()
    .getOrCreate()

  spark
    .read
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "raw_data"))
    .load
    .createOrReplaceTempView("raw_data")

  val rawVoltageData = spark.sql("select * from raw_data")
  rawVoltageData.show(10)

  // The aggregation that builds `avgByTransformer` is elided in this excerpt; the definition below
  // is an assumed stand-in (column names are guesses) so the write that follows has an input.
  val avgByTransformer =
    spark.sql("select transformer, avg(voltage) as avg_voltage from raw_data group by transformer")

  // @see http://stackoverflow.com/questions/40324153/what-is-the-best-way-to-insert-update-rows-in-cassandra-table-via-java-spark
  // Save data to Cassandra
  import org.apache.spark.sql.SaveMode
  avgByTransformer.write
    .format("org.apache.spark.sql.cassandra")
    .options(Map("keyspace" -> "smartmeter", "table" -> "avg_voltage_by_transformer"))
    .mode(SaveMode.Overwrite)
    .save()
}
Example 95
Source File: Sink.scala From modelmatrix with Apache License 2.0 | 5 votes |
package com.collective.modelmatrix.cli import org.apache.spark.sql.{SaveMode, DataFrame, SQLContext} import scala.util.{Failure, Success, Try} sealed trait Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit } object Sink { private val hive = "hive://(.*)".r private val parquet = "parquet://(.*)".r def validate(sink: String): Either[String, Unit] = { Try(apply(sink)) match { case Success(s) => Right(()) case Failure(err) => Left(s"Unsupported sink type: $sink") } } def apply(sink: String): Sink = sink match { case hive(table) => HiveSink(table) case parquet(path) => ParquetSink(path) } } object NoSink extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { sys.error(s"Sink is not defined") } override def toString: String = "Sink is not defined" } case class HiveSink( tableName: String ) extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { df.saveAsTable(tableName, SaveMode.Overwrite) } override def toString: String = s"Hive table: $tableName" } case class ParquetSink( path: String ) extends Sink { def saveDataFrame(df: DataFrame)(implicit sqlContext: SQLContext): Unit = { df.saveAsParquetFile(path) } override def toString: String = s"Parquet: $path" }
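A short usage sketch against the definitions above (df and sqlContext are assumed to exist); note that an unsupported prefix only surfaces as a Left from validate because apply throws a MatchError inside the Try:

Sink.validate("hive://model_features")   // Right(())
Sink.validate("s3://bucket/features")    // Left("Unsupported sink type: s3://bucket/features")

val sink = Sink("parquet:///tmp/features.parquet") // ParquetSink("/tmp/features.parquet")
sink.saveDataFrame(df)(sqlContext)                 // writes via the Spark 1.x saveAsParquetFile API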
Example 96
Source File: CouchbaseSink.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql.streaming import com.couchbase.spark.Logging import org.apache.spark.sql.{DataFrame, SaveMode} import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.sql.types.StringType import com.couchbase.spark.sql._ import com.couchbase.spark._ import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.document.JsonDocument import com.couchbase.client.java.document.json.JsonObject import scala.concurrent.duration._ class CouchbaseSink(options: Map[String, String]) extends Sink with Logging { override def addBatch(batchId: Long, data: DataFrame): Unit = { val bucketName = options.get("bucket").orNull val idFieldName = options.getOrElse("idField", DefaultSource.DEFAULT_DOCUMENT_ID_FIELD) val removeIdField = options.getOrElse("removeIdField", "true").toBoolean val timeout = options.get("timeout").map(v => Duration(v.toLong, MILLISECONDS)) val createDocument = options.get("expiry").map(_.toInt) .map(expiry => (id: String, content: JsonObject) => JsonDocument.create(id, expiry, content)) .getOrElse((id: String, content: JsonObject) => JsonDocument.create(id, content)) data.toJSON .queryExecution .toRdd .map(_.get(0, StringType).asInstanceOf[UTF8String].toString()) .map { rawJson => val encoded = JsonObject.fromJson(rawJson) val id = encoded.get(idFieldName) if (id == null) { throw new Exception(s"Could not find ID field $idFieldName in $encoded") } if (removeIdField) { encoded.removeKey(idFieldName) } createDocument(id.toString, encoded) } .saveToCouchbase(bucketName, StoreMode.UPSERT, timeout) } }
Example 97
Source File: N1qlSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.n1ql import com.couchbase.client.core.CouchbaseException import com.couchbase.client.java.error.QueryExecutionException import com.couchbase.client.java.query.N1qlQuery import org.apache.spark.{SparkConf, SparkContext, SparkException} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} import org.scalatest._ import com.couchbase.spark._ import com.couchbase.spark.connection.CouchbaseConnection import com.couchbase.spark.sql.N1QLRelation import org.apache.spark.sql.types.{StringType, StructField, StructType} import scala.util.control.NonFatal class N1qlSpec extends FunSuite with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = _ override def beforeAll(): Unit = { spark = SparkSession .builder() .master(master) .appName(appName) .config("spark.couchbase.username", "Administrator") .config("spark.couchbase.password", "password") // Open 2 buckets as tests below rely on it .config("com.couchbase.bucket.default", "") .config("com.couchbase.bucket.travel-sample", "") .getOrCreate() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } test("Creating N1QLRelation with default bucket, when two buckets exist, should fail") { assertThrows[IllegalStateException] { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } } test("Creating N1QLRelation with non-default bucket, when two buckets exist, should succeed") { spark.read .format("com.couchbase.spark.sql.DefaultSource") .option("schemaFilter", N1QLRelation.filterToExpression(EqualTo("type", "airline"))) .option("schemaFilter", "`type` = 'airline'") .option("bucket", "travel-sample") .schema(StructType(StructField("name", StringType) :: Nil)) .load() } test("N1QL failures should fail the Observable") { try { spark.sparkContext .couchbaseQuery(N1qlQuery.simple("BAD QUERY"), bucketName = "default") .collect() .foreach(println) fail() } catch { case e: SparkException => assert (e.getCause.isInstanceOf[QueryExecutionException]) val err = e.getCause.asInstanceOf[QueryExecutionException] assert (err.getMessage == "syntax error - at QUERY") case NonFatal(e) => println(e) fail() } } }
Example 98
Source File: CouchbaseDataFrameSpec.scala From couchbase-spark-connector with Apache License 2.0 | 5 votes |
package com.couchbase.spark.sql import com.couchbase.spark.connection.CouchbaseConnection import org.apache.avro.generic.GenericData.StringType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession} import org.apache.spark.sql.sources.EqualTo import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.{SparkConf, SparkContext} import org.junit.runner.RunWith import org.scalatest._ import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CouchbaseDataFrameSpec extends FlatSpec with Matchers with BeforeAndAfterAll { private val master = "local[2]" private val appName = "cb-int-specs1" private var spark: SparkSession = null override def beforeAll(): Unit = { val conf = new SparkConf() .setMaster(master) .setAppName(appName) .set("spark.couchbase.nodes", "127.0.0.1") .set("com.couchbase.username", "Administrator") .set("com.couchbase.password", "password") .set("com.couchbase.bucket.default", "") .set("com.couchbase.bucket.travel-sample", "") spark = SparkSession.builder().config(conf).getOrCreate() loadData() } override def afterAll(): Unit = { CouchbaseConnection().stop() spark.stop() } def loadData(): Unit = { } "If two buckets are used and the bucket is specified the API" should "not fail" in { val ssc = spark.sqlContext ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample")) } "The DataFrame API" should "infer the schemas" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ val airline = ssc.read.couchbase(EqualTo("type", "airline"), Map("bucket" -> "travel-sample")) val airport = ssc.read.couchbase(EqualTo("type", "airport"), Map("bucket" -> "travel-sample")) val route = ssc.read.couchbase(EqualTo("type", "route"), Map("bucket" -> "travel-sample")) val landmark = ssc.read.couchbase(EqualTo("type", "landmark"), Map("bucket" -> "travel-sample")) airline .limit(10) .write .mode(SaveMode.Overwrite) .couchbase(Map("bucket" -> "default")) // TODO: validate schemas which are inferred on a field and type basis } it should "write and ignore" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ // create df, write it twice val data = ("Michael", 28, true) val df = ssc.createDataFrame(spark.sparkContext.parallelize(Seq(data))) df.write .mode(SaveMode.Ignore) .couchbase(options = Map("idField" -> "_1", "bucket" -> "default")) df.write .mode(SaveMode.Ignore) .couchbase(options = Map("idField" -> "_1", "bucket" -> "default")) } it should "filter based on a function" in { val ssc = spark.sqlContext import com.couchbase.spark.sql._ val airlineBySubstrCountry: DataFrame = ssc.read.couchbase( EqualTo("'substr(country, 0, 6)'", "United"), Map("bucket" -> "travel-sample")) airlineBySubstrCountry.count() should equal(6797) } }
Example 99
Source File: DefaultSource.scala From spark-google-spreadsheets with Apache License 2.0 | 5 votes |
package com.github.potix2.spark.google.spreadsheets import java.io.File import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, RelationProvider, SchemaRelationProvider} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { final val DEFAULT_CREDENTIAL_PATH = "/etc/gdata/credential.p12" override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]) = { createRelation(sqlContext, parameters, null) } private[spreadsheets] def pathToSheetNames(parameters: Map[String, String]): (String, String) = { val path = parameters.getOrElse("path", sys.error("'path' must be specified for spreadsheets.")) val elems = path.split('/') if (elems.length < 2) throw new Exception("'path' must be formed like '<spreadsheet>/<worksheet>'") (elems(0), elems(1)) } override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType) = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) val context = createSpreadsheetContext(parameters) createRelation(sqlContext, context, spreadsheetName, worksheetName, schema) } override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = { val (spreadsheetName, worksheetName) = pathToSheetNames(parameters) implicit val context = createSpreadsheetContext(parameters) val spreadsheet = SparkSpreadsheetService.findSpreadsheet(spreadsheetName) if(!spreadsheet.isDefined) throw new RuntimeException(s"no such a spreadsheet: $spreadsheetName") spreadsheet.get.addWorksheet(worksheetName, data.schema, data.collect().toList, Util.toRowData) createRelation(sqlContext, context, spreadsheetName, worksheetName, data.schema) } private[spreadsheets] def createSpreadsheetContext(parameters: Map[String, String]) = { val serviceAccountIdOption = parameters.get("serviceAccountId") val credentialPath = parameters.getOrElse("credentialPath", DEFAULT_CREDENTIAL_PATH) SparkSpreadsheetService(serviceAccountIdOption, new File(credentialPath)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: StructType): SpreadsheetRelation = if (schema == null) { createRelation(sqlContext, context, spreadsheetName, worksheetName, None) } else { createRelation(sqlContext, context, spreadsheetName, worksheetName, Some(schema)) } private[spreadsheets] def createRelation(sqlContext: SQLContext, context: SparkSpreadsheetService.SparkSpreadsheetContext, spreadsheetName: String, worksheetName: String, schema: Option[StructType]): SpreadsheetRelation = SpreadsheetRelation(context, spreadsheetName, worksheetName, schema)(sqlContext) }
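Writing through this source splits the save path into a spreadsheet name and a worksheet name. A minimal sketch, with illustrative sheet names and credentials, for some existing DataFrame df:

df.write
  .format("com.github.potix2.spark.google.spreadsheets")
  .option("serviceAccountId", "my-service-account@developer.gserviceaccount.com")
  .option("credentialPath", "/path/to/credential.p12")
  .save("MySpreadsheet/Sheet1")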
Example 100
Source File: GraphIO.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.utils import org.apache.hadoop.fs.Path import org.apache.spark.SparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} object GraphIO { private val DELIMITER = "delimiter" private val HEADER = "header" private val int2Long = udf[Long, Int](_.toLong) private val string2Long = udf[Long, String](_.toLong) private val int2Float = udf[Float, Int](_.toFloat) private val long2Float = udf[Float, Long](_.toFloat) private val double2Float = udf[Float, Double](_.toFloat) private val string2Float = udf[Float, String](_.toFloat) def convert2Float(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = { val tmpName = structField.name + tmpSuffix structField.dataType match { case _: LongType => df.withColumn(tmpName, long2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: IntegerType => df.withColumn(tmpName, int2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: DoubleType => df.withColumn(tmpName, double2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: StringType => df.withColumn(tmpName, string2Float(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: FloatType => df case t => throw new Exception(s"$t can't convert to Float") } } def convert2Long(df: DataFrame, structField: StructField, tmpSuffix: String): DataFrame = { val tmpName = structField.name + tmpSuffix structField.dataType match { case _: LongType => df case _: IntegerType => df.withColumn(tmpName, int2Long(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case _: StringType => df.withColumn(tmpName, string2Long(df(structField.name))) .drop(structField.name) .withColumnRenamed(tmpName, structField.name) case t => throw new Exception(s"$t can't convert to Long") } } def load(input: String, isWeighted: Boolean, srcIndex: Int = 0, dstIndex: Int = 1, weightIndex: Int = 2, sep: String = " "): DataFrame = { val ss = SparkSession.builder().getOrCreate() val schema = if (isWeighted) { StructType(Seq( StructField("src", LongType, nullable = false), StructField("dst", LongType, nullable = false), StructField("weight", FloatType, nullable = false) )) } else { StructType(Seq( StructField("src", LongType, nullable = false), StructField("dst", LongType, nullable = false) )) } ss.read .option("sep", sep) .option("header", "false") .schema(schema) .csv(input) } def save(df: DataFrame, output: String, seq: String = "\t"): Unit = { df.printSchema() df.write .mode(SaveMode.Overwrite) .option(HEADER, "false") .option(DELIMITER, seq) .csv(output) } def defaultCheckpointDir: Option[String] = { val sparkContext = SparkContext.getOrCreate() sparkContext.getConf.getOption("spark.yarn.stagingDir") .map { base => new Path(base, s".sparkStaging/${sparkContext.getConf.getAppId}").toString } } }
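A usage sketch of the helpers above: load a whitespace-separated, weighted edge list and write it back out tab-separated (paths are illustrative):

val edges = GraphIO.load("hdfs:///tmp/edges.txt", isWeighted = true)
edges.printSchema() // src: long, dst: long, weight: float

GraphIO.save(edges, "hdfs:///tmp/edges_out")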
Example 101
Source File: BigFileDatasource.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql

import java.net.URI
import java.util.ServiceLoader

import scala.collection.JavaConverters._

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

import io.projectglow.common.{GlowLogging, WithUtils}

// NOTE: this excerpt shows only the write path; the enclosing definition, together with the
// `uploaders` registry and the `logger` it refers to, is omitted by the example page.

  def write(rdd: RDD[Array[Byte]], path: String) {
    val uri = new URI(path)
    uploaders.find(_.canUpload(rdd.sparkContext.hadoopConfiguration, path)) match {
      case Some(uploader) => uploader.upload(rdd, path)
      case None =>
        logger.info(s"Could not find a parallel uploader for $path, uploading from the driver")
        writeFileFromDriver(new Path(uri), rdd)
    }
  }

  private def writeFileFromDriver(path: Path, byteRdd: RDD[Array[Byte]]): Unit = {
    val sc = byteRdd.sparkContext
    val fs = path.getFileSystem(sc.hadoopConfiguration)
    WithUtils.withCloseable(fs.create(path)) { stream =>
      WithUtils.withCachedRDD(byteRdd) { cachedRdd =>
        cachedRdd.count()
        cachedRdd.toLocalIterator.foreach { chunk =>
          stream.write(chunk)
        }
      }
    }
  }
}
Example 102
Source File: BigFileDatasourceSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import java.nio.file.{Files, Paths} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, SaveMode} class BigFileDatasourceSuite extends GlowBaseTest { test("save mode: append") { val outFile = Files.createTempFile("tmp", ".tmp").toString val e = intercept[RuntimeException] { spark .emptyDataFrame .write .mode(SaveMode.Append) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outFile) } assert( e.getMessage .contains("Append mode is not supported by io.projectglow.sql.DummyBigFileDatasource")) } test("save mode: overwrite") { val outDir = Files.createTempDirectory("tmp").toString spark .emptyDataFrame .write .mode(SaveMode.Overwrite) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outDir) val filePath = Paths.get(outDir) assert(Files.isRegularFile(filePath)) val writtenBytes = Files.readAllBytes(filePath) assert(writtenBytes.toSeq == Seq(0, 1, 2).map(_.toByte)) } test("save mode: error if exists") { val outFile = Files.createTempFile("tmp", ".tmp").toString val e = intercept[RuntimeException] { spark .emptyDataFrame .write .mode(SaveMode.ErrorIfExists) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outFile) } assert(e.getMessage.contains(s"Path $outFile already exists")) } test("save mode: ignore") { val outDir = Files.createTempDirectory("tmp").toString spark .emptyDataFrame .write .mode(SaveMode.Ignore) .format("io.projectglow.sql.DummyBigFileDatasource") .save(outDir) val dirPath = Paths.get(outDir) assert(Files.isDirectory(dirPath)) } } class DummyBigFileDatasource extends BigFileDatasource { override def serializeDataFrame( options: Map[String, String], data: DataFrame): RDD[Array[Byte]] = { data.sqlContext.sparkContext.parallelize(Seq(Array(0, 1, 2).map(_.toByte))) } }
Example 103
Source File: CreateZipcodesSpark.scala From apache-spark-test with Apache License 2.0 | 5 votes |
package com.github.dnvriend import akka.actor.ActorSystem import akka.event.{ Logging, LoggingAdapter } import akka.stream.{ ActorMaterializer, Materializer } import org.apache.spark.sql.{ SaveMode, SparkSession } import scala.concurrent.ExecutionContext object CreateZipcodesSpark extends App { implicit val system: ActorSystem = ActorSystem() implicit val mat: Materializer = ActorMaterializer() implicit val ec: ExecutionContext = system.dispatcher implicit val log: LoggingAdapter = Logging(system, this.getClass) val spark = SparkSession.builder() .config("spark.sql.warehouse.dir", "file:/tmp/spark-warehouse") .config("spark.cores.max", "4") .config("spark.scheduler.mode", "FAIR") .config("spark.sql.crossJoin.enabled", "true") .master("local[*]") // use as many threads as cores .appName("CreateZipcodesSpark").getOrCreate() import spark.implicits._ // define an RDD for the district range val districts = spark.sparkContext.parallelize(1000 to 9000).map(_.toString).toDS // create temp view districts.createOrReplaceTempView("districts") // define an RDD with a range for the letters val l1 = spark.sparkContext.parallelize('A' to 'Z').map(_.toString).toDS l1.createOrReplaceTempView("l1") // join the letters val letters = spark.sql("SELECT concat(a.value, b.value) letters from l1 a join l1 b") // define temp view letters.createOrReplaceTempView("letters") // define an RDD for the houses val houses = spark.sparkContext.makeRDD(1 to 399).toDS // create temp view houses.createOrReplaceTempView("houses") // join letters and houses val lettersWithHouseNr = spark.sql( """ |SELECT CONCAT(letters, '-', nr) letterswithhousenr FROM letters |JOIN |(SELECT format_string("%03d", value) nr FROM houses) """.stripMargin ) // create temp view lettersWithHouseNr.createOrReplaceTempView("lwh") // join the districts with the house numbers val tickets = spark.sql("SELECT concat(value, letterswithhousenr) value FROM districts JOIN lwh LIMIT 5000000") tickets.write.mode(SaveMode.Overwrite).parquet("/tmp/tickets_spark.parquet") shutdown def shutdown: Unit = { spark.stop() system.terminate() } sys.addShutdownHook(shutdown) }
Example 104
Source File: DefaultSource.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.sql

import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider, DataSourceRegister, RelationProvider, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

import com.actian.spark_vector.util.Logging
import com.actian.spark_vector.vector.VectorJDBC

class DefaultSource extends DataSourceRegister with RelationProvider with SchemaRelationProvider with CreatableRelationProvider with Logging {
  override def shortName(): String = "vector"

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation =
    VectorRelation(TableRef(parameters), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation =
    VectorRelation(TableRef(parameters), Some(schema), sqlContext, parameters)

  override def createRelation(sqlContext: SQLContext, mode: SaveMode, parameters: Map[String, String], data: DataFrame): BaseRelation = {
    val tableRef = TableRef(parameters)
    val table = VectorRelation(tableRef, sqlContext, parameters)

    mode match {
      case SaveMode.Overwrite =>
        table.insert(data, true)
      case SaveMode.ErrorIfExists =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        } else {
          throw new UnsupportedOperationException("Writing to a non-empty Vector table is not allowed with mode ErrorIfExists.")
        }
      case SaveMode.Append =>
        table.insert(data, false)
      case SaveMode.Ignore =>
        val isEmpty = VectorJDBC.withJDBC(tableRef.toConnectionProps) { _.isTableEmpty(tableRef.table) }
        if (isEmpty) {
          table.insert(data, false)
        }
    }

    table
  }
}
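Since the provider registers the short name "vector", callers reach the SaveMode handling above via format("vector"). A hedged sketch for some DataFrame df; the option keys are assumptions, the excerpt only shows that parameters are parsed by TableRef:

import org.apache.spark.sql.SaveMode

df.write
  .format("vector")
  .mode(SaveMode.Append)         // ErrorIfExists and Ignore additionally check that the target table is empty
  .option("table", "my_table")   // assumed option key
  .option("host", "vector-host") // assumed option key
  .save()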
Example 105
Source File: SparkSqlRunner.scala From amaterasu with Apache License 2.0 | 5 votes |
package org.apache.amaterasu.executor.execution.actions.runners.spark.SparkSql

import java.io.File

import org.apache.amaterasu.common.execution.actions.Notifier
import org.apache.amaterasu.common.logging.Logging
import org.apache.amaterasu.common.runtime.Environment
import org.apache.commons.io.FilenameUtils
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode, SparkSession}

// The class declaration and the fields assigned in the companion object's `apply` are omitted by
// the example page; they are reconstructed here so the excerpt is self-contained.
class SparkSqlRunner extends Logging {
  var env: Environment = _
  var jobId: String = _
  var actionName: String = _
  var notifier: Notifier = _
  var sc: SparkContext = _
  var spark: SparkSession = _

  def findFileType(folderName: File): Array[String] = {
    // get all the files from a directory
    val files: Array[File] = folderName.listFiles()
    val extensions: Array[String] = files.map(file => FilenameUtils.getExtension(file.toString))
    extensions
  }
}

object SparkSqlRunner {
  def apply(env: Environment, jobId: String, actionName: String, notifier: Notifier, sc: SparkContext): SparkSqlRunner = {
    val sparkSqlRunnerObj = new SparkSqlRunner
    sparkSqlRunnerObj.env = env
    sparkSqlRunnerObj.jobId = jobId
    sparkSqlRunnerObj.actionName = actionName
    sparkSqlRunnerObj.notifier = notifier
    sparkSqlRunnerObj.sc = sc
    sparkSqlRunnerObj.spark = SparkSession.builder().config(sc.getConf).enableHiveSupport().getOrCreate()
    sparkSqlRunnerObj
  }
}
Example 106
Source File: SparkSQL.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.sparksql import org.apache.spark.sql.{SaveMode, SparkSession} object SparkSQL { def main(args: Array[String]) { val spark = SparkSession .builder() .appName("SparkSQL-on-MaxCompute") .config("spark.sql.broadcastTimeout", 20 * 60) .config("spark.sql.crossJoin.enabled", true) .config("odps.exec.dynamic.partition.mode", "nonstrict") .config("spark.sql.catalogImplementation", "odps") .getOrCreate() val project = spark.conf.get("odps.project.name") import spark._ import sqlContext.implicits._ val tableName = "mc_test_table" val ptTableName = "mc_test_pt_table" // Drop Create sql(s"DROP TABLE IF EXISTS ${tableName}") sql(s"DROP TABLE IF EXISTS ${ptTableName}") sql(s"CREATE TABLE ${tableName} (name STRING, num BIGINT)") sql(s"CREATE TABLE ${ptTableName} (name STRING, num BIGINT) PARTITIONED BY (pt1 STRING, pt2 STRING)") val df = spark.sparkContext.parallelize(0 to 99, 2).map(f => { (s"name-$f", f) }).toDF("name", "num") val ptDf = spark.sparkContext.parallelize(0 to 99, 2).map(f => { (s"name-$f", f, "2018", "0601") }).toDF("name", "num", "pt1", "pt2") // 写 普通表 df.write.insertInto(tableName) // insertInto语义 df.write.mode("overwrite").insertInto(tableName) // insertOverwrite语义 // 写 分区表 // DataFrameWriter 无法指定分区写入 需要通过临时表再用SQL写入特定分区 df.createOrReplaceTempView(s"${ptTableName}_tmp_view") sql(s"insert into table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") sql(s"insert overwrite table ${ptTableName} partition (pt1='2018', pt2='0601') select * from ${ptTableName}_tmp_view") ptDf.write.insertInto(ptTableName) // 动态分区 insertInto语义 ptDf.write.mode("overwrite").insertInto(ptTableName) // 动态分区 insertOverwrite语义 // 读 普通表 val rdf = sql(s"select name, num from $tableName") println(s"rdf count, ${rdf.count()}") rdf.printSchema() // 读 分区表 val rptdf = sql(s"select name, num, pt1, pt2 from $ptTableName where pt1 = '2018' and pt2 = '0601'") println(s"rptdf count, ${rptdf.count()}") rptdf.printSchema() } }
Example 107
Source File: DataFrameToFileWriter.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations.readwritedataframe.filestorage import org.apache.spark.SparkException import io.deepsense.commons.utils.LoggerForCallerClass import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.WriteFileException import io.deepsense.deeplang.doperations.inout.OutputFileFormatChoice.Csv import io.deepsense.deeplang.doperations.inout.OutputStorageTypeChoice import io.deepsense.deeplang.doperations.readwritedataframe.{FilePath, FilePathFromLibraryPath, FileScheme} import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.csv.CsvSchemaStringifierBeforeCsvWriting import io.deepsense.deeplang.exceptions.DeepLangException import io.deepsense.deeplang.{ExecutionContext, FileSystemClient} import org.apache.spark.sql.SaveMode object DataFrameToFileWriter { val logger = LoggerForCallerClass() def writeToFile( fileChoice: OutputStorageTypeChoice.File, context: ExecutionContext, dataFrame: DataFrame): Unit = { implicit val ctx = context val path = FileSystemClient.replaceLeadingTildeWithHomeDirectory(fileChoice.getOutputFile()) val filePath = FilePath(path) val saveMode = if (fileChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists try { val preprocessed = fileChoice.getFileFormat() match { case csv: Csv => CsvSchemaStringifierBeforeCsvWriting.preprocess(dataFrame) case other => dataFrame } writeUsingProvidedFileScheme(fileChoice, preprocessed, filePath, saveMode) } catch { case e: SparkException => logger.error(s"WriteDataFrame error: Spark problem. Unable to write file to $path", e) throw WriteFileException(path, e) } } private def writeUsingProvidedFileScheme( fileChoice: OutputStorageTypeChoice.File, dataFrame: DataFrame, path: FilePath, saveMode: SaveMode )(implicit context: ExecutionContext): Unit = { import FileScheme._ path.fileScheme match { case Library => val filePath = FilePathFromLibraryPath(path) val FilePath(_, libraryPath) = filePath new java.io.File(libraryPath).getParentFile.mkdirs() writeUsingProvidedFileScheme(fileChoice, dataFrame, filePath, saveMode) case FileScheme.File => DriverFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HDFS => ClusterFiles.write(dataFrame, path, fileChoice.getFileFormat(), saveMode) case HTTP | HTTPS | FTP => throw NotSupportedScheme(path.fileScheme) } } case class NotSupportedScheme(fileScheme: FileScheme) extends DeepLangException(s"Not supported file scheme ${fileScheme.pathPrefix}") }
Example 108
Source File: WriteDataFrame.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperations import java.io.IOException import java.util.Properties import scala.reflect.runtime.{universe => ru} import io.deepsense.commons.utils.Version import io.deepsense.deeplang.DOperation.Id import io.deepsense.deeplang._ import io.deepsense.deeplang.documentation.OperationDocumentation import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.DeepSenseIOException import io.deepsense.deeplang.doperations.inout._ import io.deepsense.deeplang.doperations.readwritedataframe.filestorage.DataFrameToFileWriter import io.deepsense.deeplang.doperations.readwritedataframe.googlestorage.DataFrameToGoogleSheetWriter import io.deepsense.deeplang.doperations.readwritedataframe.validators.{FilePathHasValidFileScheme, ParquetSupportedOnClusterOnly} import io.deepsense.deeplang.inference.{InferContext, InferenceWarnings} import io.deepsense.deeplang.params.choice.ChoiceParam import io.deepsense.deeplang.params.{Param, Params} import org.apache.spark.sql.SaveMode class WriteDataFrame() extends DOperation1To0[DataFrame] with Params with OperationDocumentation { override val id: Id = "9e460036-95cc-42c5-ba64-5bc767a40e4e" override val name: String = "Write DataFrame" override val description: String = "Writes a DataFrame to a file or database" override val since: Version = Version(0, 4, 0) @transient override lazy val tTagTI_0: ru.TypeTag[DataFrame] = ru.typeTag[DataFrame] val storageType = ChoiceParam[OutputStorageTypeChoice]( name = "data storage type", description = Some("Storage type.")) def getStorageType(): OutputStorageTypeChoice = $(storageType) def setStorageType(value: OutputStorageTypeChoice): this.type = set(storageType, value) val params: Array[Param[_]] = Array(storageType) setDefault(storageType, new OutputStorageTypeChoice.File()) override def execute(dataFrame: DataFrame)(context: ExecutionContext): Unit = { import OutputStorageTypeChoice._ try { getStorageType() match { case jdbcChoice: Jdbc => writeToJdbc(jdbcChoice, context, dataFrame) case googleSheetChoice: GoogleSheet => DataFrameToGoogleSheetWriter.writeToGoogleSheet( googleSheetChoice, context, dataFrame ) case fileChoice: File => DataFrameToFileWriter.writeToFile(fileChoice, context, dataFrame) } } catch { case e: IOException => logger.error(s"WriteDataFrame error. Could not write file to designated storage", e) throw DeepSenseIOException(e) } } private def writeToJdbc( jdbcChoice: OutputStorageTypeChoice.Jdbc, context: ExecutionContext, dataFrame: DataFrame): Unit = { val properties = new Properties() properties.setProperty("driver", jdbcChoice.getJdbcDriverClassName) val jdbcUrl = jdbcChoice.getJdbcUrl val jdbcTableName = jdbcChoice.getJdbcTableName val saveMode = if (jdbcChoice.getShouldOverwrite) SaveMode.Overwrite else SaveMode.ErrorIfExists dataFrame.sparkDataFrame.write.mode(saveMode).jdbc(jdbcUrl, jdbcTableName, properties) } override def inferKnowledge(k0: DKnowledge[DataFrame])(context: InferContext): (Unit, InferenceWarnings) = { FilePathHasValidFileScheme.validate(this) ParquetSupportedOnClusterOnly.validate(this) super.inferKnowledge(k0)(context) } }
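The common thread in the examples on this page is that a boolean or string setting is translated into one of the four SaveMode values before the DataFrameWriter is invoked. In plain Spark the four behaviours look like this (df and the output path are illustrative):

import org.apache.spark.sql.SaveMode

df.write.mode(SaveMode.ErrorIfExists).parquet("/tmp/out") // default: fail if /tmp/out already exists
df.write.mode(SaveMode.Overwrite).parquet("/tmp/out")     // replace any existing data
df.write.mode(SaveMode.Append).parquet("/tmp/out")        // add new files next to the existing ones
df.write.mode(SaveMode.Ignore).parquet("/tmp/out")        // silently skip the write if data exists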