org.apache.spark.sql.functions.lit Scala Examples
The following examples show how to use org.apache.spark.sql.functions.lit.
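As a quick orientation before the examples: lit wraps a Scala literal in a Column so it can be used inside DataFrame expressions, most commonly to add a constant column. Below is a minimal, self-contained sketch (the column names and values are illustrative and not taken from any of the projects listed here):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.lit

object LitDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lit-demo").getOrCreate()
    import spark.implicits._

    val df = Seq(1, 2, 3).toDF("id")
      .withColumn("source", lit("demo")) // constant string column
      .withColumn("weight", lit(1.0))    // constant double column

    df.show()
    spark.stop()
  }
}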
Example 1
Source File: TestUtils.scala From m3d-engine with Apache License 2.0
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf = df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf = anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
}
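A hedged sketch of how this implicit class would typically be used in a spark-shell session; the DataFrames below are assumptions for illustration, not part of the original project:

import org.apache.spark.sql.SparkSession
import com.adidas.utils.TestUtils.ExtendedDataFrame

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val actualDf = Seq((1, "a"), (2, "b")).toDF("id", "name")
val expectedDf = Seq((1, "a"), (2, "b")).toDF("id", "name")

// hasDiff returns true when the two DataFrames differ (order-insensitive, duplicate-aware)
assert(!actualDf.hasDiff(expectedDf))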
Example 2
Source File: Uniqueness.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {

  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
}
Example 3
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble
    toSuccessMetric(numUniqueValues / numDistinctValues)
  }

  override def filterCondition: Option[String] = where
}

object UniqueValueRatio {

  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)
  }

  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
  }
}
Example 4
Source File: EnrichPostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0
package it.agilelab.bigdata.DataQuality.postprocessors

import java.util

import com.typesafe.config.Config
import it.agilelab.bigdata.DataQuality.checks.CheckResult
import it.agilelab.bigdata.DataQuality.exceptions.IllegalParameterException
import it.agilelab.bigdata.DataQuality.metrics.MetricResult
import it.agilelab.bigdata.DataQuality.sources.HdfsFile
import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig
import it.agilelab.bigdata.DataQuality.utils
import it.agilelab.bigdata.DataQuality.utils.DQSettings
import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter}
import org.apache.hadoop.fs.FileSystem
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, SQLContext}

import scala.collection.JavaConversions._
import scala.util.Try

final class EnrichPostprocessor(config: Config, settings: DQSettings)
    extends BasicPostprocessor(config, settings) {

  private val vs: Option[String] = Try(config.getString("source")).toOption
  private val metrics: util.List[String] = config.getStringList("metrics")
  private val checks: util.List[String] = config.getStringList("checks")
  private val extra = config.getObject("extra").toMap

  private val target: HdfsTargetConfig = {
    val conf = config.getConfig("saveTo")
    utils.parseTargetConfig(conf)(settings).get
  }

  override def process(vsRef: Set[HdfsFile],
                       metRes: Seq[MetricResult],
                       chkRes: Seq[CheckResult])(
      implicit fs: FileSystem,
      sqlContext: SQLContext,
      settings: DQSettings): HdfsFile = {

    import sqlContext.implicits._

    val df: DataFrame = vs match {
      case Some(vsource) =>
        val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vsource).head
        HdfsReader.load(reqVS, settings.ref_date).head
      case None =>
        sqlContext.sparkContext.parallelize(Seq(1)).toDF("teapot")
    }

    val reqMet: Seq[(String, Double)] = metRes
      .filter(mr => metrics.contains(mr.metricId))
      .map(mr => mr.metricId -> mr.result)
    val reqCheck: Seq[(String, String)] = chkRes
      .filter(cr => checks.contains(cr.checkId))
      .map(cr => cr.checkId -> cr.status)

    if (reqMet.size != metrics.size())
      throw IllegalParameterException("Some of stated metrics are missing!")
    if (reqCheck.size != checks.size())
      throw IllegalParameterException("Some of stated checks are missing!")

    val dfWithMet: DataFrame =
      reqMet.foldLeft(df)((df, met) => df.withColumn(met._1, lit(met._2)))
    val dfWithChecks =
      reqCheck.foldLeft(dfWithMet)((df, met) => df.withColumn(met._1, lit(met._2)))
    val dfWithExtra =
      extra.foldLeft(dfWithChecks)((df, ex) => df.withColumn(ex._1, lit(ex._2.unwrapped())))

    HdfsWriter.saveVirtualSource(
      dfWithExtra.drop("teapot"),
      target,
      settings.refDateString)(fs, sqlContext.sparkContext)

    new HdfsFile(target)
  }
}
Example 5
Source File: SchemaColumnFixed.scala From data-faker with MIT License
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.lit

case class SchemaColumnFixed[T](override val name: String, value: T) extends SchemaColumn {
  override def column(rowID: Option[Column] = None): Column = lit(value)
}

object SchemaColumnFixedProtocol extends SchemaColumnFixedProtocol

trait SchemaColumnFixedProtocol extends YamlParserProtocol {

  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnFixedFormat extends YamlFormat[SchemaColumnFixed[_]] {

    override def read(yaml: YamlValue): SchemaColumnFixed[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))
      val value = fields.getOrElse(YamlString("value"), deserializationError(s"value not set for $name"))

      dataType match {
        case SchemaColumnDataType.Int => SchemaColumnFixed(name, value.convertTo[Int])
        case SchemaColumnDataType.Long => SchemaColumnFixed(name, value.convertTo[Long])
        case SchemaColumnDataType.Float => SchemaColumnFixed(name, value.convertTo[Float])
        case SchemaColumnDataType.Double => SchemaColumnFixed(name, value.convertTo[Double])
        case SchemaColumnDataType.Date => SchemaColumnFixed(name, value.convertTo[Date])
        case SchemaColumnDataType.Timestamp => SchemaColumnFixed(name, value.convertTo[Timestamp])
        case SchemaColumnDataType.String => SchemaColumnFixed(name, value.convertTo[String])
        case SchemaColumnDataType.Boolean => SchemaColumnFixed(name, value.convertTo[Boolean])
        case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Fixed}")
      }
    }

    override def write(obj: SchemaColumnFixed[_]): YamlValue = ???
  }
}
Example 6
Source File: SimpleJsonIngestionJob.scala From comet-data-pipeline with Apache License 2.0
package com.ebiznext.comet.job.ingest

import com.ebiznext.comet.config.Settings
import com.ebiznext.comet.schema.handlers.{SchemaHandler, StorageHandler}
import com.ebiznext.comet.schema.model._
import org.apache.hadoop.fs.Path
import org.apache.spark.sql.functions.lit
import org.apache.spark.sql.{DataFrame, Encoders}

import scala.util.{Failure, Success, Try}

class SimpleJsonIngestionJob(
  domain: Domain,
  schema: Schema,
  types: List[Type],
  path: List[Path],
  storageHandler: StorageHandler,
  schemaHandler: SchemaHandler
)(implicit settings: Settings)
    extends DsvIngestionJob(domain, schema, types, path, storageHandler, schemaHandler) {

  override def loadDataSet(): Try[DataFrame] = {
    try {
      val df =
        if (metadata.isArray()) {
          val jsonRDD =
            session.sparkContext.wholeTextFiles(path.map(_.toString).mkString(",")).map(_._2)

          session.read
            .json(session.createDataset(jsonRDD)(Encoders.STRING))
            .withColumn(
              // Spark cannot detect the input file automatically, so we should add it explicitly
              Settings.cometInputFileNameColumn,
              if (settings.comet.grouped) lit(path.map(_.toString).mkString(","))
              else lit(path.head.toString)
            )
        } else {
          session.read
            .option("encoding", metadata.getEncoding())
            .option("multiline", metadata.getMultiline())
            .json(path.map(_.toString): _*)
            .withColumn(
              // Spark here can detect the input file automatically, so we're just using the input_file_name spark function
              Settings.cometInputFileNameColumn,
              org.apache.spark.sql.functions.input_file_name()
            )
        }

      import session.implicits._
      val resDF = if (df.columns.contains("_corrupt_record")) {
        //TODO send rejected records to rejected area
        logger.whenDebugEnabled {
          df.filter($"_corrupt_record".isNotNull).show(1000, false)
        }
        throw new Exception(
          s"""Invalid JSON File: ${path
            .map(_.toString)
            .mkString(",")}. SIMPLE_JSON require a valid json file """
        )
      } else {
        df
      }
      Success(resDF)
    } catch {
      case e: Exception => Failure(e)
    }
  }
}
Example 7
Source File: PowerBiSuite.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.io.split1

import java.io.File

import com.microsoft.ml.spark.Secrets
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.io.powerbi.PowerBIWriter
import org.apache.spark.SparkException
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.functions.{current_timestamp, lit}

import scala.collection.JavaConverters._

class PowerBiSuite extends TestBase with FileReaderUtils {

  lazy val url: String = sys.env.getOrElse("MML_POWERBI_URL", Secrets.PowerbiURL)

  lazy val df: DataFrame = session
    .createDataFrame(Seq(
      (Some(0), "a"),
      (Some(1), "b"),
      (Some(2), "c"),
      (Some(3), ""),
      (None, "bad_row")))
    .toDF("bar", "foo")
    .withColumn("baz", current_timestamp())

  lazy val bigdf: DataFrame = (1 to 5).foldRight(df) { case (_, ldf) => ldf.union(df) }.repartition(2)

  lazy val delayDF: DataFrame = {
    val rows = Array.fill(100){df.collect()}.flatten.toList.asJava
    val df2 = session
      .createDataFrame(rows, df.schema)
      .coalesce(1).cache()
    df2.count()
    df2.map({x => Thread.sleep(10); x})(RowEncoder(df2.schema))
  }

  test("write to powerBi", TestBase.BuildServer) {
    PowerBIWriter.write(df, url)
  }

  test("write to powerBi with delays"){
    PowerBIWriter.write(delayDF, url)
  }

  test("using dynamic minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"dynamic", "maxBatchSize"->"50"))
  }

  test("using timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map("minibatcher"->"timed"))
  }

  test("using consolidated timed minibatching"){
    PowerBIWriter.write(delayDF, url, Map(
      "minibatcher"->"timed",
      "consolidate"->"true"))
  }

  test("using buffered batching"){
    PowerBIWriter.write(delayDF, url, Map("buffered"->"true"))
  }

  ignore("throw useful error message when given an improper dataset") {
    //TODO figure out why this does not throw errors on the build machine
    assertThrows[SparkException] {
      PowerBIWriter.write(df.withColumn("bad", lit("foo")), url)
    }
  }

  test("stream to powerBi", TestBase.BuildServer) {
    bigdf.write.parquet(tmpDir + File.separator + "powerBI.parquet")
    val sdf = session.readStream.schema(df.schema).parquet(tmpDir + File.separator + "powerBI.parquet")
    val q1 = PowerBIWriter.stream(sdf, url).start()
    q1.processAllAvailable()
  }
}
Example 8
Source File: ServingUDFs.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

object ServingUDFs {

  private def jsonReply(c: Column) = string_to_response(to_json(c))

  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType =>
        at.elementType match {
          case _: StructType => jsonReply(data)
          case _: MapType => jsonReply(data)
          case _ => jsonReply(struct(data))
        }
      case _ => jsonReply(struct(data))
    }
  }

  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .toOption.isDefined
    }
  }

  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }
}
Example 9
Source File: DeltaSourceSnapshot.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.files

import org.apache.spark.sql.delta.{DeltaLog, DeltaTableUtils, Snapshot}
import org.apache.spark.sql.delta.sources.IndexedFile
import org.apache.spark.sql.delta.util.StateCache
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.functions.lit

class DeltaSourceSnapshot(
    val spark: SparkSession,
    val snapshot: Snapshot,
    val filters: Seq[Expression])
  extends SnapshotIterator
  with StateCache {

  protected val version = snapshot.version
  protected val path = snapshot.path

  protected lazy val (partitionFilters, dataFilters) = {
    val partitionCols = snapshot.metadata.partitionColumns
    filters.partition { e =>
      DeltaTableUtils.isPredicatePartitionColumnsOnly(e, partitionCols, spark)
    }
  }

  protected def initialFiles: Dataset[IndexedFile] = {
    import spark.implicits._

    cacheDS(
      snapshot.allFiles.sort("modificationTime", "path")
        .rdd.zipWithIndex()
        .toDF("add", "index")
        .withColumn("version", lit(version))
        .withColumn("isLast", lit(false))
        .as[IndexedFile],
      s"Delta Source Snapshot #$version - ${snapshot.redactedPath}").getDS
  }

  override def close(unpersistSnapshot: Boolean): Unit = {
    super.close(unpersistSnapshot)
    if (unpersistSnapshot) {
      snapshot.uncache()
    }
  }
}

trait SnapshotIterator {
  self: DeltaSourceSnapshot =>

  private var result: Iterable[IndexedFile] = _

  def iterator(): Iterator[IndexedFile] = {
    import spark.implicits._
    if (result == null) {
      result = DeltaLog.filterFileList(
        snapshot.metadata.partitionSchema,
        initialFiles.toDF(),
        partitionFilters,
        Seq("add")).as[IndexedFile].collect().toIterable
    }
    // This will always start from the beginning and re-use resources. If any exceptions were to
    // be thrown, the stream would stop, we would call stop on the source, and that will make
    // sure that we clean up resources.
    result.toIterator
  }

  def close(unpersistSnapshot: Boolean): Unit = { }
}
Example 10
Source File: GpuDSArrayMult.scala From GPUEnabler with Apache License 2.0
package com.ibm.gpuenabler

import org.apache.spark.SparkEnv
import org.apache.spark.sql.functions.lit
import com.ibm.gpuenabler.CUDADSImplicits._

object GpuDSArrayMult {

  case class jsonData(name: String, factor: Long, arr: Array[Long])
  case class inputData(name: String, factor: Long, arr: Array[Long], result: Array[Long])
  case class outputData(name: String, result: Array[Long])

  def main(args: Array[String]): Unit = {

    val ss = org.apache.spark.sql.SparkSession.builder.master("local[*]").appName("test").getOrCreate()
    import ss.implicits._

    if (args.length > 0) {
      println("Setting debug Mode" + args(0))
      SparkEnv.get.conf.set("DebugMode", args(0))
    }

    val ptxURL = "/GpuEnablerExamples.ptx"

    // 1. Sample Map Operation - multiple every element in the array by 2
    val mulFunc = DSCUDAFunction("multiplyBy2", Seq("value"), Seq("value"), ptxURL)

    val N: Long = 100000
    val dataPts = ss.range(1, N + 1, 1, 10).cache
    val results = dataPts.mapExtFunc(_ * 2, mulFunc).collect()
    println("Count is " + results.length)
    assert(results.length == N)

    val expResults = (1 to N.toInt).map(_ * 2)
    assert(results.sameElements(expResults))

    // 2. Sample Reduce Operation - Sum of all elements in the array
    val dimensions = (size: Long, stage: Int) => stage match {
      case 0 => (64, 256, 1, 1, 1, 1)
      case 1 => (1, 1, 1, 1, 1, 1)
    }

    val gpuParams = gpuParameters(dimensions)

    val sumFunc = DSCUDAFunction(
      "suml",
      Array("value"),
      Array("value"),
      ptxURL,
      Some((size: Long) => 2),
      Some(gpuParams),
      outputSize = Some(1))

    val results2 = dataPts
      .mapExtFunc(_ * 2, mulFunc)
      .reduceExtFunc(_ + _, sumFunc)

    println("Output is " + results2)
    println("Expected is " + (N * (N + 1)))
    assert(results2 == N * (N + 1))

    // 3. Dataset - GPU Map - Dataset Operation.
    val ds = ss.read.json("src/main/resources/data.json").as[jsonData]
    val dds = ds.withColumn("result", lit(null: Array[Double])).as[inputData]

    val dsFunc = DSCUDAFunction("arrayTest", Seq("factor", "arr"), Seq("result"), ptxURL)

    val mapDS = dds.mapExtFunc(x => outputData(x.name, x.result),
      dsFunc,
      Array((1 to 10).map(_ * 3).toArray, (1 to 35).map(_.toLong).toArray),
      outputArraySizes = Array(3))

    mapDS.select($"name", $"result").show()
  }
}
Example 11
Source File: AnyValInstances.scala From cleanframes with Apache License 2.0
package cleanframes.instances

import cleanframes.Cleaner
import org.apache.spark.sql.functions.{lower, trim, when, lit}
import org.apache.spark.sql.types._

trait AnyValInstances
  extends IntInstances
    with ByteInstances
    with CharInstances
    with ShortInstances
    with LongInstances
    with FloatInstances
    with DoubleInstances
    with BooleanInstances
    with NumericAnyValInstance

trait IntInstances {
  implicit lazy val integerType: SparkDataType[Int] = new SparkDataType[Int] {
    override def getDataType: DataType = IntegerType
  }
}

trait ByteInstances {
  implicit lazy val byteType: SparkDataType[Byte] = new SparkDataType[Byte] {
    override def getDataType: DataType = ByteType
  }
}

trait CharInstances {
  implicit val stdStringToChar: String => Char = _.charAt(0)
}

trait ShortInstances {
  implicit lazy val shortType: SparkDataType[Short] = new SparkDataType[Short] {
    override def getDataType: DataType = ShortType
  }
}

trait LongInstances {
  implicit lazy val longType: SparkDataType[Long] = new SparkDataType[Long] {
    override def getDataType: DataType = LongType
  }
}

trait FloatInstances {
  implicit lazy val floatType: SparkDataType[Float] = new SparkDataType[Float] {
    override def getDataType: DataType = FloatType
  }
}

trait DoubleInstances {
  implicit lazy val doubleType: SparkDataType[Double] = new SparkDataType[Double] {
    override def getDataType: DataType = DoubleType
  }
}

trait BooleanInstances {
  implicit lazy val booleanCleaner: Cleaner[Option[Boolean]] = {
    Cleaner.materialize { (frame, name, alias) =>
      List(
        when(
          trim(lower(frame.col(name.get))) === "true",
          lit(true) cast BooleanType
        ).otherwise(false) as alias.get
      )
    }
  }
}
Example 12
Source File: IUberdataForecastUtil.scala From uberdata with Apache License 2.0
package eleflow.uberdata

import eleflow.uberdata.core.IUberdataContext
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.lit

object IUberdataForecastUtil {

  lazy val FEATURES_PREDICTION_COL_NAME = "featuresPrediction"
  lazy val FEATURES_COL_NAME = "features"
  lazy val ALGORITHM = "algorithm"
  lazy val PARAMS = "parameters"
  lazy val METRIC_COL_NAME = "metric"

  def convertColumnToLong(row: Row, columnIndex: Int): Row = {
    row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.getTime) ++ after.tail :+ s
        Row(result: _*)
      case d: Double =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ d.toLong) ++ after.tail :+ d
        Row(result: _*)
      case i: Int =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ i.toLong) ++ after.tail :+ i
        Row(result: _*)
      case s: Short =>
        val (prior, after) = row.toSeq.splitAt(columnIndex)
        val result = (prior :+ s.toLong) ++ after.tail :+ s
        Row(result: _*)
      case _ => row
    }
  }

  def convertColumnToLongAddAtEnd(row: Row, columnIndex: Int): Row = {
    val result = row.get(columnIndex) match {
      case s: java.sql.Timestamp =>
        val result = row.toSeq :+ s.getTime
        Row(result: _*)
      case d: Double =>
        val result = row.toSeq :+ d.toLong
        Row(result: _*)
      case i: Int =>
        val result = row.toSeq :+ i.toLong
        Row(result: _*)
      case s: Short =>
        val result = row.toSeq :+ s.toLong
        Row(result: _*)
      case _ => row
    }
    result
  }

  def createIdColColumn(dataFrame: DataFrame, context: IUberdataContext): DataFrame = {
    val arrId = dataFrame.rdd.zipWithIndex.map(x => x._1.toSeq :+ x._2).map(x => Row.fromSeq(x))
    context.sqlContext.createDataFrame(arrId, dataFrame.withColumn("idCol", lit(1L: Long)).schema)
  }
}
Example 13
Source File: SuiteKickoff.scala From spark-bench with Apache License 2.0
package com.ibm.sparktc.sparkbench.workload

import com.ibm.sparktc.sparkbench.utils.SparkFuncs._
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

import scala.collection.parallel.ForkJoinTaskSupport

object SuiteKickoff {
  private val log = org.slf4j.LoggerFactory.getLogger(getClass)

  def run(s: Suite, spark: SparkSession): Unit = {
    verifyOutput(s.benchmarkOutput, s.saveMode, spark)

    // Translate the maps into runnable workloads
    val workloads: Seq[Workload] = s.workloadConfigs.map(ConfigCreator.mapToConf)

    val dataframes: Seq[DataFrame] = (0 until s.repeat).flatMap { i =>
      // This will produce one DataFrame of one row for each workload in the sequence.
      // We're going to produce one coherent DF later from these
      val dfSeqFromOneRun: Seq[DataFrame] = {
        if (s.parallel) runParallel(workloads, spark)
        else runSerially(workloads, spark)
      }
      // Indicate which run of this suite this was.
      dfSeqFromOneRun.map(_.withColumn("run", lit(i)))
    }

    // getting the Spark confs so we can output them in the results.
    val strSparkConfs = spark.conf.getAll

    // Ah, see, here's where we're joining that series of one-row DFs
    val singleDF = joinDataFrames(dataframes, spark)
    s.description.foreach(log.info)
    // And now we're going to curry in the results
    val plusSparkConf = addConfToResults(singleDF, strSparkConfs)
    val plusDescription = addConfToResults(plusSparkConf, Map("description" -> s.description)).coalesce(1)
    // And write to disk. We're done with this suite!
    if (s.benchmarkOutput.nonEmpty) writeToDisk(s.benchmarkOutput.get, s.saveMode, plusDescription, spark)
  }

  private def runParallel(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    val confSeqPar = workloadConfigs.par
    confSeqPar.tasksupport = new ForkJoinTaskSupport(new scala.concurrent.forkjoin.ForkJoinPool(confSeqPar.size))
    confSeqPar.map(_.run(spark)).seq
  }

  private def runSerially(workloadConfigs: Seq[Workload], spark: SparkSession): Seq[DataFrame] = {
    workloadConfigs.map(_.run(spark))
  }

  private def joinDataFrames(seq: Seq[DataFrame], spark: SparkSession): DataFrame = {
    if (seq.length == 1) seq.head
    else {
      val seqOfColNames = seq.map(_.columns.toSet)
      val allTheColumns = seqOfColNames.foldLeft(Set[String]())(_ ++ _)

      def expr(myCols: Set[String], allCols: Set[String]) = {
        allCols.toList.map {
          case x if myCols.contains(x) => col(x)
          case x => lit(null).as(x)
        }
      }

      val seqFixedDfs = seq.map(df => df.select(expr(df.columns.toSet, allTheColumns): _*))

      // Folding left across this sequence should be fine because each DF should only have 1 row
      // Nevarr Evarr do this to legit dataframes that are all like big and stuff
      seqFixedDfs.foldLeft(spark.createDataFrame(spark.sparkContext.emptyRDD[Row], seqFixedDfs.head.schema))(_ union _)
    }
  }
}
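The joinDataFrames helper above relies on a common idiom: pad each DataFrame with lit(null) columns for whatever columns it lacks, so every frame shares one schema before union. A minimal standalone sketch of that idiom for a spark-shell session (column names and data are illustrative, not from spark-bench):

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.functions.{col, lit}

val spark = SparkSession.builder().master("local[*]").getOrCreate()
import spark.implicits._

val left = Seq((1, "a")).toDF("id", "name")
val right = Seq((2, 3.5)).toDF("id", "score")

val allCols = (left.columns.toSet ++ right.columns.toSet).toList.sorted

def pad(df: DataFrame): DataFrame =
  df.select(allCols.map {
    case c if df.columns.contains(c) => col(c)
    case c => lit(null).as(c) // a missing column becomes a null constant
  }: _*)

pad(left).union(pad(right)).show()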
Example 14
Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating Keypsace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")
  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream =
    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServer)
      .option("subscribe", topic)
      .load()

  info("Writing data to Cassandra...")
  val query =
    dataStream
      .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
      .groupBy(col("word"))
      .agg(sum("count").as("count"))
      .writeStream
      .outputMode(OutputMode.Update())
      .foreach(writeToCassandra)
      .option("checkpointLocation", checkPointDir)
      .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}
Example 15
Source File: AFTSurvivalRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.regression.AFTSurvivalRegression
import org.apache.spark.sql._
import org.apache.spark.sql.functions.lit

class AFTSurvivalRegressionParitySpec extends SparkParityBase {
  override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")
    .withColumn("censor", lit(1.0))

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer().
      setInputCol("fico_score_group_fnl").
      setOutputCol("fico_index"),
    new OneHotEncoderEstimator().
      setInputCols(Array("fico_index")).
      setOutputCols(Array("fico")),
    new VectorAssembler().
      setInputCols(Array("fico", "dti")).
      setOutputCol("features"),
    new AFTSurvivalRegression().
      setQuantileProbabilities(Array(0.5)).
      setFeaturesCol("features").
      setLabelCol("loan_amount").
      setQuantilesCol("quant").
      setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("labelCol", "stringOrderType", "maxIter", "tol")
}
Example 16
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.TimeLimits

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession}
import org.apache.spark.sql.functions.{array, col, count, lit}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.unsafe.Platform
import org.apache.spark.util.ResetSystemProperties

// Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit.
class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite
  with Matchers
  with BeforeAndAfterEach
  with ResetSystemProperties {

  test("Generated code on driver should not embed platform-specific constant") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched
    // settings of UseCompressedOops JVM option.
    val argsForSparkSubmit = Seq(
      "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--master", "local-cluster[1,1,1024]",
      "--driver-memory", "1g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops",
      "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops",
      unusedJar.toString)
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging {

  var spark: SparkSession = _

  def main(args: Array[String]): Unit = {
    TestUtils.configTestLog4j("INFO")

    spark = SparkSession.builder().getOrCreate()

    // Make sure the test is run where the driver and the executors uses different object layouts
    val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET
    val executorArrayHeaderSize =
      spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt
    assert(driverArrayHeaderSize > executorArrayHeaderSize)

    val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v")
      .groupBy(array(col("v"))).agg(count(col("*")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)

    val expectedAnswer =
      Row(Array(0), 7178) ::
        Row(Array(1), 7178) ::
        Row(Array(2), 7178) ::
        Row(Array(3), 7177) ::
        Row(Array(4), 7177) ::
        Row(Array(5), 7177) ::
        Row(Array(6), 7177) ::
        Row(Array(7), 7177) ::
        Row(Array(8), 7177) ::
        Row(Array(9), 7177) :: Nil
    val result = df.collect
    QueryTest.sameRows(result.toSeq, expectedAnswer) match {
      case Some(errMsg) => fail(errMsg)
      case _ =>
    }
  }
}
Example 17
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {

    val batchSize = 10

    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()

    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)
    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()
    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()

    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)

    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)

    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()
    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }
}
Example 18
Source File: UserActionsRateSource.scala From spark-structured-streaming-examples with Apache License 2.0
package com.phylosoft.spark.learning.sql.streaming.source.rate

import org.apache.spark.sql.functions.{col, lit, pmod, rand}
import org.apache.spark.sql.{DataFrame, SparkSession}

class UserActionsRateSource(val spark: SparkSession,
                            val rowsPerSecond: String = "5",
                            val numPartitions: String = "1")
  extends RateSource {

  def loadUserActions(): DataFrame = {
    readStream()
      .where((rand() * 100).cast("integer") < 30) // 30 out of every 100 user actions
      .select(pmod(col("value"), lit(9)).as("userId"), col("timestamp").as("actionTime"))
  }
}
Example 19
Source File: TestIndexing.scala From spark-solr with Apache License 2.0
package com.lucidworks.spark

import java.util.UUID

import com.lucidworks.spark.util.SolrDataFrameImplicits._
import com.lucidworks.spark.util.{ConfigurationConstants, SolrCloudUtil, SolrQuerySupport, SolrSupport}
import org.apache.spark.sql.functions.{concat, lit}
import org.apache.spark.sql.types.{DataTypes, StructField, StructType}

class TestIndexing extends TestSuiteBuilder {

  test("Load csv file and index to Solr") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/nyc_yellow_taxi_sample_1k.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)
      assert(csvDF.count() == 999)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName)
      val newDF = csvDF
        .withColumn("pickup_location", concat(csvDF.col("pickup_latitude"), lit(","), csvDF.col("pickup_longitude")))
        .withColumn("dropoff_location", concat(csvDF.col("dropoff_latitude"), lit(","), csvDF.col("dropoff_longitude")))
      newDF.write.option("zkhost", zkHost).option(ConfigurationConstants.GENERATE_UNIQUE_KEY, "true").solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrDF = sparkSession.read.format("solr").options(solrOpts).load()
      solrDF.printSchema()
      assert(solrDF.count() == 999)
      solrDF.take(10)
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Solr field types config") {
    val collectionName = "testIndexing-" + UUID.randomUUID().toString
    SolrCloudUtil.buildCollection(zkHost, collectionName, null, 2, cloudClient, sc)
    try {
      val csvFileLocation = "src/test/resources/test-data/simple.csv"
      val csvDF = sparkSession.read.format("com.databricks.spark.csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(csvFileLocation)

      val solrOpts = Map("zkhost" -> zkHost, "collection" -> collectionName,
        ConfigurationConstants.SOLR_FIELD_TYPES -> "ntitle:text_en,nrating:string")
      csvDF.write.options(solrOpts).solr(collectionName)

      // Explicit commit to make sure all docs are visible
      val solrCloudClient = SolrSupport.getCachedCloudClient(zkHost)
      solrCloudClient.commit(collectionName, true, true)

      val solrBaseUrl = SolrSupport.getSolrBaseUrl(zkHost)
      val solrUrl = solrBaseUrl + collectionName + "/"

      val fieldTypes = SolrQuerySupport.getFieldTypes(Set.empty, solrUrl, cloudClient, collectionName)
      assert(fieldTypes("nrating").fieldType === "string")
      assert(fieldTypes("ntitle").fieldType === "text_en")
    } finally {
      SolrCloudUtil.deleteCollection(collectionName, cluster)
    }
  }

  test("Field additions") {
    val insertSchema = StructType(Array(
      StructField("index_only_field", DataTypes.StringType, nullable = true),
      StructField("store_only_field", DataTypes.BooleanType, nullable = true),
      StructField("a_s", DataTypes.StringType, nullable = true),
      StructField("s_b", DataTypes.StringType, nullable = true)
    ))
    val collection = "testFieldAdditions" + UUID.randomUUID().toString.replace("-", "_")
    try {
      SolrCloudUtil.buildCollection(zkHost, collection, null, 2, cloudClient, sc)
      val opts = Map("zkhost" -> zkHost, "collection" -> collection)

      val solrRelation = new SolrRelation(opts, sparkSession)
      val fieldsToAdd = SolrRelation.getFieldsToAdd(insertSchema, solrRelation.conf, solrRelation.solrVersion, solrRelation.dynamicSuffixes)
      assert(fieldsToAdd.isEmpty)
    } finally {
      SolrCloudUtil.deleteCollection(collection, cluster)
    }
  }
}