org.apache.spark.sql.types.TimestampType Scala Examples
The following examples show how to use org.apache.spark.sql.types.TimestampType.
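Before the project-specific examples below, here is a minimal, self-contained sketch of the two uses of TimestampType that recur throughout them: declaring a timestamp field in an explicit schema, and casting an existing column to a timestamp. The SparkSession settings and column names are illustrative assumptions, not taken from any of the listed projects.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}

object TimestampTypeMinimalExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("timestamp-type-sketch").getOrCreate()
    import spark.implicits._

    // 1. TimestampType as a field in an explicit schema
    val schema = StructType(Seq(
      StructField("event", StringType),
      StructField("time", TimestampType)))
    println(schema.treeString)

    // 2. Casting a string column to TimestampType ("yyyy-MM-dd HH:mm:ss" strings are parsed by the cast)
    val df = Seq(("click", "2015-08-10 07:52:49")).toDF("event", "time_str")
      .withColumn("time", col("time_str").cast(TimestampType))
    df.printSchema() // time: timestamp

    spark.stop()
  }
}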
Example 1
Source File: MongodbSchemaIT.scala From Spark-MongoDB with Apache License 2.0 | 5 votes |
package com.stratio.datasource.mongodb.schema

import java.text.SimpleDateFormat
import java.util.Locale

import com.stratio.datasource.MongodbTestConstants
import com.stratio.datasource.mongodb.config.{MongodbConfig, MongodbConfigBuilder}
import com.stratio.datasource.mongodb.partitioner.MongodbPartitioner
import com.stratio.datasource.mongodb.rdd.MongodbRDD
import com.stratio.datasource.mongodb._
import org.apache.spark.sql.mongodb.{TemporaryTestSQLContext, TestSQLContext}
import org.apache.spark.sql.types.{ArrayType, StringType, StructField, TimestampType}
import org.junit.runner.RunWith
import org.scalatest._
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class MongodbSchemaIT extends FlatSpec
  with Matchers
  with MongoEmbedDatabase
  with TestBsonData
  with MongodbTestConstants {

  private val host: String = "localhost"
  private val collection: String = "testCol"
  private val readPreference = "secondaryPreferred"

  val testConfig = MongodbConfigBuilder()
    .set(MongodbConfig.Host, List(host + ":" + mongoPort))
    .set(MongodbConfig.Database, db)
    .set(MongodbConfig.Collection, collection)
    .set(MongodbConfig.SamplingRatio, 1.0)
    .set(MongodbConfig.ReadPreference, readPreference)
    .build()

  val mongodbPartitioner = new MongodbPartitioner(testConfig)

  val mongodbRDD = new MongodbRDD(TemporaryTestSQLContext, testConfig, mongodbPartitioner)

  behavior of "A schema"

  it should "be inferred from rdd with primitives" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldAndType) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7
      schema.fieldNames should contain allOf("string", "integer", "long", "double", "boolean", "null")

      schema.printTreeString()
    }
  }

  it should "be inferred from rdd with complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType1) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 13

      schema.fields filter {
        case StructField(name, ArrayType(StringType, _), _, _) => Set("arrayOfNull", "arrayEmpty") contains name
        case _ => false
      } should have size 2

      schema.printTreeString()
    }
  }

  it should "resolve type conflicts between fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(primitiveFieldValueTypeConflict) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 7

      schema.printTreeString()
    }
  }

  it should "be inferred from rdd with more complex fields" + scalaBinaryVersion in {
    withEmbedMongoFixture(complexFieldAndType2) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 5

      schema.printTreeString()
    }
  }

  it should "read java.util.Date fields as timestamptype" + scalaBinaryVersion in {
    val dfunc = (s: String) => new SimpleDateFormat("EEE MMM dd HH:mm:ss Z yyyy", Locale.ENGLISH).parse(s)
    import com.mongodb.casbah.Imports.DBObject
    val stringAndDate = List(DBObject("string" -> "this is a simple string.", "date" -> dfunc("Mon Aug 10 07:52:49 EDT 2015")))

    withEmbedMongoFixture(stringAndDate) { mongodProc =>
      val schema = MongodbSchema(mongodbRDD, 1.0).schema()

      schema.fields should have size 3
      schema.fields.filter(_.name == "date").head.dataType should equal(TimestampType)
      schema.printTreeString()
    }
  }
}
Example 2
Source File: DataFrameReportPerformanceSpec.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.dataframe

import java.sql.Timestamp
import java.text.{DateFormat, SimpleDateFormat}
import java.util.TimeZone

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType}
import org.scalatest.{BeforeAndAfter, Ignore}

import io.deepsense.commons.utils.{DoubleUtils, Logging}
import io.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport}

// It's ignored because it does not have assertions; it only prints report generation time.
@Ignore
class DataFrameReportPerformanceSpec
    extends DeeplangIntegTestSupport
    with BeforeAndAfter
    with TestFiles
    with Logging {

  val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv"

  "DataFrame" should {
    "generate report" when {
      "DataFrame has 17K of rows" in {
        val numberOfTries = 10
        var results: Seq[Double] = Seq()
        for (i <- 1 to numberOfTries) {
          val dataFrame: DataFrame = demandDataFrame()
          val start = System.nanoTime()
          val report = dataFrame.report
          val end = System.nanoTime()
          val time1: Double = (end - start).toDouble / 1000000000.0
          results = results :+ time1
          logger.debug("Report generation time: {}", DoubleUtils.double2String(time1))
        }
        logger.debug(
          "Mean report generation time: {}",
          DoubleUtils.double2String(results.fold(0D)(_ + _) / numberOfTries.toDouble))
      }
    }
  }

  private def demandDataFrame(): DataFrame = {
    val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile)
    val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row)
    executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data)
  }

  private def demandSchema: StructType = StructType(Seq(
    StructField("datetime", TimestampType),
    StructField("log_count", DoubleType),
    StructField("workingday", DoubleType),
    StructField("holiday", DoubleType),
    StructField("season2", DoubleType),
    StructField("season3", DoubleType),
    StructField("season4", DoubleType)))

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}

private object DataFrameHelpers {
  def demandString2Row(s: String): Row = {
    val split = s.split(",")
    Row(
      timestamp(split(0)),
      split(1).toDouble,
      split(2).toDouble,
      split(3).toDouble,
      split(4).toDouble,
      split(5).toDouble,
      split(6).toDouble
    )
  }

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}
Example 3
Source File: MovingAverageSparklet.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge").agg(avg($"value") as "avg")
      query.select($"src", $"gauge", $"avg" as "value").as[Agg]
    }
  }
}
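The cast-then-watermark-then-window pattern in process is not specific to the streamlet API. Below is a standalone sketch of the same pattern with plain structured streaming; the built-in rate source and console sink are assumptions made only to keep the example runnable.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{avg, window}
import org.apache.spark.sql.types.TimestampType

object MovingAverageStandaloneSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("moving-average-sketch").getOrCreate()
    import spark.implicits._

    // The "rate" source emits (timestamp: TimestampType, value: Long) rows; the cast mirrors
    // the streamlet above, where "timestamp" arrives as a numeric field and must become TimestampType.
    val events = spark.readStream.format("rate").option("rowsPerSecond", 10).load()
      .withColumn("ts", $"timestamp".cast(TimestampType))

    val averages = events
      .withWatermark("ts", "1 minute")
      .groupBy(window($"ts", "1 minute", "30 seconds"))
      .agg(avg($"value") as "avg")

    averages.writeStream.outputMode("append").format("console").start().awaitTermination()
  }
}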
Example 4
Source File: IdentitySparkProcessor1.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example

import pipelines.streamlets.StreamletShape
import pipelines.streamlets.avro._
import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import pipelines.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class IdentitySparkProcessor1 extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      writeStream(readStream(in).map(d ⇒ d.copy(t1 = TimeOps.nowAsOption)), out, OutputMode.Append).toQueryExecution
    }
  }
}
Example 5
Source File: DateTimeDataFixture.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.vector

import java.util.TimeZone

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DateType, StructField, StructType, TimestampType}

import com.actian.spark_vector.test.util.DateHelper.{ansiDateFor, timestampFor}

object DateTimeDataFixture {
  def timeRDD(sparkContext: SparkContext): (RDD[Seq[Any]], StructType) =
    createTimeRDD(sparkContext, timeData)

  private[vector] val tz = TimeZone.getTimeZone("GMT-06:00")

  private[vector] val utc = TimeZone.getTimeZone("UTC")

  private[vector] val timeData = Seq(
    Seq[Any](
      timestampFor(1995, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1996, 2, 22, 18, 3, 29, 234),
      timestampFor(1997, 2, 22, 18, 3, 29, 234),
      timestampFor(1998, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1999, 2, 22, 18, 3, 29, 234),
      timestampFor(2000, 2, 22, 18, 3, 29, 234),
      timestampFor(2015, 11, 23, 18, 3, 29, 123, tz),
      timestampFor(2015, 11, 23, 18, 3, 29, 123),
      ansiDateFor(1995, 2, 22)),
    Seq[Any](
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 11, 13, 17, 52, 12, 123, tz),
      ansiDateFor(2015, 4, 2)))

  private def createTimeRDD(sparkContext: SparkContext, data: Seq[Seq[Any]]): (RDD[Seq[Any]], StructType) = {
    val schema = StructType(Seq(
      StructField("tswtz", TimestampType),
      StructField("tsntz", TimestampType),
      StructField("tsltz", TimestampType),
      StructField("tswtz4", TimestampType),
      StructField("tsntz4", TimestampType),
      StructField("tsltz4", TimestampType),
      StructField("tmwtz", TimestampType),
      StructField("tmntz", TimestampType),
      StructField("tmltz", TimestampType),
      StructField("tmwtz3", TimestampType),
      StructField("tmntz3", TimestampType),
      StructField("tmltz3", TimestampType),
      StructField("date", DateType)))

    (sparkContext.parallelize(data, 2), schema)
  }

  def createTimeTable(connectionProps: VectorConnectionProperties)(tableName: String): Unit = {
    VectorJDBC.withJDBC(connectionProps) { cxn =>
      cxn.dropTable(tableName)
      cxn.executeStatement(
        s"""|create table ${tableName} (
            | tswtz timestamp with time zone,
            | tsntz timestamp without time zone,
            | tsltz timestamp with local time zone,
            | tswtz4 timestamp(4) with time zone,
            | tsntz4 timestamp(4) without time zone,
            | tsltz4 timestamp(4) with local time zone,
            | tmwtz time with time zone,
            | tmntz time without time zone,
            | tmltz time with local time zone,
            | tmwtz3 time(3) with time zone,
            | tmntz3 time(3) without time zone,
            | tmltz3 time(3) with local time zone,
            | dt date
            |)""".stripMargin)
    }
  }
}
Example 6
Source File: FieldDateTime.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb

import java.nio.ByteBuffer
import java.sql.Timestamp

import org.apache.spark.sql.types.{Metadata, TimestampType}

class FieldDateTime(name: String, nullValueAllowed: Boolean, metadata: Metadata)
  extends Field(name, TimestampType, nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val numDays = byteBuffer.getDouble
    // convert days since 12/30/1899 to 1/1/1970
    val unixDays = numDays - 25569
    val millis = (unixDays * 1000 * 60 * 60 * 24).ceil.toLong
    new Timestamp(millis)
  }
}
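The 25569 in readValue is the day count between the geodatabase date epoch (December 30, 1899, per the comment) and the Unix epoch (January 1, 1970). A quick, self-contained check of that constant, independent of the GDB reader:

import java.time.LocalDate
import java.time.temporal.ChronoUnit

object EpochOffsetCheck {
  def main(args: Array[String]): Unit = {
    // Days from 1899-12-30 to 1970-01-01
    val days = ChronoUnit.DAYS.between(LocalDate.of(1899, 12, 30), LocalDate.of(1970, 1, 1))
    println(days) // 25569
  }
}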
Example 7
Source File: MovingAverageSparklet.scala From cloudflow with Apache License 2.0 | 5 votes |
package cloudflow.sparkdoc

import cloudflow.spark._
import cloudflow.streamlets._
import cloudflow.streamlets.avro._
import org.apache.spark.sql.functions._
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.sql.streaming.OutputMode

//tag::spark-streamlet-example[]
class MovingAverageSparklet extends SparkStreamlet { // <1>

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.key)
  val shape = StreamletShape(in, out) // <2>

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = { // <3>
      val groupedData = readStream(in) // <4>
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"key")
        .agg(avg($"value").as("avg"))
      val query = groupedData.select($"key", $"avg".as("value")).as[Data]

      writeStream(query, out, OutputMode.Append).toQueryExecution
    }
  }
}
//end::spark-streamlet-example[]
Example 8
Source File: MovingAverageSparklet.scala From cloudflow with Apache License 2.0 | 5 votes |
package sensors

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

class MovingAverageSparklet extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Agg]("out", _.src)
  val shape = StreamletShape(in, out)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data]): Dataset[Agg] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withWatermark("ts", "1 minutes")
        .groupBy(window($"ts", "1 minute", "30 seconds"), $"src", $"gauge")
        .agg(avg($"value").as("avg"))
      query.select($"src", $"gauge", $"avg".as("value")).as[Agg]
    }
  }
}
Example 9
Source File: SparkOutput.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic }
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import swissknife.data.Data

class SparkOutput extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    val sparkLocality = context.session.conf.getOption("spark.locality.wait").getOrElse("")
    val feedbackMsg = s"locality=[$sparkLocality]"

    override def buildStreamingQueries = {
      val query = readStream(in)
        // we add this to the output to make it observable from the outside
        .withColumn("payload", lit(feedbackMsg)) // we add this to the output to make it observable from the outside
        .writeStream
        .format("console")
        .option("truncate","false")
        .start
      query.toQueryExecution
    }
  }
}
Example 10
Source File: SparkCounter.scala From cloudflow with Apache License 2.0 | 5 votes |
package swissknife.spark

import cloudflow.streamlets.{StreamletShape, StringConfigParameter}
import cloudflow.streamlets.avro._
import cloudflow.spark.{SparkStreamlet, SparkStreamletLogic}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.TimestampType
import cloudflow.spark.sql.SQLImplicits._
import org.apache.spark.sql.streaming.OutputMode

import swissknife.data.Data

class SparkCounter extends SparkStreamlet {

  val in = AvroInlet[Data]("in")
  val out = AvroOutlet[Data]("out", _.src)
  val shape = StreamletShape(in, out)

  val configurableMessage = StringConfigParameter("configurable-message", "Configurable message.", Some("spark-original"))

  override def configParameters = Vector(configurableMessage)

  override def createLogic() = new SparkStreamletLogic {
    val msg = configurableMessage.value

    override def buildStreamingQueries = {
      val dataset = readStream(in)
      val outStream = process(dataset, msg)
      writeStream(outStream, out, OutputMode.Append).toQueryExecution
    }

    private def process(inDataset: Dataset[Data], message: String): Dataset[Data] = {
      val query = inDataset
        .withColumn("ts", $"timestamp".cast(TimestampType))
        .withColumn("updated_src", concat($"src", lit("-spark")))
        .withWatermark("ts", "0 seconds")
        .groupBy(window($"ts", "5 seconds"), $"updated_src")
        .agg(max($"count").as("count"))
      query.select($"updated_src".as("src"), $"window.start".as("timestamp"), lit(message).as("payload"), $"count").as[Data]
    }
  }
}
Example 11
Source File: A_1_BasicOperation.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.structured_streaming

import java.sql.Timestamp

import org.apache.spark.sql.types.{BooleanType, StringType, StructType, TimestampType}
import org.apache.spark.sql.{Dataset, SparkSession}

object A_1_BasicOperation {

  // To carry a date-time, the case class must use java.sql.Timestamp, which Catalyst represents as TimestampType.
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName(A_1_BasicOperation.getClass.getName)
      .master("local")
      .getOrCreate()

    val timeStructType = new StructType().add("device", StringType)
      .add("deviceType", StringType)
      .add("signal", BooleanType)
      .add("time", TimestampType)

    val dataFrame = spark.read.json("src/main/resources/sparkresource/device.json")
    import spark.implicits._
    val ds: Dataset[DeviceData] = dataFrame.as[DeviceData]

    // Untyped (SQL-like) query
    dataFrame.select("device").where("signal>10").show()
    // Typed query
    ds.filter(_.signal > 10).map(_.device).show()

    // Untyped groupBy with a count
    dataFrame.groupBy("deviceType").count().show()

    import org.apache.spark.sql.expressions.scalalang.typed
    // Typed aggregation: average signal value per device type
    ds.groupByKey(_.deviceType).agg(typed.avg(_.signal)).show()

    // Alternatively, register a temporary view and query it with SQL
    dataFrame.createOrReplaceTempView("device")
    spark.sql("select * from device").show()

    // isStreaming tells whether the DataFrame contains streaming data
    println(dataFrame.isStreaming)
  }
}
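As the comment notes, declaring the field as java.sql.Timestamp is what lets Catalyst derive a TimestampType column for it. A small sketch, independent of the device.json file above, that prints the schema derived from such a case class:

import java.sql.Timestamp
import org.apache.spark.sql.Encoders

object CaseClassTimestampSchema {
  case class DeviceData(device: String, deviceType: String, signal: Double, time: Timestamp)

  def main(args: Array[String]): Unit = {
    // The encoder derived for the case class maps java.sql.Timestamp to TimestampType.
    println(Encoders.product[DeviceData].schema.treeString)
    // the "time" field prints as: |-- time: timestamp (nullable = true)
  }
}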
Example 12
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction(FunctionIdentifier("toTs", None), myTimestampCast)
}
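A hedged usage sketch for the registered function: assuming MyUDF.register has been called on the session and the argument is an epoch-milliseconds Long (the literal below is an arbitrary illustration), toTs becomes callable from SQL. Dividing by 1000 works because a numeric value cast to TimestampType is interpreted as seconds since the Unix epoch.

import org.apache.spark.sql.{MyUDF, SparkSession}

object ToTsUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("toTs-sketch").getOrCreate()
    MyUDF.register(spark)

    // An arbitrary epoch-milliseconds literal; the rendered value depends on the session time zone.
    spark.range(1).selectExpr("toTs(1439207569000) as ts").show(false)
    spark.stop()
  }
}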
Example 13
Source File: ResolveInlineTablesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}

class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
}
Example 14
Source File: DataFrameReportPerformanceSpec.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.dataframe

import java.sql.Timestamp
import java.text.{DateFormat, SimpleDateFormat}
import java.util.TimeZone

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType, TimestampType}
import org.scalatest.{BeforeAndAfter, Ignore}

import ai.deepsense.commons.utils.{DoubleUtils, Logging}
import ai.deepsense.deeplang.{TestFiles, DeeplangIntegTestSupport}

// It's ignored because it does not have assertions; it only prints report generation time.
@Ignore
class DataFrameReportPerformanceSpec
    extends DeeplangIntegTestSupport
    with BeforeAndAfter
    with TestFiles
    with Logging {

  val testFile = absoluteTestsDirPath.pathWithoutScheme + "/demand_without_header.csv"

  "DataFrame" should {
    "generate report" when {
      "DataFrame has 17K of rows" in {
        val numberOfTries = 10
        var results: Seq[Double] = Seq()
        for (i <- 1 to numberOfTries) {
          val dataFrame: DataFrame = demandDataFrame()
          val start = System.nanoTime()
          val report = dataFrame.report()
          val end = System.nanoTime()
          val time1: Double = (end - start).toDouble / 1000000000.0
          results = results :+ time1
          logger.debug("Report generation time: {}", DoubleUtils.double2String(time1))
        }
        logger.debug(
          "Mean report generation time: {}",
          DoubleUtils.double2String(results.fold(0D)(_ + _) / numberOfTries.toDouble))
      }
    }
  }

  private def demandDataFrame(): DataFrame = {
    val rddString: RDD[String] = executionContext.sparkContext.textFile(testFile)
    val data: RDD[Row] = rddString.map(DataFrameHelpers.demandString2Row)
    executionContext.dataFrameBuilder.buildDataFrame(demandSchema, data)
  }

  private def demandSchema: StructType = StructType(Seq(
    StructField("datetime", TimestampType),
    StructField("log_count", DoubleType),
    StructField("workingday", DoubleType),
    StructField("holiday", DoubleType),
    StructField("season2", DoubleType),
    StructField("season3", DoubleType),
    StructField("season4", DoubleType)))

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}

private object DataFrameHelpers {
  def demandString2Row(s: String): Row = {
    val split = s.split(",")
    Row(
      timestamp(split(0)),
      split(1).toDouble,
      split(2).toDouble,
      split(3).toDouble,
      split(4).toDouble,
      split(5).toDouble,
      split(6).toDouble
    )
  }

  private def timestamp(s: String): Timestamp = {
    val format: DateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    format.setTimeZone(TimeZone.getTimeZone("UTC"))
    new Timestamp(format.parse(s).getTime)
  }
}
Example 15
Source File: StreamingQueryListenerSampleJob.scala From spark-monitoring with MIT License | 5 votes |
package com.microsoft.pnp.samplejob

import com.microsoft.pnp.logging.Log4jConfiguration
import com.microsoft.pnp.util.TryWith
import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.UserMetricsSystems
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

object StreamingQueryListenerSampleJob extends Logging {

  private final val METRICS_NAMESPACE = "streamingquerylistenersamplejob"
  private final val COUNTER_NAME = "rowcounter"

  def main(args: Array[String]): Unit = {

    // Configure our logging
    TryWith(getClass.getResourceAsStream("/com/microsoft/pnp/samplejob/log4j.properties")) {
      stream => {
        Log4jConfiguration.configure(stream)
      }
    }

    logTrace("Trace message from StreamingQueryListenerSampleJob")
    logDebug("Debug message from StreamingQueryListenerSampleJob")
    logInfo("Info message from StreamingQueryListenerSampleJob")
    logWarning("Warning message from StreamingQueryListenerSampleJob")
    logError("Error message from StreamingQueryListenerSampleJob")

    val spark = SparkSession
      .builder
      .getOrCreate

    import spark.implicits._

    // this path has sample files provided by databricks for trying out purpose
    val inputPath = "/databricks-datasets/structured-streaming/events/"
    val jsonSchema = new StructType().add("time", TimestampType).add("action", StringType)

    val driverMetricsSystem = UserMetricsSystems
      .getMetricSystem(METRICS_NAMESPACE, builder => {
        builder.registerCounter(COUNTER_NAME)
      })

    driverMetricsSystem.counter(COUNTER_NAME).inc

    // Similar to definition of staticInputDF above, just using `readStream` instead of `read`
    val streamingInputDF = spark
      .readStream                      // `readStream` instead of `read` for creating streaming DataFrame
      .schema(jsonSchema)              // Set the schema of the JSON data
      .option("maxFilesPerTrigger", 1) // Treat a sequence of files as a stream by picking one file at a time
      .json(inputPath)

    driverMetricsSystem.counter(COUNTER_NAME).inc(5)

    val streamingCountsDF = streamingInputDF
      .groupBy($"action", window($"time", "1 hour"))
      .count()

    // Is this DF actually a streaming DF?
    streamingCountsDF.isStreaming

    driverMetricsSystem.counter(COUNTER_NAME).inc(10)

    val query = streamingCountsDF
      .writeStream
      .format("memory")        // memory = store in-memory table (for testing only in Spark 2.0)
      .queryName("counts")     // counts = name of the in-memory table
      .outputMode("complete")  // complete = all the counts should be in the table
      .start()
  }
}
Example 16
Source File: CubeMakerTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.test.cube

import java.sql.Timestamp

import com.github.nscala_time.time.Imports._
import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger}
import com.stratio.sparta.driver.writer.WriterOptions
import com.stratio.sparta.plugin.default.DefaultField
import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField
import com.stratio.sparta.plugin.cube.operator.count.CountOperator
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.TestSuiteBase
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class CubeMakerTest extends TestSuiteBase {

  val PreserverOrder = false

  def getEventOutput(timestamp: Timestamp, millis: Long): Seq[Seq[(DimensionValuesTime, InputFields)]] = {
    val dimensionString = Dimension("dim1", "eventKey", "identity", new DefaultField)
    val dimensionTime = Dimension("minute", "minute", "minute", new DateTimeField)
    val dimensionValueString1 = DimensionValue(dimensionString, "value1")
    val dimensionValueString2 = dimensionValueString1.copy(value = "value2")
    val dimensionValueString3 = dimensionValueString1.copy(value = "value3")
    val dimensionValueTs = DimensionValue(dimensionTime, timestamp)
    val tsMap = Row(timestamp)
    val valuesMap1 = InputFields(Row("value1", timestamp), 1)
    val valuesMap2 = InputFields(Row("value2", timestamp), 1)
    val valuesMap3 = InputFields(Row("value3", timestamp), 1)

    Seq(Seq(
      (DimensionValuesTime("cubeName", Seq(dimensionValueString1, dimensionValueTs)), valuesMap1),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString2, dimensionValueTs)), valuesMap2),
      (DimensionValuesTime("cubeName", Seq(dimensionValueString3, dimensionValueTs)), valuesMap3)
    ))
  }
}
Example 17
Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream

object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)
          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
}
Example 18
Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.SparkConf
import org.apache.commons.io.IOUtils
import org.apache.spark.serializer.KryoSerializer
import java.io.InputStream
import com.esotericsoftware.kryo.io.Input
import java.io.ByteArrayOutputStream

class WrongArgumentException(name: String, value: Any)
  extends RuntimeException(s"wrong argument: $name=$value") {
}

class MissingRequiredArgumentException(map: Map[String, String], paramName: String)
  extends RuntimeException(s"missing required argument: $paramName, all parameters=$map") {
}

class InvalidSerializerNameException(serializerName: String)
  extends RuntimeException(s"invalid serializer name: $serializerName") {
}

object SchemaUtils {
  def buildSchema(schema: StructType, includesTimestamp: Boolean, timestampColumnName: String = "_TIMESTAMP_"): StructType = {
    if (!includesTimestamp)
      schema;
    else
      StructType(schema.fields.toSeq :+ StructField(timestampColumnName, TimestampType, false));
  }
}

object Params {
  def deserialize(bytes: Array[Byte]): Any = {
    val kryo = kryoSerializer.newKryo();
    val input = new Input();
    input.setBuffer(bytes);
    kryo.readClassAndObject(input);
  }
}
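A brief illustration of SchemaUtils.buildSchema from the snippet above: when includesTimestamp is true it appends a non-nullable _TIMESTAMP_ column to whatever schema is passed in; the base schema below is invented for the example.

import org.apache.spark.sql.execution.streaming.http.SchemaUtils
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object BuildSchemaSketch {
  def main(args: Array[String]): Unit = {
    val base = StructType(Seq(StructField("body", StringType)))

    // Appends the default "_TIMESTAMP_" TimestampType field
    val withTs = SchemaUtils.buildSchema(base, includesTimestamp = true)
    println(withTs.fieldNames.mkString(", ")) // body, _TIMESTAMP_

    // Returns the schema unchanged when no timestamp column is requested
    val unchanged = SchemaUtils.buildSchema(base, includesTimestamp = false)
    println(unchanged == base) // true
  }
}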
Example 19
Source File: ResolveInlineTablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}

class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
}
Example 20
Source File: TimestampCast.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block }
import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression }
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{ DataType, LongType, TimestampType }

case class TimestampToNanos(child: Expression) extends TimestampCast {
  val dataType: DataType = LongType
  protected def cast(childPrim: String): String =
    s"$childPrim * 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] * 1000L
}

case class NanosToTimestamp(child: Expression) extends TimestampCast {
  val dataType: DataType = TimestampType
  protected def cast(childPrim: String): String =
    s"$childPrim / 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] / 1000L
}

object TimestampToNanos {

  private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String,
    resultPrim: String, resultNull: String, resultType: DataType): Block = {
    code"""
      boolean $resultNull = $childNull;
      ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)};
      if (!${childNull}) {
        $resultPrim = (long) ${cast(childPrim)};
      }
    """
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code + castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType))
  }
}
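The * 1000L and / 1000L factors rely on Spark storing a TimestampType value internally as a Long count of microseconds since the Unix epoch, so nanoseconds are microseconds times 1000 and vice versa. A framework-free sketch of that arithmetic, with an arbitrary instant chosen for illustration:

object TimestampUnitsSketch {
  def main(args: Array[String]): Unit = {
    // Spark's internal TimestampType representation: microseconds since 1970-01-01 00:00:00 UTC.
    val micros: Long = 1439207569000000L // an arbitrary instant, in microseconds
    val nanos: Long  = micros * 1000L    // what TimestampToNanos computes
    val back: Long   = nanos / 1000L     // what NanosToTimestamp computes
    assert(back == micros)
    println(s"micros=$micros nanos=$nanos")
  }
}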
Example 21
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction("toTs", myTimestampCast)
}