org.apache.spark.sql.types.DateType Scala Examples
The following examples show how to use org.apache.spark.sql.types.DateType.
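Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column names and sample values are invented for illustration) showing the two common ways DateType appears in Spark code: declared directly in a schema, or produced by casting a string column.

import java.sql.Date
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DateType, StringType, StructField, StructType}

object DateTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DateTypeSketch").getOrCreate()
    import spark.implicits._

    // Declare a DateType column explicitly in a schema.
    val schema = StructType(Seq(
      StructField("id", StringType),
      StructField("signup_date", DateType)))
    val rows = Seq(Row("u1", Date.valueOf("2015-01-01")), Row("u2", Date.valueOf("2015-06-15")))
    val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)

    // Or obtain a DateType column by casting a string column, as several examples below do.
    val casted = Seq(("u3", "2016-03-02")).toDF("id", "signup_date")
      .withColumn("signup_date", $"signup_date".cast(DateType))

    df.printSchema()
    casted.printSchema()
    spark.stop()
  }
}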
Example 1
Source File: TimestampExpressionSuite.scala From HANAVora-Extensions with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import java.sql.Timestamp

import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.{DateType, IntegerType}
import org.scalatest.FunSuite

class TimestampExpressionSuite extends FunSuite with ExpressionEvalHelper {

  test("add_seconds") {
    // scalastyle:off magic.number
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:11:33")), Literal(28)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 00:12:01")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2014-12-31 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")),
      Literal.create(null, IntegerType)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal(1)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal.create(null, IntegerType)), null)
  }
}
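In this suite, DateType appears only in the typed null literals: Literal.create(null, DateType) builds a null of a concrete Catalyst type, and the last three assertions verify that AddSeconds propagates null whenever either the timestamp or the seconds argument is null.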
Example 2
Source File: YelpHelpers.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
}
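DateType is used here during projection in loadYelpTables: the date column of review.json and the yelping_since column of user.json arrive as strings from the JSON reader and are converted with .cast(DateType) while the node and relationship tables are being shaped.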
Example 3
Source File: RDDFixtures.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector

import java.util.Date

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DateType, IntegerType, StringType, StructField, StructType }

import com.actian.spark_vector.test.util.StructTypeUtil.createSchema

trait RDDFixtures {
  // poor man's fixture, for other approaches see:
  // http://www.scalatest.org/user_guide/sharing_fixtures
  def createRecordRdd(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq(42, "a"),
      Seq(43, "b"))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType)
    (inputRdd, inputSchema)
  }

  def createRowRDD(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq[Any](42, "a", new Date(), new Date()),
      Seq[Any](43, "b", new Date(), new Date()))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType, "date" -> DateType)
    (inputRdd, inputSchema)
  }

  def wideRDD(sc: SparkContext, columnCount: Int, rowCount: Int = 2): (RDD[Row], StructType) = {
    val data: Row = Row.fromSeq(1 to columnCount)
    val fields = for (i <- 1 to rowCount) yield {
      StructField("field_" + i, IntegerType, true)
    }
    val inputSchema = StructType(fields.toSeq)
    val input = for (i <- 1 to rowCount) yield {
      data
    }
    val inputRDD = sc.parallelize(input, 2)
    (inputRDD, inputSchema)
  }
}
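Here DateType is passed to the createSchema helper from StructTypeUtil, which is not shown on this page. As a rough sketch only (an assumption about its shape, not the project's actual implementation), such a pair-based builder could look like this:

import org.apache.spark.sql.types.{DataType, StructField, StructType}

// Hypothetical stand-in for StructTypeUtil.createSchema: builds a StructType
// from (name, DataType) pairs, treating every column as nullable.
object StructTypeUtilSketch {
  def createSchema(fields: (String, DataType)*): StructType =
    StructType(fields.map { case (name, dataType) => StructField(name, dataType, nullable = true) })
}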
Example 4
Source File: DateTimeDataFixture.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector.vector

import java.util.TimeZone

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DateType, StructField, StructType, TimestampType}

import com.actian.spark_vector.test.util.DateHelper.{ansiDateFor, timestampFor}

object DateTimeDataFixture {
  def timeRDD(sparkContext: SparkContext): (RDD[Seq[Any]], StructType) =
    createTimeRDD(sparkContext, timeData)

  private[vector] val tz = TimeZone.getTimeZone("GMT-06:00")
  private[vector] val utc = TimeZone.getTimeZone("UTC")

  private[vector] val timeData = Seq(
    Seq[Any](
      timestampFor(1995, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1996, 2, 22, 18, 3, 29, 234),
      timestampFor(1997, 2, 22, 18, 3, 29, 234),
      timestampFor(1998, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1999, 2, 22, 18, 3, 29, 234),
      timestampFor(2000, 2, 22, 18, 3, 29, 234),
      timestampFor(2015, 11, 23, 18, 3, 29, 123, tz),
      timestampFor(2015, 11, 23, 18, 3, 29, 123),
      ansiDateFor(1995, 2, 22)),
    Seq[Any](
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 11, 13, 17, 52, 12, 123, tz),
      ansiDateFor(2015, 4, 2)))

  private def createTimeRDD(sparkContext: SparkContext, data: Seq[Seq[Any]]): (RDD[Seq[Any]], StructType) = {
    val schema = StructType(Seq(
      StructField("tswtz", TimestampType),
      StructField("tsntz", TimestampType),
      StructField("tsltz", TimestampType),
      StructField("tswtz4", TimestampType),
      StructField("tsntz4", TimestampType),
      StructField("tsltz4", TimestampType),
      StructField("tmwtz", TimestampType),
      StructField("tmntz", TimestampType),
      StructField("tmltz", TimestampType),
      StructField("tmwtz3", TimestampType),
      StructField("tmntz3", TimestampType),
      StructField("tmltz3", TimestampType),
      StructField("date", DateType)))

    (sparkContext.parallelize(data, 2), schema)
  }

  def createTimeTable(connectionProps: VectorConnectionProperties)(tableName: String): Unit = {
    VectorJDBC.withJDBC(connectionProps) { cxn =>
      cxn.dropTable(tableName)
      cxn.executeStatement(
        s"""|create table ${tableName} (
            | tswtz timestamp with time zone,
            | tsntz timestamp without time zone,
            | tsltz timestamp with local time zone,
            | tswtz4 timestamp(4) with time zone,
            | tsntz4 timestamp(4) without time zone,
            | tsltz4 timestamp(4) with local time zone,
            | tmwtz time with time zone,
            | tmntz time without time zone,
            | tmltz time with local time zone,
            | tmwtz3 time(3) with time zone,
            | tmntz3 time(3) without time zone,
            | tmltz3 time(3) with local time zone,
            | dt date
            |)""".stripMargin)
    }
  }
}
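Only the final schema field ("date") uses DateType; its values come from the ansiDateFor helper imported from DateHelper (not shown on this page), while the remaining columns are TimestampType values built with timestampFor, with and without an explicit time zone.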