org.apache.spark.sql.types.DateType Scala Examples
The following examples show how to use org.apache.spark.sql.types.DateType.
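Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column names and sample values are invented for illustration) showing the two common ways DateType appears in Spark code: declared directly in a schema, or produced by casting a string column.

import java.sql.Date
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{DateType, StringType, StructField, StructType}

object DateTypeSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("DateTypeSketch").getOrCreate()
    import spark.implicits._

    // Declare a DateType column explicitly in a schema.
    val schema = StructType(Seq(
      StructField("id", StringType),
      StructField("signup_date", DateType)))
    val rows = Seq(Row("u1", Date.valueOf("2015-01-01")), Row("u2", Date.valueOf("2015-06-15")))
    val df = spark.createDataFrame(spark.sparkContext.parallelize(rows), schema)

    // Or obtain a DateType column by casting a string column, as several examples below do.
    val casted = Seq(("u3", "2016-03-02")).toDF("id", "signup_date")
      .withColumn("signup_date", $"signup_date".cast(DateType))

    df.printSchema()
    casted.printSchema()
    spark.stop()
  }
}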
Example 1
Source File: TimestampExpressionSuite.scala From HANAVora-Extensions with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import java.sql.Timestamp

import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.{DateType, IntegerType}
import org.scalatest.FunSuite

class TimestampExpressionSuite extends FunSuite with ExpressionEvalHelper {

  test("add_seconds") {
    // scalastyle:off magic.number
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:11:33")), Literal(28)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 00:12:01")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2014-12-31 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")),
      Literal.create(null, IntegerType)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal(1)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal.create(null, IntegerType)), null)
  }
}
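In this suite, DateType appears only in the typed null literals: Literal.create(null, DateType) builds a null of a concrete Catalyst type, and the last three assertions verify that AddSeconds propagates null whenever either the timestamp or the seconds argument is null.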
Example 2
Source File: YelpHelpers.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.integration.yelp

import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType}
import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions}
import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey
import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey}
import org.opencypher.morpheus.impl.table.SparkTable._
import org.opencypher.morpheus.integration.yelp.YelpConstants._

object YelpHelpers {

  case class YelpTables(
    userDf: DataFrame,
    businessDf: DataFrame,
    reviewDf: DataFrame
  )

  def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = {
    import spark.implicits._

    log("read business.json", 2)
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    log("read review.json", 2)
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    log("read user.json", 2)
    val rawUserDf = spark.read.json(s"$inputPath/user.json")

    val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state")
    val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType))
    val userDf = rawUserDf.select(
      $"user_id".as(sourceIdKey),
      $"name",
      $"yelping_since".cast(DateType),
      functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite"))

    YelpTables(userDf, businessDf, reviewDf)
  }

  def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = {
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")

    import spark.implicits._

    rawBusinessDf.select($"city", $"state").distinct().show()
    rawBusinessDf.withColumnRenamed("business_id", "id")
      .join(rawReviewDf, $"id" === $"business_id")
      .groupBy($"city", $"state")
      .count().as("count")
      .orderBy($"count".desc, $"state".asc)
      .show(100)
  }

  def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = {
    import spark.implicits._

    def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com"))

    val rawUserDf = spark.read.json(s"$inputPath/user.json")
    val rawReviewDf = spark.read.json(s"$inputPath/review.json")
    val rawBusinessDf = spark.read.json(s"$inputPath/business.json")

    val businessDf = rawBusinessDf.filter($"city" === city)
    val reviewDf = rawReviewDf
      .join(businessDf, Seq("business_id"), "left_semi")
      .withColumn("user_email", emailColumn("user_id"))
      .withColumnRenamed("stars", "stars_tmp")
      .withColumn("stars", $"stars_tmp".cast(IntegerType))
      .drop("stars_tmp")
    val userDf = rawUserDf
      .join(reviewDf, Seq("user_id"), "left_semi")
      .withColumn("email", emailColumn("user_id"))
    val friendDf = userDf
      .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id"))
      .withColumn("user2_email", emailColumn("user2_id"))
      .select(s"user1_email", s"user2_email")

    businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json")
    reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json")
    userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json")
    friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json")
  }

  implicit class DataFrameOps(df: DataFrame) {
    def prependIdColumn(idColumn: String, prefix: String): DataFrame =
      df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn))
  }
}
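DateType is used here during projection in loadYelpTables: the date column of review.json and the yelping_since column of user.json arrive as strings from the JSON reader and are converted with .cast(DateType) while the node and relationship tables are being shaped.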
Example 3
Source File: RDDFixtures.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector

import java.util.Date

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ DateType, IntegerType, StringType, StructField, StructType }

import com.actian.spark_vector.test.util.StructTypeUtil.createSchema

trait RDDFixtures {
  // poor man's fixture, for other approaches see:
  // http://www.scalatest.org/user_guide/sharing_fixtures
  def createRecordRdd(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq(42, "a"),
      Seq(43, "b"))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType)
    (inputRdd, inputSchema)
  }

  def createRowRDD(sc: SparkContext): (RDD[Seq[Any]], StructType) = {
    val input = Seq(
      Seq[Any](42, "a", new Date(), new Date()),
      Seq[Any](43, "b", new Date(), new Date()))
    val inputRdd = sc.parallelize(input, 2)
    val inputSchema = createSchema("id" -> IntegerType, "name" -> StringType, "date" -> DateType)
    (inputRdd, inputSchema)
  }

  def wideRDD(sc: SparkContext, columnCount: Int, rowCount: Int = 2): (RDD[Row], StructType) = {
    val data: Row = Row.fromSeq(1 to columnCount)
    val fields = for (i <- 1 to rowCount) yield {
      StructField("field_" + i, IntegerType, true)
    }
    val inputSchema = StructType(fields.toSeq)
    val input = for (i <- 1 to rowCount) yield {
      data
    }
    val inputRDD = sc.parallelize(input, 2)
    (inputRDD, inputSchema)
  }
}
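Here DateType is passed to the createSchema helper from StructTypeUtil, which is not shown on this page. As a rough sketch only (an assumption about its shape, not the project's actual implementation), such a pair-based builder could look like this:

import org.apache.spark.sql.types.{DataType, StructField, StructType}

// Hypothetical stand-in for StructTypeUtil.createSchema: builds a StructType
// from (name, DataType) pairs, treating every column as nullable.
object StructTypeUtilSketch {
  def createSchema(fields: (String, DataType)*): StructType =
    StructType(fields.map { case (name, dataType) => StructField(name, dataType, nullable = true) })
}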
Example 4
Source File: DateTimeDataFixture.scala From spark-vector with Apache License 2.0
package com.actian.spark_vector.vector

import java.util.TimeZone

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DateType, StructField, StructType, TimestampType}

import com.actian.spark_vector.test.util.DateHelper.{ansiDateFor, timestampFor}

object DateTimeDataFixture {
  def timeRDD(sparkContext: SparkContext): (RDD[Seq[Any]], StructType) =
    createTimeRDD(sparkContext, timeData)

  private[vector] val tz = TimeZone.getTimeZone("GMT-06:00")
  private[vector] val utc = TimeZone.getTimeZone("UTC")

  private[vector] val timeData = Seq(
    Seq[Any](
      timestampFor(1995, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1996, 2, 22, 18, 3, 29, 234),
      timestampFor(1997, 2, 22, 18, 3, 29, 234),
      timestampFor(1998, 1, 22, 18, 3, 29, 234, tz),
      timestampFor(1999, 2, 22, 18, 3, 29, 234),
      timestampFor(2000, 2, 22, 18, 3, 29, 234),
      timestampFor(2015, 11, 23, 18, 3, 29, 123, tz),
      timestampFor(2015, 11, 23, 18, 3, 29, 123),
      ansiDateFor(1995, 2, 22)),
    Seq[Any](
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 3, 2, 17, 52, 12, 678, tz),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 4, 2, 17, 52, 12, 678),
      timestampFor(2015, 11, 13, 17, 52, 12, 123, tz),
      ansiDateFor(2015, 4, 2)))

  private def createTimeRDD(sparkContext: SparkContext, data: Seq[Seq[Any]]): (RDD[Seq[Any]], StructType) = {
    val schema = StructType(Seq(
      StructField("tswtz", TimestampType),
      StructField("tsntz", TimestampType),
      StructField("tsltz", TimestampType),
      StructField("tswtz4", TimestampType),
      StructField("tsntz4", TimestampType),
      StructField("tsltz4", TimestampType),
      StructField("tmwtz", TimestampType),
      StructField("tmntz", TimestampType),
      StructField("tmltz", TimestampType),
      StructField("tmwtz3", TimestampType),
      StructField("tmntz3", TimestampType),
      StructField("tmltz3", TimestampType),
      StructField("date", DateType)))

    (sparkContext.parallelize(data, 2), schema)
  }

  def createTimeTable(connectionProps: VectorConnectionProperties)(tableName: String): Unit = {
    VectorJDBC.withJDBC(connectionProps) { cxn =>
      cxn.dropTable(tableName)
      cxn.executeStatement(
        s"""|create table ${tableName} (
            | tswtz timestamp with time zone,
            | tsntz timestamp without time zone,
            | tsltz timestamp with local time zone,
            | tswtz4 timestamp(4) with time zone,
            | tsntz4 timestamp(4) without time zone,
            | tsltz4 timestamp(4) with local time zone,
            | tmwtz time with time zone,
            | tmntz time without time zone,
            | tmltz time with local time zone,
            | tmwtz3 time(3) with time zone,
            | tmntz3 time(3) without time zone,
            | tmltz3 time(3) with local time zone,
            | dt date
            |)""".stripMargin)
    }
  }
}
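Only the final schema field ("date") uses DateType; its values come from the ansiDateFor helper imported from DateHelper (not shown on this page), while the remaining columns are TimestampType values built with timestampFor, with and without an explicit time zone.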