org.apache.spark.sql.catalyst.util.DateTimeUtils Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.util.DateTimeUtils.
You can vote up the examples you like or vote down the ones you don't like,
and follow the links above each example to go to the original project or source file.
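DateTimeUtils converts between Spark SQL's internal date/time representations (days since the epoch for DateType, microseconds since the epoch for TimestampType) and the corresponding java.sql types. The following is a minimal, self-contained sketch of the conversions used throughout the examples below; it assumes a Spark 2.x-era spark-catalyst dependency on the classpath (some of these methods, such as millisToDays, changed or were removed in later Spark versions).

import java.sql.{Date, Timestamp}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

object DateTimeUtilsQuickstart {
  def main(args: Array[String]): Unit = {
    // TimestampType values are stored internally as microseconds since the epoch (Long).
    val micros: Long = DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 00:11:33"))
    val backToTimestamp: Timestamp = DateTimeUtils.toJavaTimestamp(micros)

    // DateType values are stored internally as days since the epoch (Int).
    val days: Int = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val backToDate: Date = DateTimeUtils.toJavaDate(days)

    println(s"micros=$micros timestamp=$backToTimestamp days=$days date=$backToDate")
  }
}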
Example 1
Source File: KustoCsvSerializationUtils.scala From azure-kusto-spark with Apache License 2.0 | 6 votes |
package com.microsoft.kusto.spark.datasink

import java.util.TimeZone

import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.commons.lang3.time.FastDateFormat
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.DataTypes._
import org.apache.spark.sql.types.StructType

private[kusto] class KustoCsvSerializationUtils(val schema: StructType, timeZone: String) {
  private[kusto] val dateFormat =
    FastDateFormat.getInstance("yyyy-MM-dd'T'HH:mm:ss.SSSXXX", TimeZone.getTimeZone(timeZone))

  private[kusto] def convertRow(row: InternalRow) = {
    val values = new Array[String](row.numFields)
    for (i <- 0 until row.numFields if !row.isNullAt(i)) {
      val dataType = schema.fields(i).dataType
      values(i) = dataType match {
        case DateType => DateTimeUtils.toJavaDate(row.getInt(i)).toString
        case TimestampType => dateFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(i)))
        case _ => row.get(i, dataType).toString
      }
    }
    values
  }
}

private[kusto] object KustoCsvMapper {
  import org.apache.spark.sql.types.StructType
  import org.json

  def createCsvMapping(schema: StructType): String = {
    val csvMapping = new json.JSONArray()
    for (i <- 0 until schema.length) {
      val field = schema.apply(i)
      val dataType = field.dataType
      val mapping = new json.JSONObject()
      mapping.put("Name", field.name)
      mapping.put("Ordinal", i)
      mapping.put("DataType", DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(dataType, StringType))
      csvMapping.put(mapping)
    }
    csvMapping.toString
  }
}
Example 2
Source File: ComputeCurrentTimeSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()), LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
}
Example 3
Source File: TimeColumnBuffer.scala From spark-vector with Apache License 2.0 | 5 votes |
package com.actian.spark_vector.colbuffer.time

import java.nio.ByteBuffer
import java.sql.Timestamp
import java.util.{ Calendar, TimeZone }

import org.apache.spark.sql.catalyst.util.DateTimeUtils

import com.actian.spark_vector.ComposePartial
import com.actian.spark_vector.colbuffer._
import com.actian.spark_vector.colbuffer.util._
import com.actian.spark_vector.vector.VectorDataType

private case class TimeColumnBufferParams(cbParams: ColumnBufferBuildParams,
  converter: TimeConversion.TimeConverter,
  adjustToUTC: Boolean = false)

private[colbuffer] abstract class TimeColumnBuffer(p: TimeColumnBufferParams, valueWidth: Int)
    extends ColumnBuffer[Timestamp, Long](p.cbParams.name, p.cbParams.maxValueCount, valueWidth, valueWidth, p.cbParams.nullable) {
  private val ts = new Timestamp(System.currentTimeMillis())
  private val cal = Calendar.getInstance

  override def put(source: Timestamp, buffer: ByteBuffer): Unit = {
    if (p.adjustToUTC) {
      TimeConversion.convertLocalTimestampToUTC(source, cal)
    }
    val convertedSource = p.converter.convert(TimeConversion.normalizeTime(source), p.cbParams.scale)
    putConverted(convertedSource, buffer)
  }

  protected def putConverted(converted: Long, buffer: ByteBuffer): Unit

  override def get(buffer: ByteBuffer): Long = {
    val deconvertedSource = p.converter.deconvert(getConverted(buffer), p.cbParams.scale)
    ts.setTime(TimeConversion.scaleNanos(deconvertedSource, MillisecondsScale))
    ts.setNanos((deconvertedSource % PowersOfTen(NanosecondsScale)).toInt)
    if (p.adjustToUTC) {
      TimeConversion.convertUTCToLocalTimestamp(ts, cal)
    }
    DateTimeUtils.fromJavaTimestamp(ts)
  }

  protected def getConverted(buffer: ByteBuffer): Long
}

private class TimeIntColumnBuffer(p: TimeColumnBufferParams) extends TimeColumnBuffer(p, IntSize) {
  override protected def putConverted(converted: Long, buffer: ByteBuffer): Unit = buffer.putInt(converted.toInt)

  override protected def getConverted(buffer: ByteBuffer): Long = buffer.getInt()
}

private class TimeLongColumnBuffer(p: TimeColumnBufferParams) extends TimeColumnBuffer(p, LongSize) {
  override protected def putConverted(converted: Long, buffer: ByteBuffer): Unit = buffer.putLong(converted)

  override protected def getConverted(buffer: ByteBuffer): Long = buffer.getLong()
}

private class TimeNZLZConverter extends TimeConversion.TimeConverter {
  override def convert(unscaledNanos: Long, scale: Int): Long = TimeConversion.scaleNanos(unscaledNanos, scale)

  override def deconvert(scaledNanos: Long, scale: Int): Long = TimeConversion.unscaleNanos(scaledNanos, scale)
}

private class TimeTZConverter extends TimeConversion.TimeConverter {
  override def convert(unscaledNanos: Long, scale: Int): Long =
    (TimeConversion.scaleNanos(unscaledNanos, scale) << TimeMaskSize)

  override def deconvert(scaledNanos: Long, scale: Int): Long =
    TimeConversion.unscaleNanos(scaledNanos >> TimeMaskSize, scale)
}

private[colbuffer] object TimeColumnBuffer extends ColumnBufferBuilder {
  private final val (nzlzIntScaleBounds, nzlzLongScaleBounds) = ((0, 4), (5, 9))
  private final val (tzIntScaleBounds, tzLongScaleBounds) = ((0, 1), (2, 9))
  private val calIsNotUTC = Calendar.getInstance.getTimeZone != TimeZone.getTimeZone("UTC")

  private val buildNZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeType) andThen { TimeColumnBufferParams(_, new TimeNZLZConverter(), calIsNotUTC) }

  private val buildLZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeLTZType) andThen { TimeColumnBufferParams(_, new TimeNZLZConverter()) }

  private val buildNZLZ: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] =
    (buildNZPartial orElse buildLZPartial) andThenPartial {
      case nzlz if isInBounds(nzlz.cbParams.scale, nzlzIntScaleBounds) => new TimeIntColumnBuffer(nzlz)
      case nzlz if isInBounds(nzlz.cbParams.scale, nzlzLongScaleBounds) => new TimeLongColumnBuffer(nzlz)
    }

  private val buildTZPartial: PartialFunction[ColumnBufferBuildParams, TimeColumnBufferParams] =
    ofDataType(VectorDataType.TimeTZType) andThen { TimeColumnBufferParams(_, new TimeTZConverter()) }

  private val buildTZ: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] =
    buildTZPartial andThenPartial {
      case tz if isInBounds(tz.cbParams.scale, tzIntScaleBounds) => new TimeIntColumnBuffer(tz)
      case tz if isInBounds(tz.cbParams.scale, tzLongScaleBounds) => new TimeLongColumnBuffer(tz)
    }

  override private[colbuffer] val build: PartialFunction[ColumnBufferBuildParams, ColumnBuffer[_, _]] = buildNZLZ orElse buildTZ
}
Example 4
Source File: Neo4jUtils.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark.utils

import java.sql.Timestamp
import java.time._
import java.util.concurrent.Callable
import java.util.function

import io.github.resilience4j.retry.{Retry, RetryConfig}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.neo4j.driver.exceptions.{ServiceUnavailableException, SessionExpiredException, TransientException}
import org.neo4j.driver.{Driver, Result, Session, Transaction}
import org.neo4j.spark.Neo4jConfig
import org.slf4j.LoggerFactory

class Neo4jUtils

object Neo4jUtils {

  private val logger = LoggerFactory.getLogger(classOf[Neo4jUtils])

  def close(driver: Driver, session: Session): Unit = {
    try {
      if (session != null && session.isOpen) {
        closeSafety(session)
      }
    } finally {
      if (driver != null) {
        closeSafety(driver)
      }
    }
  }

  private def closeSafety(closable: AutoCloseable): Unit = {
    try {
      closable.close()
    } catch {
      case e: Throwable => {
        logger.error("Exception while trying to close an AutoCloseable, because of the following exception", e)
      }
    }
  }

  private val retryConfig = RetryConfig.custom.retryExceptions(
      classOf[SessionExpiredException],
      classOf[ServiceUnavailableException] // retry on the same exceptions the driver does [1]
    )
    .retryOnException(new function.Predicate[Throwable] {
      override def test(exception: Throwable): Boolean = exception match {
        case t: TransientException => {
          val code = t.code()
          !("Neo.TransientError.Transaction.Terminated" == code) &&
            !("Neo.TransientError.Transaction.LockClientStopped" == code)
        }
        case _ => false
      }
    })
    .maxAttempts(3)
    .build

  def executeTxWithRetries[T](neo4jConfig: Neo4jConfig,
                              query: String,
                              params: java.util.Map[String, AnyRef],
                              write: Boolean): (Driver, Session, Transaction, Result) = {
    val driver: Driver = neo4jConfig.driver()
    val session: Session = driver.session(neo4jConfig.sessionConfig(write))
    Retry.decorateCallable(
        Retry.of("neo4jTransactionRetryPool", retryConfig),
        new Callable[(Driver, Session, Transaction, Result)] {
          override def call(): (Driver, Session, Transaction, Result) = {
            val transaction = session.beginTransaction()
            val result = transaction.run(query, params)
            (driver, session, transaction, result)
          }
        }
      )
      .call()
  }

  def convert(value: AnyRef): AnyRef = value match {
    case m: ZonedDateTime => new Timestamp(DateTimeUtils.fromUTCTime(m.toInstant.toEpochMilli, m.getZone.getId))
    case m: LocalDateTime => new Timestamp(DateTimeUtils.fromUTCTime(m.toInstant(ZoneOffset.UTC).toEpochMilli, "UTC"))
    case m: LocalDate => java.sql.Date.valueOf(m)
    case m: OffsetTime => new Timestamp(m.atDate(LocalDate.ofEpochDay(0)).toInstant.toEpochMilli)
    case _ => value
  }
}
Example 5
Source File: Executor.scala From neo4j-spark-connector with Apache License 2.0 | 5 votes |
package org.neo4j.spark

import java.time.{LocalDate, LocalDateTime, OffsetTime, ZoneOffset, ZonedDateTime}
import java.util
import java.sql.Timestamp

import org.apache.spark.SparkContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.StructType
import org.neo4j.spark.dataframe.CypherTypes
import org.neo4j.spark.utils.{Neo4jSessionAwareIterator, Neo4jUtils}

import scala.collection.JavaConverters._

object Executor {

  def convert(value: AnyRef): Any = value match {
    case it: util.Collection[_] => it.toArray()
    case m: java.util.Map[_, _] => m.asScala
    case _ => Neo4jUtils.convert(value)
  }

  def toJava(parameters: Map[String, Any]): java.util.Map[String, Object] = {
    parameters.mapValues(toJava).asJava
  }

  private def toJava(x: Any): AnyRef = x match {
    case y: Seq[_] => y.asJava
    case _ => x.asInstanceOf[AnyRef]
  }

  val EMPTY = Array.empty[Any]

  val EMPTY_RESULT = new CypherResult(new StructType(), Iterator.empty)

  class CypherResult(val schema: StructType, val rows: Iterator[Array[Any]]) {
    def sparkRows: Iterator[Row] = rows.map(row => new GenericRowWithSchema(row, schema))

    def fields = schema.fieldNames
  }

  def execute(sc: SparkContext, query: String, parameters: Map[String, AnyRef]): CypherResult = {
    execute(Neo4jConfig(sc.getConf), query, parameters)
  }

  private def rows(result: Iterator[_]) = {
    var i = 0
    while (result.hasNext) i = i + 1
    i
  }

  def execute(config: Neo4jConfig, query: String, parameters: Map[String, Any], write: Boolean = false): CypherResult = {
    val result = new Neo4jSessionAwareIterator(config, query, toJava(parameters), write)
    if (!result.hasNext) {
      return EMPTY_RESULT
    }
    val peek = result.peek()
    val keyCount = peek.size()
    if (keyCount == 0) {
      return new CypherResult(new StructType(), Array.fill[Array[Any]](rows(result))(EMPTY).toIterator)
    }
    val keys = peek.keys().asScala
    val fields = keys.map(k => (k, peek.get(k).`type`())).map(keyType => CypherTypes.field(keyType))
    val schema = StructType(fields)
    val it = result.map(record => {
      val row = new Array[Any](keyCount)
      var i = 0
      while (i < keyCount) {
        val value = convert(record.get(i).asObject())
        row.update(i, value)
        i = i + 1
      }
      row
    })
    new CypherResult(schema, it)
  }
}
Example 6
Source File: DataFramePrettyPrinter.scala From lighthouse with Apache License 2.0 | 5 votes |
package be.dataminded.lighthouse.testing

import java.sql.Date

import org.apache.commons.lang3.StringUtils
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.catalyst.util.DateTimeUtils

private[testing] object DataFramePrettyPrinter {

  def prettyPrintDataFrame(df: DataFrame, number: Int, truncate: Int = 20): String = {
    val numRows = number.max(0)
    val takeResult = df.take(numRows + 1)
    val hasMoreData = takeResult.length > numRows
    val data = takeResult.take(numRows)

    val header = df.schema.fieldNames.toSeq

    def asReadableRows = {
      data.map { row =>
        row.toSeq.map { cell =>
          val str = cell match {
            case null => "null"
            case binary: Array[Byte] => binary.map("%02X".format(_)).mkString("[", " ", "]")
            case array: Array[_] => array.mkString("[", ", ", "]")
            case seq: Seq[_] => seq.mkString("[", ", ", "]")
            case d: Date => DateTimeUtils.dateToString(DateTimeUtils.fromJavaDate(d))
            case _ => cell.toString
          }
          if (truncate > 0 && str.length > truncate) {
            // do not show ellipses for strings shorter than 4 characters.
            if (truncate < 4) str.substring(0, truncate)
            else str.substring(0, truncate - 3) + "..."
          } else {
            str
          }
        }: Seq[String]
      }
    }

    // For array values, replace Seq and Array with square brackets
    // For cells that are beyond `truncate` characters, replace it with the
    // first `truncate-3` and "..."
    val rows: Seq[Seq[String]] = header +: asReadableRows

    val sb = new StringBuilder

    // Initialise the width of each column to a minimum value of '3'
    val colWidths = Array.fill(header.length)(3)

    // Compute the width of each column
    for (row <- rows) {
      for ((cell, i) <- row.zipWithIndex) {
        colWidths(i) = math.max(colWidths(i), cell.length)
      }
    }

    // Create SeparateLine
    val sep: String = colWidths.map("-" * _).addString(sb, "+", "+", "+\n").toString()

    // column names
    rows.head.zipWithIndex
      .map { case (cell, i) =>
        if (truncate > 0) {
          StringUtils.leftPad(cell, colWidths(i))
        } else {
          StringUtils.rightPad(cell, colWidths(i))
        }
      }
      .addString(sb, "|", "|", "|\n")

    sb.append(sep)

    // data
    rows.tail.map {
      _.zipWithIndex
        .map { case (cell, i) =>
          if (truncate > 0) {
            StringUtils.leftPad(cell.toString, colWidths(i))
          } else {
            StringUtils.rightPad(cell.toString, colWidths(i))
          }
        }
        .addString(sb, "|", "|", "|\n")
    }

    sb.append(sep)

    // For Data that has more than "numRows" records
    if (hasMoreData) {
      val rowsString = if (numRows == 1) "row" else "rows"
      sb.append(s"only showing top $numRows $rowsString\n")
    }

    sb.toString()
  }
}
Example 7
Source File: JacksonGenerator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.json

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{MapData, ArrayData, DateTimeUtils}

import scala.collection.Map

import com.fasterxml.jackson.core._

import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

private[sql] object JacksonGenerator {

  def apply(rowSchema: StructType, gen: JsonGenerator)(row: InternalRow): Unit = {
    def valWriter: (DataType, Any) => Unit = {
      case (_, null) | (NullType, _) => gen.writeNull()
      case (StringType, v) => gen.writeString(v.toString)
      case (TimestampType, v: Long) => gen.writeString(DateTimeUtils.toJavaTimestamp(v).toString)
      case (IntegerType, v: Int) => gen.writeNumber(v)
      case (ShortType, v: Short) => gen.writeNumber(v)
      case (FloatType, v: Float) => gen.writeNumber(v)
      case (DoubleType, v: Double) => gen.writeNumber(v)
      case (LongType, v: Long) => gen.writeNumber(v)
      case (DecimalType(), v: Decimal) => gen.writeNumber(v.toJavaBigDecimal)
      case (ByteType, v: Byte) => gen.writeNumber(v.toInt)
      case (BinaryType, v: Array[Byte]) => gen.writeBinary(v)
      case (BooleanType, v: Boolean) => gen.writeBoolean(v)
      case (DateType, v: Int) => gen.writeString(DateTimeUtils.toJavaDate(v).toString)
      // For UDT values, they should be in the SQL type's corresponding value type.
      // We should not see values in the user-defined class at here.
      // For example, VectorUDT's SQL type is an array of double. So, we should expect that v is
      // an ArrayData at here, instead of a Vector.
      case (udt: UserDefinedType[_], v) => valWriter(udt.sqlType, v)

      case (ArrayType(ty, _), v: ArrayData) =>
        gen.writeStartArray()
        v.foreach(ty, (_, value) => valWriter(ty, value))
        gen.writeEndArray()

      case (MapType(kt, vt, _), v: MapData) =>
        gen.writeStartObject()
        v.foreach(kt, vt, { (k, v) =>
          gen.writeFieldName(k.toString)
          valWriter(vt, v)
        })
        gen.writeEndObject()

      case (StructType(ty), v: InternalRow) =>
        gen.writeStartObject()
        var i = 0
        while (i < ty.length) {
          val field = ty(i)
          val value = v.get(i, field.dataType)
          if (value != null) {
            gen.writeFieldName(field.name)
            valWriter(field.dataType, value)
          }
          i += 1
        }
        gen.writeEndObject()

      case (dt, v) =>
        sys.error(
          s"Failed to convert value $v (class of ${v.getClass}}) with the type of $dt to JSON.")
    }

    valWriter(rowSchema, row)
  }
}
Example 8
Source File: KinesisRecordToUnsafeRowConverter.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis

import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.unsafe.types.UTF8String

private[kinesis] class KinesisRecordToUnsafeRowConverter {
  private val rowWriter = new UnsafeRowWriter(5)

  def toUnsafeRow(record: Record, streamName: String): UnsafeRow = {
    rowWriter.reset()
    rowWriter.write(0, record.getData.array())
    rowWriter.write(1, UTF8String.fromString(streamName))
    rowWriter.write(2, UTF8String.fromString(record.getPartitionKey))
    rowWriter.write(3, UTF8String.fromString(record.getSequenceNumber))
    rowWriter.write(4, DateTimeUtils.fromJavaTimestamp(
      new java.sql.Timestamp(record.getApproximateArrivalTimestamp.getTime)))
    rowWriter.getRow
  }
}
Example 9
Source File: UnivocityGenerator.scala From mimir with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.ubodin.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
}
Example 10
Source File: ResolvedDataSourceSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("error message for unknown data sources") {
    val error1 = intercept[AnalysisException] {
      getProvidingClass("avro")
    }
    assert(error1.getMessage.contains("Failed to find data source: avro."))

    val error2 = intercept[AnalysisException] {
      getProvidingClass("com.databricks.spark.avro")
    }
    assert(error2.getMessage.contains("Failed to find data source: com.databricks.spark.avro."))

    val error3 = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error3.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 11
Source File: ArrowUtilsSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.arrow

import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

class ArrowUtilsSuite extends SparkFunSuite {

  def roundtrip(dt: DataType): Unit = {
    dt match {
      case schema: StructType =>
        assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema, null)) === schema)
      case _ =>
        roundtrip(new StructType().add("value", dt))
    }
  }

  test("simple") {
    roundtrip(BooleanType)
    roundtrip(ByteType)
    roundtrip(ShortType)
    roundtrip(IntegerType)
    roundtrip(LongType)
    roundtrip(FloatType)
    roundtrip(DoubleType)
    roundtrip(StringType)
    roundtrip(BinaryType)
    roundtrip(DecimalType.SYSTEM_DEFAULT)
    roundtrip(DateType)
    val tsExMsg = intercept[UnsupportedOperationException] {
      roundtrip(TimestampType)
    }
    assert(tsExMsg.getMessage.contains("timeZoneId"))
  }

  test("timestamp") {

    def roundtripWithTz(timeZoneId: String): Unit = {
      val schema = new StructType().add("value", TimestampType)
      val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
      val fieldType = arrowSchema.findField("value").getType.asInstanceOf[ArrowType.Timestamp]
      assert(fieldType.getTimezone() === timeZoneId)
      assert(ArrowUtils.fromArrowSchema(arrowSchema) === schema)
    }

    roundtripWithTz(DateTimeUtils.defaultTimeZone().getID)
    roundtripWithTz("Asia/Tokyo")
    roundtripWithTz("UTC")
    roundtripWithTz("America/Los_Angeles")
  }

  test("array") {
    roundtrip(ArrayType(IntegerType, containsNull = true))
    roundtrip(ArrayType(IntegerType, containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = false))
  }

  test("struct") {
    roundtrip(new StructType())
    roundtrip(new StructType().add("i", IntegerType))
    roundtrip(new StructType().add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add(
      "struct",
      new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType))))
  }
}
Example 12
Source File: UnivocityGenerator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
}
Example 13
Source File: ComputeCurrentTimeSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()), LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
}
Example 14
Source File: JacksonGeneratorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.json

import java.io.CharArrayWriter

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
import org.apache.spark.sql.types._

class JacksonGeneratorSuite extends SparkFunSuite {

  val gmtId = DateTimeUtils.TimeZoneGMT.getID
  val option = new JSONOptions(Map.empty, gmtId)

  test("initial with StructType and write out a row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with StructType and write out rows") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(1) :: InternalRow(2) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"a":2}]""")
  }

  test("initial with StructType and write out an array with single empty row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(null) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{}]""")
  }

  test("initial with StructType and write out an empty array") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[]""")
  }

  test("initial with Map and write out a map data") {
    val dataType = MapType(StringType, IntegerType)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with Map and write out an array of maps") {
    val dataType = MapType(StringType, IntegerType)
    val input = new GenericArrayData(
      ArrayBasedMapData(Map("a" -> 1)) :: ArrayBasedMapData(Map("b" -> 2)) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"b":2}]""")
  }

  test("error handling: initial with StructType but error calling write a map") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

  test("error handling: initial with MapType and write out a row") {
    val dataType = MapType(StringType, IntegerType)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }
}
Example 15
Source File: LitTests.scala From frameless with Apache License 2.0 | 5 votes |
package frameless

import frameless.functions.lit
import org.scalacheck.Prop
import org.scalacheck.Prop._

class LitTests extends TypedDatasetSuite {

  def prop[A: TypedEncoder](value: A): Prop = {
    val df: TypedDataset[Int] = TypedDataset.create(1 :: Nil)

    // filter forces whole codegen
    val elems = df.deserialized.filter((_: Int) => true).select(lit(value))
      .collect()
      .run()
      .toVector

    // otherwise it uses local relation
    val localElems = df.select(lit(value))
      .collect()
      .run()
      .toVector

    (localElems ?= Vector(value)) && (elems ?= Vector(value))
  }

  test("select(lit(...))") {
    check(prop[Int] _)
    check(prop[Long] _)
    check(prop[String] _)
    check(prop[SQLDate] _)

    check(prop[Option[Int]] _)
    check(prop[Option[String]] _)

    check(prop[Vector[Long]] _)
    check(prop[Vector[X1[Long]]] _)

    check(prop[Vector[String]] _)
    check(prop[Vector[X1[String]]] _)

    check(prop[X1[Int]] _)
    check(prop[X1[X1[Int]]] _)

    check(prop[Food] _)

    // doesn't work, object has to be serializable
    // check(prop[frameless.LocalDateTime] _)
  }

  test("#205: comparing literals encoded using Injection") {
    import org.apache.spark.sql.catalyst.util.DateTimeUtils
    implicit val dateAsInt: Injection[java.sql.Date, Int] =
      Injection(DateTimeUtils.fromJavaDate, DateTimeUtils.toJavaDate)

    val today = new java.sql.Date(System.currentTimeMillis)
    val data = Vector(P(42, today))
    val tds = TypedDataset.create(data)

    tds.filter(tds('d) === today).collect().run()
  }
}

final case class P(i: Int, d: java.sql.Date)
Example 16
Source File: ComputeCurrentTimeSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()), LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
}
Example 17
Source File: MergeProjection.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol: String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }
    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name,
              sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output ++ Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol,
          sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
}
Example 18
Source File: ComputeCurrentTimeSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()), LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
}
Example 19
Source File: ColumnarLiterals.scala From OAP with Apache License 2.0 | 5 votes |
package com.intel.sparkColumnarPlugin.expression

import com.google.common.collect.Lists

import org.apache.arrow.gandiva.evaluator._
import org.apache.arrow.gandiva.exceptions.GandivaException
import org.apache.arrow.gandiva.expression._
import org.apache.arrow.vector.types.pojo.ArrowType
import org.apache.arrow.vector.types.pojo.Field
import org.apache.arrow.vector.types.DateUnit

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

import scala.collection.mutable.ListBuffer

class ColumnarLiteral(lit: Literal)
    extends Literal(lit.value, lit.dataType)
    with ColumnarExpression {

  override def doColumnarCodeGen(args: java.lang.Object): (TreeNode, ArrowType) = {
    val resultType = CodeGeneration.getResultType(dataType)
    dataType match {
      case t: StringType =>
        (TreeBuilder.makeStringLiteral(value.toString().asInstanceOf[String]), resultType)
      case t: IntegerType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[Integer]), resultType)
      case t: LongType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Long]), resultType)
      case t: DoubleType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Double]), resultType)
      case d: DecimalType =>
        val v = value.asInstanceOf[Decimal]
        (TreeBuilder.makeDecimalLiteral(v.toString, v.precision, v.scale), resultType)
      case d: DateType =>
        val origIntNode = TreeBuilder.makeLiteral(value.asInstanceOf[Integer])
        val dateNode = TreeBuilder.makeFunction(
          "castDATE", Lists.newArrayList(origIntNode), new ArrowType.Date(DateUnit.DAY))
        (dateNode, new ArrowType.Date(DateUnit.DAY))
      case b: BooleanType =>
        (TreeBuilder.makeLiteral(value.asInstanceOf[java.lang.Boolean]), resultType)
    }
  }
}
Example 20
Source File: TimestampExpressionSuite.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import java.sql.Timestamp

import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types.{DateType, IntegerType}
import org.scalatest.FunSuite

class TimestampExpressionSuite extends FunSuite with ExpressionEvalHelper {

  test("add_seconds") {
    // scalastyle:off magic.number
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:11:33")), Literal(28)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 00:12:01")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2015-01-01 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-01 00:00:00")), Literal(-1)),
      DateTimeUtils.fromJavaTimestamp(Timestamp.valueOf("2014-12-31 23:59:59")))
    checkEvaluation(AddSeconds(Literal(Timestamp.valueOf("2015-01-02 00:00:00")),
      Literal.create(null, IntegerType)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal(1)), null)
    checkEvaluation(AddSeconds(Literal.create(null, DateType), Literal.create(null, IntegerType)), null)
  }
}
Example 21
Source File: AddSeconds.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.types._
import org.apache.spark.sql.catalyst.util.DateTimeUtils

case class AddSeconds(timestamp: Expression, seconds: Expression)
  extends BinaryExpression
  with ImplicitCastInputTypes
  with CodegenFallback {

  override def inputTypes: Seq[AbstractDataType] = Seq(TimestampType, IntegerType)

  override def nullSafeEval(microseconds: Any, seconds: Any): Any = {
    microseconds.asInstanceOf[DateTimeUtils.SQLTimestamp] +
      (seconds.asInstanceOf[Int] * DateTimeUtils.MICROS_PER_SECOND)
  }

  override def left: Expression = timestamp

  override def right: Expression = seconds

  override def dataType: DataType = TimestampType
}
Example 22
Source File: dateExpressions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

case class AddYears(date: Expression, years: Expression)
  extends BinaryExpression
  with ImplicitCastInputTypes
  with CodegenFallback {

  override def inputTypes: Seq[AbstractDataType] = Seq(DateType, IntegerType)

  override def nullSafeEval(d: Any, y: Any): Any = {
    DateTimeUtils.dateAddMonths(
      d.asInstanceOf[DateTimeUtils.SQLDate],
      y.asInstanceOf[Int] * 12
    )
  }

  override def left: Expression = date

  override def right: Expression = years

  override def dataType: DataType = DateType
}
Example 23
Source File: ResolvedDataSourceSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.execution.datasources.DataSource
import org.apache.spark.sql.test.SharedSQLContext

class ResolvedDataSourceSuite extends SparkFunSuite with SharedSQLContext {
  private def getProvidingClass(name: String): Class[_] =
    DataSource(
      sparkSession = spark,
      className = name,
      options = Map(DateTimeUtils.TIMEZONE_OPTION -> DateTimeUtils.defaultTimeZone().getID)
    ).providingClass

  test("jdbc") {
    assert(
      getProvidingClass("jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
    assert(
      getProvidingClass("org.apache.spark.sql.jdbc") ===
        classOf[org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider])
  }

  test("json") {
    assert(
      getProvidingClass("json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.json") ===
        classOf[org.apache.spark.sql.execution.datasources.json.JsonFileFormat])
  }

  test("parquet") {
    assert(
      getProvidingClass("parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.execution.datasources.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
    assert(
      getProvidingClass("org.apache.spark.sql.parquet") ===
        classOf[org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat])
  }

  test("csv") {
    assert(
      getProvidingClass("csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
    assert(
      getProvidingClass("com.databricks.spark.csv") ===
        classOf[org.apache.spark.sql.execution.datasources.csv.CSVFileFormat])
  }

  test("avro: show deploy guide for loading the external avro module") {
    Seq("avro", "org.apache.spark.sql.avro").foreach { provider =>
      val message = intercept[AnalysisException] {
        getProvidingClass(provider)
      }.getMessage
      assert(message.contains(s"Failed to find data source: $provider"))
      assert(message.contains("Please deploy the application as per the deployment section of"))
    }
  }

  test("kafka: show deploy guide for loading the external kafka module") {
    val message = intercept[AnalysisException] {
      getProvidingClass("kafka")
    }.getMessage
    assert(message.contains("Failed to find data source: kafka"))
    assert(message.contains("Please deploy the application as per the deployment section of"))
  }

  test("error message for unknown data sources") {
    val error = intercept[ClassNotFoundException] {
      getProvidingClass("asfdwefasdfasdf")
    }
    assert(error.getMessage.contains("Failed to find data source: asfdwefasdfasdf."))
  }
}
Example 24
Source File: ArrowUtilsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.arrow

import org.apache.arrow.vector.types.pojo.{ArrowType, Field, FieldType, Schema}

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

class ArrowUtilsSuite extends SparkFunSuite {

  def roundtrip(dt: DataType): Unit = {
    dt match {
      case schema: StructType =>
        assert(ArrowUtils.fromArrowSchema(ArrowUtils.toArrowSchema(schema, null)) === schema)
      case _ =>
        roundtrip(new StructType().add("value", dt))
    }
  }

  test("simple") {
    roundtrip(BooleanType)
    roundtrip(ByteType)
    roundtrip(ShortType)
    roundtrip(IntegerType)
    roundtrip(LongType)
    roundtrip(FloatType)
    roundtrip(DoubleType)
    roundtrip(StringType)
    roundtrip(BinaryType)
    roundtrip(DecimalType.SYSTEM_DEFAULT)
    roundtrip(DateType)
    val tsExMsg = intercept[UnsupportedOperationException] {
      roundtrip(TimestampType)
    }
    assert(tsExMsg.getMessage.contains("timeZoneId"))
  }

  test("timestamp") {

    def roundtripWithTz(timeZoneId: String): Unit = {
      val schema = new StructType().add("value", TimestampType)
      val arrowSchema = ArrowUtils.toArrowSchema(schema, timeZoneId)
      val fieldType = arrowSchema.findField("value").getType.asInstanceOf[ArrowType.Timestamp]
      assert(fieldType.getTimezone() === timeZoneId)
      assert(ArrowUtils.fromArrowSchema(arrowSchema) === schema)
    }

    roundtripWithTz(DateTimeUtils.defaultTimeZone().getID)
    roundtripWithTz("Asia/Tokyo")
    roundtripWithTz("UTC")
    roundtripWithTz("America/Los_Angeles")
  }

  test("array") {
    roundtrip(ArrayType(IntegerType, containsNull = true))
    roundtrip(ArrayType(IntegerType, containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = true))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = true), containsNull = false))
    roundtrip(ArrayType(ArrayType(IntegerType, containsNull = false), containsNull = false))
  }

  test("struct") {
    roundtrip(new StructType())
    roundtrip(new StructType().add("i", IntegerType))
    roundtrip(new StructType().add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType)))
    roundtrip(new StructType().add(
      "struct",
      new StructType().add("i", IntegerType).add("arr", ArrayType(IntegerType))))
  }
}
Example 25
Source File: MetricsReporter.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.text.SimpleDateFormat

import com.codahale.metrics.{Gauge, MetricRegistry}

import org.apache.spark.internal.Logging
import org.apache.spark.metrics.source.{Source => CodahaleSource}
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.streaming.StreamingQueryProgress

class MetricsReporter(
    stream: StreamExecution,
    override val sourceName: String) extends CodahaleSource with Logging {

  override val metricRegistry: MetricRegistry = new MetricRegistry

  // Metric names should not have . in them, so that all the metrics of a query are identified
  // together in Ganglia as a single metric group
  registerGauge("inputRate-total", _.inputRowsPerSecond, 0.0)
  registerGauge("processingRate-total", _.processedRowsPerSecond, 0.0)
  registerGauge("latency", _.durationMs.get("triggerExecution").longValue(), 0L)

  private val timestampFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") // ISO8601
  timestampFormat.setTimeZone(DateTimeUtils.getTimeZone("UTC"))

  registerGauge("eventTime-watermark",
    progress => convertStringDateToMillis(progress.eventTime.get("watermark")), 0L)

  registerGauge("states-rowsTotal", _.stateOperators.map(_.numRowsTotal).sum, 0L)
  registerGauge("states-usedBytes", _.stateOperators.map(_.memoryUsedBytes).sum, 0L)

  private def convertStringDateToMillis(isoUtcDateStr: String) = {
    if (isoUtcDateStr != null) {
      timestampFormat.parse(isoUtcDateStr).getTime
    } else {
      0L
    }
  }

  private def registerGauge[T](
      name: String,
      f: StreamingQueryProgress => T,
      default: T): Unit = {
    synchronized {
      metricRegistry.register(name, new Gauge[T] {
        override def getValue: T = Option(stream.lastProgress).map(f).getOrElse(default)
      })
    }
  }
}
Example 26
Source File: UnivocityGenerator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources.csv

import java.io.Writer

import com.univocity.parsers.csv.CsvWriter

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.DateTimeUtils
import org.apache.spark.sql.types._

private[csv] class UnivocityGenerator(
    schema: StructType,
    writer: Writer,
    options: CSVOptions) {
  private val writerSettings = options.asWriterSettings
  writerSettings.setHeaders(schema.fieldNames: _*)
  private val gen = new CsvWriter(writer, writerSettings)
  private var printHeader = options.headerFlag

  // A `ValueConverter` is responsible for converting a value of an `InternalRow` to `String`.
  // When the value is null, this converter should not be called.
  private type ValueConverter = (InternalRow, Int) => String

  // `ValueConverter`s for all values in the fields of the schema
  private val valueConverters: Array[ValueConverter] =
    schema.map(_.dataType).map(makeConverter).toArray

  private def makeConverter(dataType: DataType): ValueConverter = dataType match {
    case DateType =>
      (row: InternalRow, ordinal: Int) =>
        options.dateFormat.format(DateTimeUtils.toJavaDate(row.getInt(ordinal)))

    case TimestampType =>
      (row: InternalRow, ordinal: Int) =>
        options.timestampFormat.format(DateTimeUtils.toJavaTimestamp(row.getLong(ordinal)))

    case udt: UserDefinedType[_] => makeConverter(udt.sqlType)

    case dt: DataType =>
      (row: InternalRow, ordinal: Int) =>
        row.get(ordinal, dt).toString
  }

  private def convertRow(row: InternalRow): Seq[String] = {
    var i = 0
    val values = new Array[String](row.numFields)
    while (i < row.numFields) {
      if (!row.isNullAt(i)) {
        values(i) = valueConverters(i).apply(row, i)
      } else {
        values(i) = options.nullValue
      }
      i += 1
    }
    values
  }

  def write(row: InternalRow): Unit = {
    if (printHeader) {
      gen.writeHeaders()
    }
    gen.writeRow(convertRow(row): _*)
    printHeader = false
  }

  def close(): Unit = gen.close()

  def flush(): Unit = gen.flush()
}
Example 27
Source File: ComputeCurrentTimeSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions.{Alias, CurrentDate, CurrentTimestamp, Literal}
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan, Project}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.catalyst.util.DateTimeUtils

class ComputeCurrentTimeSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Seq(Batch("ComputeCurrentTime", Once, ComputeCurrentTime))
  }

  test("analyzer should replace current_timestamp with literals") {
    val in = Project(Seq(Alias(CurrentTimestamp(), "a")(), Alias(CurrentTimestamp(), "b")()), LocalRelation())

    val min = System.currentTimeMillis() * 1000
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = (System.currentTimeMillis() + 1) * 1000

    val lits = new scala.collection.mutable.ArrayBuffer[Long]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Long]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }

  test("analyzer should replace current_date with literals") {
    val in = Project(Seq(Alias(CurrentDate(), "a")(), Alias(CurrentDate(), "b")()), LocalRelation())

    val min = DateTimeUtils.millisToDays(System.currentTimeMillis())
    val plan = Optimize.execute(in.analyze).asInstanceOf[Project]
    val max = DateTimeUtils.millisToDays(System.currentTimeMillis())

    val lits = new scala.collection.mutable.ArrayBuffer[Int]
    plan.transformAllExpressions { case e: Literal =>
      lits += e.value.asInstanceOf[Int]
      e
    }
    assert(lits.size == 2)
    assert(lits(0) >= min && lits(0) <= max)
    assert(lits(1) >= min && lits(1) <= max)
    assert(lits(0) == lits(1))
  }
}
Example 28
Source File: JacksonGeneratorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.json

import java.io.CharArrayWriter

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, DateTimeUtils, GenericArrayData}
import org.apache.spark.sql.types._

class JacksonGeneratorSuite extends SparkFunSuite {

  val gmtId = DateTimeUtils.TimeZoneGMT.getID
  val option = new JSONOptions(Map.empty, gmtId)

  test("initial with StructType and write out a row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with StructType and write out rows") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(1) :: InternalRow(2) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"a":2}]""")
  }

  test("initial with StructType and write out an array with single empty row") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(InternalRow(null) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{}]""")
  }

  test("initial with StructType and write out an empty array") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = new GenericArrayData(Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[]""")
  }

  test("initial with Map and write out a map data") {
    val dataType = MapType(StringType, IntegerType)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """{"a":1}""")
  }

  test("initial with Map and write out an array of maps") {
    val dataType = MapType(StringType, IntegerType)
    val input = new GenericArrayData(
      ArrayBasedMapData(Map("a" -> 1)) :: ArrayBasedMapData(Map("b" -> 2)) :: Nil)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    gen.write(input)
    gen.flush()
    assert(writer.toString === """[{"a":1},{"b":2}]""")
  }

  test("error handling: initial with StructType but error calling write a map") {
    val dataType = StructType(StructField("a", IntegerType) :: Nil)
    val input = ArrayBasedMapData(Map("a" -> 1))
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }

  test("error handling: initial with MapType and write out a row") {
    val dataType = MapType(StringType, IntegerType)
    val input = InternalRow(1)
    val writer = new CharArrayWriter()
    val gen = new JacksonGenerator(dataType, writer, option)
    intercept[UnsupportedOperationException] {
      gen.write(input)
    }
  }
}