org.apache.spark.sql.types.LongType Scala Examples
The following examples show how to use org.apache.spark.sql.types.LongType.
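Before diving into the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; names are illustrative) of the two patterns that recur throughout them: declaring LongType in an explicit schema and casting an existing column to LongType.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object LongTypeSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("LongTypeSketch").getOrCreate()
  import spark.implicits._

  // LongType inside an explicit schema
  val schema = StructType(Seq(
    StructField("id", LongType, nullable = false),
    StructField("name", StringType, nullable = true)))
  val empty = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], schema)
  empty.printSchema() // id: long (nullable = false), name: string

  // Casting an existing string column to LongType
  val casted = Seq(("1", "a"), ("2", "b")).toDF("id", "name")
    .withColumn("id", $"id".cast(LongType))
  casted.printSchema() // id: long (nullable = true), name: string

  spark.stop()
}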
Example 1
Source File: GroupedIteratorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 2
Source File: SQLTransformerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.types.{LongType, StructField, StructType} import org.apache.spark.storage.StorageLevel class SQLTransformerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") val resultSchema = sqlTrans.transformSchema(original.schema) testTransformerByGlobalCheckFunc[(Int, Double, Double)]( original, sqlTrans, "id", "v1", "v2", "v3", "v4") { rows => assert(rows.head.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(rows == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } test("SPARK-22538: SQLTransformer should not unpersist given dataset") { val df = spark.range(10).toDF() df.cache() df.count() assert(df.storageLevel != StorageLevel.NONE) val sqlTrans = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => } assert(df.storageLevel != StorageLevel.NONE) } }
Example 3
Source File: Mean.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric} import org.apache.spark.sql.{Column, Row} import org.apache.spark.sql.functions.{count, sum} import org.apache.spark.sql.types.{DoubleType, StructType, LongType} import Analyzers._ case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] { override def sum(other: MeanState): MeanState = { MeanState(sum + other.sum, count + other.count) } override def metricValue(): Double = { if (count == 0L) Double.NaN else sum / count } } case class Mean(column: String, where: Option[String] = None) extends StandardScanShareableAnalyzer[MeanState]("Mean", column) with FilterableAnalyzer { override def aggregationFunctions(): Seq[Column] = { sum(conditionalSelection(column, where)).cast(DoubleType) :: count(conditionalSelection(column, where)).cast(LongType) :: Nil } override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = { ifNoNullsIn(result, offset, howMany = 2) { _ => MeanState(result.getDouble(offset), result.getLong(offset + 1)) } } override protected def additionalPreconditions(): Seq[StructType => Unit] = { hasColumn(column) :: isNumeric(column) :: Nil } override def filterCondition: Option[String] = where }
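The MeanState above keeps a running (sum, count) pair so that per-partition results can be merged before the final division. A small sketch of that merge-then-evaluate flow, assuming the deequ classes shown above are on the classpath:

import com.amazon.deequ.analyzers.MeanState

// Two partial states, e.g. from two partitions of the scanned column.
val left = MeanState(sum = 10.0, count = 4L)
val right = MeanState(sum = 2.0, count = 1L)

// `sum` here is the state-merging method defined above, not the numeric field.
val merged = left.sum(right)  // MeanState(12.0, 5)
println(merged.metricValue()) // 2.4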
Example 4
Source File: CustomSchemaTest.scala From spark-sftp with Apache License 2.0 | 5 votes |
package com.springml.spark.sftp import org.apache.spark.sql.SparkSession import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, _} import org.scalatest.{BeforeAndAfterEach, FunSuite} class CustomSchemaTest extends FunSuite with BeforeAndAfterEach { var ss: SparkSession = _ val csvTypesMap = Map("ProposalId" -> IntegerType, "OpportunityId" -> StringType, "Clicks" -> LongType, "Impressions" -> LongType ) val jsonTypesMap = Map("name" -> StringType, "age" -> IntegerType ) override def beforeEach() { ss = SparkSession.builder().master("local").appName("Custom Schema Test").getOrCreate() } private def validateTypes(field : StructField, typeMap : Map[String, DataType]) = { val expectedType = typeMap(field.name) assert(expectedType == field.dataType) } private def columnArray(typeMap : Map[String, DataType]) : Array[StructField] = { val columns = typeMap.map(x => new StructField(x._1, x._2, true)) val columnStruct = Array[StructField] () columns.copyToArray(columnStruct) columnStruct } test ("Read CSV with custom schema") { val columnStruct = columnArray(csvTypesMap) val expectedSchema = StructType(columnStruct) val fileLocation = getClass.getResource("/sample.csv").getPath val dsr = DatasetRelation(fileLocation, "csv", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) val rdd = dsr.buildScan() assert(dsr.schema.fields.length == columnStruct.length) dsr.schema.fields.foreach(s => validateTypes(s, csvTypesMap)) } test ("Read Json with custom schema") { val columnStruct = columnArray(jsonTypesMap) val expectedSchema = StructType(columnStruct) val fileLocation = getClass.getResource("/people.json").getPath val dsr = DatasetRelation(fileLocation, "json", "false", "true", ",", "\"", "\\", "false", null, expectedSchema, ss.sqlContext) val rdd = dsr.buildScan() assert(dsr.schema.fields.length == columnStruct.length) dsr.schema.fields.foreach(s => validateTypes(s, jsonTypesMap)) } }
Example 5
Source File: FreqStatsTransformerSpec.scala From pravda-ml with Apache License 2.0 | 5 votes |
package odkl.analysis.spark.texts import odkl.analysis.spark.TestEnv import org.apache.spark.ml.odkl.texts.FreqStatsTransformer import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, LongType, StringType, StructType} import org.scalatest.FlatSpec class FreqStatsTransformerSpec extends FlatSpec with TestEnv with org.scalatest.Matchers { "FreqStatsTransformer" should "count freq" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") val schema = new StructType().add("data",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c"),Seq[String]("a","b","a"))) .map(f => {Row(f)}), schema) val correctAns = Array[(String,Double)](("a",2D/5D),("b",2D/5D),("c",1D/5D)) val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))}) assertResult(correctAns)(realAns) } "FreqStatsTransformer" should "filter freq by uni and bi treshold" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") .setTresholdArr(Array[Double](1.5D/8D,1.1D/8D)) val schema = new StructType().add("data",ArrayType(StringType,true)) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq[String]("a","b","c","c a", "c a"),Seq[String]("a","b","a", "c a", "a b"))) .map(f => {Row(f)}), schema) val correctAns = Array[(String,Double)](("a",2D/8D),("b",2D/8D),("c a",2D/8D)) val realAnsDF = fTransformer.transform(inDF).sort("Term") val realAns = realAnsDF.collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"))}) assertResult(correctAns)(realAns) } "FreqStatsTransformer" should "extract max timestamp by term" in { val fTransformer = new FreqStatsTransformer() .setInputDataCol("data") .setOutputColFreq("Freq") .setOutputColTerm("Term") .setWithTimestamp(true) .setTimestampColumnName("timestamp") .setTresholdArr(Array[Double](1D/8D,1.1D/8D)) val schema = new StructType().add("data",ArrayType(StringType,true)).add("timestamp",LongType) val inDF = sqlc.createDataFrame( sc.parallelize(Seq(Seq(Seq[String]("a","c","c a", "c a"),100L),Seq(Seq[String]("c a", "a b"),150L),Seq(Seq[String]("b"),200L))) .map(f => {Row.fromSeq(f)}), schema) inDF.collect() val correctAns = Array[(String,Double,Long)](("a",1D/6D,100L),("a b",1D/6D, 150L),("b",1D/6D,200L), ("c",1D/6D, 100L),("c a",2D/6D, 150L)) val realAns = fTransformer.transform(inDF).sort("Term").collect().map(f =>{(f.getAs[String]("Term"),f.getAs[Double]("Freq"),f.getAs[Long]("timestamp"))}) assertResult(correctAns)(realAns) assertResult(correctAns(1))(realAns(1)) } }
Example 6
Source File: HashBasedDeduplicatorSpec.scala From pravda-ml with Apache License 2.0 | 5 votes |
package odkl.analysis.spark.texts import odkl.analysis.spark.TestEnv import org.apache.spark.ml.odkl.texts.HashBasedDeduplicator import org.apache.spark.ml.linalg.{VectorUDT, Vectors} import org.apache.spark.ml.odkl.MatrixUtils import org.apache.spark.sql.Row import org.apache.spark.sql.types.{LongType, StringType, StructType} import org.scalatest.FlatSpec class HashBasedDeduplicatorSpec extends FlatSpec with TestEnv with org.scalatest.Matchers { "cotrect HashBasedDeduplicator " should " remove similar vectors based on hash " in { val vectorsSize = 10000 val vector1 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 1.0)), 1L, "vector1") val vector2 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 1.0, 0.0)), 1L, "vector2") val vector3 = (Vectors.sparse(vectorsSize, Array(5, 6, 7), Array(1.0, 0.0, 1.0)), 2L, "vector3") //pretty similar, but in 2nd bucket val vector4 = (Vectors.sparse(vectorsSize, Array(1, 2), Array(1.0, 1.0)), 1L, "vector4") //completly another but in 1-st bucket val schema = new StructType() .add("vector", MatrixUtils.vectorUDT) .add("hash", LongType) .add("alias", StringType) val dataFrame = sqlc.createDataFrame(sc.parallelize(Seq(vector1, vector2, vector3, vector4).map(Row.fromTuple(_))), schema) val deduplicator = new HashBasedDeduplicator() .setInputColHash("hash") .setInputColVector("vector") .setSimilarityTreshold(0.80) val answer = deduplicator.transform(dataFrame) .collect().map(row => (row.getLong(1), row.getString(2))) assert(answer.exists(_._2 == "vector1")) //should stay assert(!answer.exists(_._2 == "vector2")) //should be removed assert(answer.exists(_._2 == "vector3")) //should stay cause in other bucket (FalseNegative) assert(answer.exists(_._2 == "vector4")) //should stay cause different (FalsePositive) } }
Example 7
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import java.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{LongType, StructType} def setDim(value: Long): this.type = set(dim, value) def this() = this(Identifiable.randomUID("randomProjectionsHasher")) override def transform(dataset: Dataset[_]): DataFrame = { val dimensity = { if (!isSet(dim)) {//If dimensions is not set - will search AttributeGroup in metadata as it comes from OdklCountVectorizer val vectorsIndex = dataset.schema.fieldIndex($(inputCol)) AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size } else { $(dim).toInt } } val projectionMatrix = dataset.sqlContext.sparkContext.broadcast( Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix]) //the matrix of random vectors to costruct hash val binHashSparseVectorColumn = udf((vector: Vector) => { projectionMatrix.value.multiply(vector).values .map(f => if (f>0) 1L else 0L) .view.zipWithIndex .foldLeft(0L) {case (acc,(v, i)) => acc | (v << i) } }) dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), LongType) } }
Example 8
Source File: DecimalExpressionSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{LongType, DecimalType, Decimal} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { //非标准的值 test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } //十进制 test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } //提高精度 test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } //检查溢出 test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
Example 9
Source File: MonotonicallyIncreasingID.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.primitive} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
Example 10
Source File: ArrangePostprocessor.scala From DataQuality with GNU Lesser General Public License v3.0 | 5 votes |
package it.agilelab.bigdata.DataQuality.postprocessors import com.typesafe.config.Config import it.agilelab.bigdata.DataQuality.checks.CheckResult import it.agilelab.bigdata.DataQuality.metrics.MetricResult import it.agilelab.bigdata.DataQuality.sources.HdfsFile import it.agilelab.bigdata.DataQuality.targets.HdfsTargetConfig import it.agilelab.bigdata.DataQuality.utils import it.agilelab.bigdata.DataQuality.utils.DQSettings import it.agilelab.bigdata.DataQuality.utils.io.{HdfsReader, HdfsWriter} import org.apache.hadoop.fs.FileSystem import org.apache.spark.sql.types.{DoubleType, IntegerType, LongType, NumericType} import org.apache.spark.sql.{Column, DataFrame, SQLContext} import scala.collection.JavaConversions._ final class ArrangePostprocessor(config: Config, settings: DQSettings) extends BasicPostprocessor(config, settings) { private case class ColumnSelector(name: String, tipo: Option[String] = None, format: Option[String] = None, precision: Option[Integer] = None) { def toColumn()(implicit df: DataFrame): Column = { val dataType: Option[NumericType with Product with Serializable] = tipo.getOrElse("").toUpperCase match { case "DOUBLE" => Some(DoubleType) case "INT" => Some(IntegerType) case "LONG" => Some(LongType) case _ => None } import org.apache.spark.sql.functions.format_number import org.apache.spark.sql.functions.format_string (dataType, precision, format) match { case (Some(dt), None, None) => df(name).cast(dt) case(Some(dt), None, Some(f)) => format_string(f, df(name).cast(dt)).alias(name) case (Some(dt), Some(p),None) => format_number(df(name).cast(dt), p).alias(name) case (None, Some(p), None) => format_number(df(name), p).alias(name) case (None, None, Some(f)) => format_string(f, df(name)).alias(name) case _ => df(name) } } } private val vs = config.getString("source") private val target: HdfsTargetConfig = { val conf = config.getConfig("saveTo") utils.parseTargetConfig(conf)(settings).get } private val columns: Seq[ColumnSelector] = config.getAnyRefList("columnOrder").map { case x: String => ColumnSelector(x) case x: java.util.HashMap[_, String] => { val (name, v) = x.head.asInstanceOf[String Tuple2 _] v match { case v: String => ColumnSelector(name, Option(v)) case v: java.util.HashMap[String, _] => { val k = v.head._1 val f = v.head._2 f match { case f: Integer => ColumnSelector(name, Option(k), None, Option(f)) case f: String => ColumnSelector(name, Option(k), Option(f)) } } } } } override def process(vsRef: Set[HdfsFile], metRes: Seq[MetricResult], chkRes: Seq[CheckResult])( implicit fs: FileSystem, sqlContext: SQLContext, settings: DQSettings): HdfsFile = { val reqVS: HdfsFile = vsRef.filter(vr => vr.id == vs).head implicit val df: DataFrame = HdfsReader.load(reqVS, settings.ref_date).head val arrangeDF = df.select(columns.map(_.toColumn): _*) HdfsWriter.saveVirtualSource(arrangeDF, target, settings.refDateString)( fs, sqlContext.sparkContext) new HdfsFile(target) } }
Example 11
Source File: SchemaColumnRandom.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns

import java.sql.{Date, Timestamp}
import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{to_utc_timestamp, round, rand, from_unixtime, to_date}
import org.apache.spark.sql.types.{IntegerType, LongType}

trait SchemaColumnRandom[T] extends SchemaColumn

object SchemaColumnRandom {
  val FloatDP = 3
  val DoubleDP = 3

  def apply(name: String, min: Int, max: Int): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Long, max: Long): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Float, max: Float): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Double, max: Double): SchemaColumn = SchemaColumnRandomNumeric(name, min, max)
  def apply(name: String, min: Date, max: Date): SchemaColumn = SchemaColumnRandomDate(name, min, max)
  def apply(name: String, min: Timestamp, max: Timestamp): SchemaColumn = SchemaColumnRandomTimestamp(name, min, max)
  def apply(name: String): SchemaColumn = SchemaColumnRandomBoolean(name)
}

private case class SchemaColumnRandomNumeric[T: Numeric](override val name: String, min: T, max: T) extends SchemaColumnRandom[T] {
  override def column(rowID: Option[Column] = None): Column = {
    import Numeric.Implicits._
    (min, max) match {
      case (_: Int, _: Int) => round(rand() * (max - min) + min, 0).cast(IntegerType)
      case (_: Long, _: Long) => round(rand() * (max - min) + min, 0).cast(LongType)
      case (_: Float, _: Float) => round(rand() * (max - min) + min, SchemaColumnRandom.FloatDP)
      case (_: Double, _: Double) => round(rand() * (max - min) + min, SchemaColumnRandom.DoubleDP)
    }
  }
}

private case class SchemaColumnRandomTimestamp(override val name: String, min: Timestamp, max: Timestamp) extends SchemaColumnRandom[Timestamp] {
  override def column(rowID: Option[Column] = None): Column = {
    val minTime = min.getTime / 1000
    val maxTime = max.getTime / 1000
    to_utc_timestamp(from_unixtime(rand() * (maxTime - minTime) + minTime), "UTC")
  }
}

private case class SchemaColumnRandomDate(override val name: String, min: Date, max: Date) extends SchemaColumnRandom[Date] {
  val timestamp = SchemaColumnRandomTimestamp(name, new Timestamp(min.getTime), new Timestamp(max.getTime + 86400000))
  override def column(rowID: Option[Column] = None): Column = to_date(timestamp.column())
}

private case class SchemaColumnRandomBoolean(override val name: String) extends SchemaColumnRandom[Boolean] {
  override def column(rowID: Option[Column] = None): Column = rand() < 0.5f
}

object SchemaColumnRandomProtocol extends SchemaColumnRandomProtocol

trait SchemaColumnRandomProtocol extends YamlParserProtocol {
  import net.jcazevedo.moultingyaml._

  implicit object SchemaColumnRandomFormat extends YamlFormat[SchemaColumnRandom[_]] {
    override def read(yaml: YamlValue): SchemaColumnRandom[_] = {
      val fields = yaml.asYamlObject.fields
      val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set"))
      val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError(s"data_type not set for $name"))

      if (dataType == SchemaColumnDataType.Boolean) {
        SchemaColumnRandomBoolean(name)
      } else {
        val min = fields.getOrElse(YamlString("min"), deserializationError(s"min not set for $name"))
        val max = fields.getOrElse(YamlString("max"), deserializationError(s"max not set for $name"))
        dataType match {
          case SchemaColumnDataType.Int => SchemaColumnRandomNumeric(name, min.convertTo[Int], max.convertTo[Int])
          case SchemaColumnDataType.Long => SchemaColumnRandomNumeric(name, min.convertTo[Long], max.convertTo[Long])
          case SchemaColumnDataType.Float => SchemaColumnRandomNumeric(name, min.convertTo[Float], max.convertTo[Float])
          case SchemaColumnDataType.Double => SchemaColumnRandomNumeric(name, min.convertTo[Double], max.convertTo[Double])
          case SchemaColumnDataType.Date => SchemaColumnRandomDate(name, min.convertTo[Date], max.convertTo[Date])
          case SchemaColumnDataType.Timestamp => SchemaColumnRandomTimestamp(name, min.convertTo[Timestamp], max.convertTo[Timestamp])
          case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Random}")
        }
      }
    }

    override def write(obj: SchemaColumnRandom[_]): YamlValue = ???
  }
}
Example 12
Source File: MonotonicallyIncreasingID.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.addMutableState(ctx.JAVA_LONG, "count") val partitionMaskTerm = "partitionMask" ctx.addImmutableStateIfNotExists(ctx.JAVA_LONG, partitionMaskTerm) ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
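Both MonotonicallyIncreasingID implementations in this listing produce their LongType value by packing the partition index into the upper bits and a per-partition counter into the lower 33 bits. The plain-Scala sketch below (illustrative only, no Spark required) reproduces that layout to show why the IDs increase within a partition but are not consecutive across partitions.

object MonotonicIdLayoutSketch extends App {
  // Mirrors the expression above: (partitionIndex.toLong << 33) + count
  def id(partitionIndex: Int, count: Long): Long = (partitionIndex.toLong << 33) + count

  println(id(0, 0)) // 0
  println(id(0, 1)) // 1
  println(id(1, 0)) // 8589934592, i.e. 1L << 33 -- a new partition starts far ahead
  println(id(1, 1)) // 8589934593
}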
Example 13
Source File: MySQLDialect.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
Example 14
Source File: ResolveInlineTablesSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.types.{LongType, NullType} class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables.convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables.convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables.convert(table2) assert(converted2.schema.fields(0).nullable) } }
Example 15
Source File: RandomSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)
    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
}
Example 16
Source File: DecimalExpressionSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Decimal, DecimalType, LongType} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
Example 17
Source File: MonotonicallyIncreasingID.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, "") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "") ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
Example 18
Source File: SQLTransformerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.{LongType, StructField, StructType} class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val result = sqlTrans.transform(original) val resultSchema = sqlTrans.transformSchema(original.schema) val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } }
Example 19
Source File: LoadInteractionsInHive.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.util import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} import org.opencypher.morpheus.api.MorpheusSession object LoadInteractionsInHive { val databaseName = "customers" val baseTableName = s"$databaseName.csv_input" def load(show: Boolean = false)(implicit session: MorpheusSession): DataFrame = { val datafile = getClass.getResource("/customer-interactions/csv/customer-interactions.csv").toURI.getPath val structType = StructType(Seq( StructField("interactionId", LongType, nullable = false), StructField("date", StringType, nullable = false), StructField("customerIdx", LongType, nullable = false), StructField("empNo", LongType, nullable = false), StructField("empName", StringType, nullable = false), StructField("type", StringType, nullable = false), StructField("outcomeScore", StringType, nullable = false), StructField("accountHolderId", StringType, nullable = false), StructField("policyAccountNumber", StringType, nullable = false), StructField("customerId", StringType, nullable = false), StructField("customerName", StringType, nullable = false) )) val baseTable: DataFrame = session.sparkSession.read .format("csv") .option("header", "true") .schema(structType) .load(datafile) if (show) baseTable.show() session.sql(s"DROP DATABASE IF EXISTS $databaseName CASCADE") session.sql(s"CREATE DATABASE $databaseName") baseTable.write.saveAsTable(s"$baseTableName") // Create views for nodes createView(baseTableName, "interactions", true, "interactionId", "date", "type", "outcomeScore") createView(baseTableName, "customers", true, "customerIdx", "customerId", "customerName") createView(baseTableName, "account_holders", true, "accountHolderId") createView(baseTableName, "policies", true, "policyAccountNumber") createView(baseTableName, "customer_reps", true, "empNo", "empName") // Create views for relationships createView(baseTableName, "has_customer_reps", false, "interactionId", "empNo") createView(baseTableName, "has_customers", false, "interactionId", "customerIdx") createView(baseTableName, "has_policies", false, "interactionId", "policyAccountNumber") createView(baseTableName, "has_account_holders", false, "interactionId", "accountHolderId") baseTable } def createView(fromTable: String, viewName: String, distinct: Boolean, columns: String*) (implicit session: MorpheusSession): Unit = { val distinctString = if (distinct) "DISTINCT" else "" session.sql( s""" |CREATE VIEW $databaseName.${viewName}_SEED AS | SELECT $distinctString ${columns.mkString(", ")} | FROM $fromTable | WHERE date < '2017-01-01' """.stripMargin) session.sql( s""" |CREATE VIEW $databaseName.${viewName}_DELTA AS | SELECT $distinctString ${columns.mkString(", ")} | FROM $fromTable | WHERE date >= '2017-01-01' """.stripMargin) } }
Example 20
Source File: YelpHelpers.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.integration.yelp import org.apache.spark.sql.types.{ArrayType, DateType, IntegerType, LongType} import org.apache.spark.sql.{Column, DataFrame, SparkSession, functions} import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey} import org.opencypher.morpheus.impl.table.SparkTable._ import org.opencypher.morpheus.integration.yelp.YelpConstants._ object YelpHelpers { case class YelpTables( userDf: DataFrame, businessDf: DataFrame, reviewDf: DataFrame ) def loadYelpTables(inputPath: String)(implicit spark: SparkSession): YelpTables = { import spark.implicits._ log("read business.json", 2) val rawBusinessDf = spark.read.json(s"$inputPath/business.json") log("read review.json", 2) val rawReviewDf = spark.read.json(s"$inputPath/review.json") log("read user.json", 2) val rawUserDf = spark.read.json(s"$inputPath/user.json") val businessDf = rawBusinessDf.select($"business_id".as(sourceIdKey), $"business_id", $"name", $"address", $"city", $"state") val reviewDf = rawReviewDf.select($"review_id".as(sourceIdKey), $"user_id".as(sourceStartNodeKey), $"business_id".as(sourceEndNodeKey), $"stars", $"date".cast(DateType)) val userDf = rawUserDf.select( $"user_id".as(sourceIdKey), $"name", $"yelping_since".cast(DateType), functions.split($"elite", ",").cast(ArrayType(LongType)).as("elite")) YelpTables(userDf, businessDf, reviewDf) } def printYelpStats(inputPath: String)(implicit spark: SparkSession): Unit = { val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") import spark.implicits._ rawBusinessDf.select($"city", $"state").distinct().show() rawBusinessDf.withColumnRenamed("business_id", "id") .join(rawReviewDf, $"id" === $"business_id") .groupBy($"city", $"state") .count().as("count") .orderBy($"count".desc, $"state".asc) .show(100) } def extractYelpCitySubset(inputPath: String, outputPath: String, city: String)(implicit spark: SparkSession): Unit = { import spark.implicits._ def emailColumn(userId: String): Column = functions.concat($"$userId", functions.lit("@yelp.com")) val rawUserDf = spark.read.json(s"$inputPath/user.json") val rawReviewDf = spark.read.json(s"$inputPath/review.json") val rawBusinessDf = spark.read.json(s"$inputPath/business.json") val businessDf = rawBusinessDf.filter($"city" === city) val reviewDf = rawReviewDf .join(businessDf, Seq("business_id"), "left_semi") .withColumn("user_email", emailColumn("user_id")) .withColumnRenamed("stars", "stars_tmp") .withColumn("stars", $"stars_tmp".cast(IntegerType)) .drop("stars_tmp") val userDf = rawUserDf .join(reviewDf, Seq("user_id"), "left_semi") .withColumn("email", emailColumn("user_id")) val friendDf = userDf .select($"email".as("user1_email"), functions.explode(functions.split($"friends", ", ")).as("user2_id")) .withColumn("user2_email", emailColumn("user2_id")) .select(s"user1_email", s"user2_email") businessDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/business.json") reviewDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/review.json") userDf.write.json(s"$outputPath/$cityGraphName/$yelpDB/user.json") friendDf.write.json(s"$outputPath/$cityGraphName/$yelpBookDB/friend.json") } implicit class DataFrameOps(df: DataFrame) { def prependIdColumn(idColumn: String, prefix: String): DataFrame = df.transformColumns(idColumn)(column => functions.concat(functions.lit(prefix), column).as(idColumn)) } }
Example 21
Source File: EncodeLong.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression} import org.apache.spark.sql.types.{BinaryType, DataType, LongType} import org.opencypher.morpheus.api.value.MorpheusElement._ case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes { override val dataType: DataType = BinaryType override val inputTypes: Seq[LongType] = Seq(LongType) override protected def nullSafeEval(input: Any): Any = EncodeLong.encodeLong(input.asInstanceOf[Long]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))") } object EncodeLong { private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2) private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2) private final val otherBitsMask = ~varLength7BitMask private final val maxBytesForLongVarEncoding = 10 // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def encodeLong(l: Long): Array[Byte] = { val tempResult = new Array[Byte](maxBytesForLongVarEncoding) var remainder = l var index = 0 while ((remainder & otherBitsMask) != 0) { tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte remainder >>>= 7 index += 1 } tempResult(index) = remainder.toByte val result = new Array[Byte](index + 1) System.arraycopy(tempResult, 0, result, 0, index + 1) result } // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def decodeLong(input: Array[Byte]): Long = { assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input") var index = 0 var currentByte = input(index) var decoded = currentByte & varLength7BitMask var nextLeftShift = 7 while ((currentByte & moreBytesBitMask) != 0) { index += 1 currentByte = input(index) decoded |= (currentByte & varLength7BitMask) << nextLeftShift nextLeftShift += 7 } assert(index == input.length - 1, s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.") decoded } implicit class ColumnLongOps(val c: Column) extends AnyVal { def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name) def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr)) } }
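Since encodeLong and decodeLong above are inverses, a quick round-trip check is a convenient way to exercise them. A minimal sketch, assuming the morpheus EncodeLong object shown above is on the classpath:

import org.opencypher.morpheus.impl.expressions.EncodeLong

object EncodeLongRoundTrip extends App {
  // Round-trip a few representative values through the Base 128 varint encoding above.
  val samples = Seq(0L, 1L, 127L, 128L, 1234567890L, Long.MaxValue, -1L)
  samples.foreach { v =>
    val bytes = EncodeLong.encodeLong(v)
    assert(EncodeLong.decodeLong(bytes) == v, s"round trip failed for $v")
  }
  println("all round trips succeeded")
}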
Example 22
Source File: TemporalUdafs.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.temporal

import org.apache.logging.log4j.scala.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{CalendarIntervalType, DataType, LongType, StructField, StructType}
import org.apache.spark.unsafe.types.CalendarInterval
import org.opencypher.okapi.impl.temporal.TemporalConstants
import org.opencypher.morpheus.impl.temporal.TemporalConversions._

object TemporalUdafs extends Logging {

  abstract class SimpleDurationAggregation(aggrName: String) extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField(aggrName, CalendarIntervalType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
    }
    override def evaluate(buffer: Row): Any = buffer.getAs[CalendarInterval](0)
  }

  class DurationSum extends SimpleDurationAggregation("sum") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
    }
  }

  class DurationMax extends SimpleDurationAggregation("max") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMaxInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (currMaxInterval.toDuration.compare(inputInterval.toDuration) >= 0) currMaxInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval1.toDuration.compare(interval2.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationMin extends SimpleDurationAggregation("min") {
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(Integer.MAX_VALUE, Long.MaxValue)
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMinInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (inputInterval.toDuration.compare(currMinInterval.toDuration) >= 0) currMinInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval2.toDuration.compare(interval1.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationAvg extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField("sum", CalendarIntervalType), StructField("cnt", LongType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
      buffer(1) = 0L
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
      buffer(1) = buffer.getLong(1) + 1
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
      buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
    }
    override def evaluate(buffer: Row): Any = {
      val sumInterval = buffer.getAs[CalendarInterval](0)
      val cnt = buffer.getLong(1)
      new CalendarInterval((sumInterval.months / cnt).toInt, sumInterval.microseconds / cnt)
    }
  }

  val durationSum = new DurationSum()
  val durationAvg = new DurationAvg()
  val durationMin = new DurationMin()
  val durationMax = new DurationMax()
}
Example 23
Source File: MonotonicallyIncreasingID.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
Example 24
Source File: SparkSequenceGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.example import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.LongType import org.apache.spark.sql.streaming.OutputMode import pipelines.streamlets._ import pipelines.streamlets.StreamletShape import pipelines.streamlets.avro._ import pipelines.spark.{ SparkStreamletLogic, SparkStreamlet } import pipelines.spark.sql.SQLImplicits._ class SparkSequenceGeneratorIngress extends SparkStreamlet { val out = AvroOutlet[Data]("out", d ⇒ d.key.toString) val shape = StreamletShape(out) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) override def buildStreamingQueries = { writeStream(process, out, OutputMode.Append).toQueryExecution } private def process: Dataset[Data] = { session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .withColumn("key", ($"value" / SequenceSettings.GroupSize).cast(LongType)) .as[Data] } } }
Example 25
Source File: CallRecordGeneratorIngress.scala From pipelines-examples with Apache License 2.0 | 5 votes |
package pipelines.examples.carly.aggregator import java.sql.Timestamp import scala.util.Random import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, SparkSession } import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.LongType import pipelines.streamlets._ import pipelines.streamlets.avro._ import pipelines.spark.sql.SQLImplicits._ import pipelines.examples.carly.data.CallRecord import pipelines.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.log4j.{ Level, Logger } case class Rate(timestamp: Timestamp, value: Long) class CallRecordGeneratorIngress extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val RecordsPerSecond = IntegerConfigParameter( "records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) val out = AvroOutlet[CallRecord]("out", _.user) val shape = StreamletShape(out) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = context.streamletConfig.getInt(RecordsPerSecond.key) override def buildStreamingQueries = { val outStream = DataGenerator.mkData(super.session, recordsPerSecond) writeStream(outStream, out, OutputMode.Append).toQueryExecution } } } object DataGenerator { def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = { // do we need to expose this through configuration? val MaxTime = 2.hours.toMillis val MaxUsers = 100000 val TS0 = new java.sql.Timestamp(0) val ZeroTimestampProb = 0.05 // error rate // Random Data Generator val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers)) val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing") // Time-biased randomized filter - 1/2 hour cycles val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI) val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng)) val zeroTimestampUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ { if (rng < ZeroTimestampProb) { TS0 } else { ts } }) val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand()) val sampledData = randomDataset.where(timeFilterUdf($"timestamp", $"rng")) .withColumn("user", usersUdf()) .withColumn("other", usersUdf()) .withColumn("direction", directionUdf()) .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType)) .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng")) .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp" as "timestamp") .as[CallRecord] sampledData } }
Example 26
Source File: SchemaWithInfo.scala From flamy with Apache License 2.0 | 5 votes |
package com.flaminem.flamy.model.metadata import com.flaminem.flamy.conf.FlamyContext import com.flaminem.flamy.model.IOFormat import com.flaminem.flamy.model.metadata.TableWithInfo.getSparkSchema import com.flaminem.flamy.model.names.{SchemaName, TableName} import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} class SchemaWithInfo( override val creationTime: Option[Long], override val location: String, val name: SchemaName, val numTables: Option[Int], fileSize: Option[Long], fileCount: Option[Long], modificationTime: Option[Long] ) extends ItemWithInfo { def formattedNumTables: String = { numTables.map{_.toString}.getOrElse("") } override def getFormattedInfo(context: FlamyContext, humanReadable: Boolean): Seq[String] = { Seq( name.toString, formattedNumTables, formattedFileSize(context, humanReadable), formattedFileCount(context), formattedModificationTime(context) ) } override def getFileSize: Option[Long] = { fileSize } override def getFileCount: Option[Long] = { fileCount } override def getModificationTime(context: FlamyContext, refresh: Boolean = false): Option[Long] = { modificationTime } override def toString: String = { name.toString } } object SchemaWithInfo { val getSparkSchema: StructType = { StructType(Seq( StructField("schema", StringType), StructField("num_tables", LongType), StructField("size", LongType), StructField("num_files", LongType), StructField("modification_time", LongType) )) } def getInfoHeader: Seq[String] = { getSparkSchema.fields.map{_.name} } }
Example 27
Source File: SQLTransformerSuite.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.types.{LongType, StructField, StructType} import org.apache.spark.storage.StorageLevel class SQLTransformerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") val resultSchema = sqlTrans.transformSchema(original.schema) testTransformerByGlobalCheckFunc[(Int, Double, Double)]( original, sqlTrans, "id", "v1", "v2", "v3", "v4") { rows => assert(rows.head.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(rows == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } ignore("SPARK-22538: SQLTransformer should not unpersist given dataset") { val df = spark.range(10).toDF() df.cache() df.count() assert(df.storageLevel != StorageLevel.NONE) val sqlTrans = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") testTransformerByGlobalCheckFunc[Long](df, sqlTrans, "id1") { _ => } assert(df.storageLevel != StorageLevel.NONE) } }
Example 28
Source File: KCore.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.graph.kcore import com.tencent.angel.sona.context.PSContext import org.apache.spark.SparkContext import com.tencent.angel.sona.graph.params._ import com.tencent.angel.sona.ml.Transformer import com.tencent.angel.sona.ml.param.ParamMap import com.tencent.angel.sona.ml.util.Identifiable import org.apache.spark.sql.types.{IntegerType, LongType, StructField, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.storage.StorageLevel class KCore(override val uid: String) extends Transformer with HasSrcNodeIdCol with HasDstNodeIdCol with HasOutputNodeIdCol with HasOutputCoreIdCol with HasStorageLevel with HasPartitionNum with HasPSPartitionNum with HasUseBalancePartition { def this() = this(Identifiable.randomUID("KCore")) override def transform(dataset: Dataset[_]): DataFrame = { val edges = dataset.select($(srcNodeIdCol), $(dstNodeIdCol)).rdd .map(row => (row.getLong(0), row.getLong(1))) .filter(e => e._1 != e._2) edges.persist(StorageLevel.DISK_ONLY) val maxId = edges.map(e => math.max(e._1, e._2)).max() + 1 val minId = edges.map(e => math.min(e._1, e._2)).min() val nodes = edges.flatMap(e => Iterator(e._1, e._2)) val numEdges = edges.count() println(s"minId=$minId maxId=$maxId numEdges=$numEdges level=${$(storageLevel)}") // Start PS and init the model println("start to run ps") PSContext.getOrCreate(SparkContext.getOrCreate()) val model = KCorePSModel.fromMinMax(minId, maxId, nodes, $(psPartitionNum), $(useBalancePartition)) var graph = edges.flatMap(e => Iterator((e._1, e._2), (e._2, e._1))) .groupByKey($(partitionNum)) .mapPartitionsWithIndex((index, edgeIter) => Iterator(KCoreGraphPartition.apply(index, edgeIter))) graph.persist($(storageLevel)) graph.foreachPartition(_ => Unit) graph.foreach(_.initMsgs(model)) var curIteration = 0 var numMsgs = model.numMsgs() var prev = graph println(s"numMsgs=$numMsgs") do { curIteration += 1 graph = prev.map(_.process(model, numMsgs, curIteration == 1)) graph.persist($(storageLevel)) graph.count() prev.unpersist(true) prev = graph model.resetMsgs() numMsgs = model.numMsgs() println(s"curIteration=$curIteration numMsgs=$numMsgs") } while (numMsgs > 0) val retRDD = graph.map(_.save()).flatMap{case (nodes,cores) => nodes.zip(cores)} .map(r => Row.fromSeq(Seq[Any](r._1, r._2))) dataset.sparkSession.createDataFrame(retRDD, transformSchema(dataset.schema)) } override def transformSchema(schema: StructType): StructType = { StructType(Seq( StructField(s"${$(outputNodeIdCol)}", LongType, nullable = false), StructField(s"${$(outputCoreIdCol)}", IntegerType, nullable = false) )) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
Example 29
Source File: CallRecordGeneratorIngress.scala From cloudflow with Apache License 2.0 | 5 votes |
package carly.aggregator import java.sql.Timestamp import scala.util.Random import scala.concurrent.duration._ import org.apache.spark.sql.{ Dataset, SparkSession } import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.LongType import cloudflow.streamlets._ import cloudflow.streamlets.avro._ import cloudflow.spark.sql.SQLImplicits._ import carly.data.CallRecord import cloudflow.spark.{ SparkStreamlet, SparkStreamletLogic } import org.apache.log4j.{ Level, Logger } case class Rate(timestamp: Timestamp, value: Long) class CallRecordGeneratorIngress extends SparkStreamlet { val rootLogger = Logger.getRootLogger() rootLogger.setLevel(Level.ERROR) val RecordsPerSecond = IntegerConfigParameter("records-per-second", "Records per second to process.", Some(50)) override def configParameters = Vector(RecordsPerSecond) val out = AvroOutlet[CallRecord]("out", _.user) val shape = StreamletShape(out) override def createLogic() = new SparkStreamletLogic { val recordsPerSecond = RecordsPerSecond.value override def buildStreamingQueries = { val outStream = DataGenerator.mkData(super.session, recordsPerSecond) writeStream(outStream, out, OutputMode.Append).toQueryExecution } } } object DataGenerator { def mkData(session: SparkSession, recordsPerSecond: Int): Dataset[CallRecord] = { // do we need to expose this through configuration? val MaxTime = 2.hours.toMillis val MaxUsers = 100000 val TS0 = new java.sql.Timestamp(0) val ZeroTimestampProb = 0.05 // error rate // Random Data Generator val usersUdf = udf(() ⇒ "user-" + Random.nextInt(MaxUsers)) val directionUdf = udf(() ⇒ if (Random.nextDouble() < 0.5) "incoming" else "outgoing") // Time-biased randomized filter - 1/2 hour cycles val sinTime: Long ⇒ Double = t ⇒ Math.sin((t / 1000 % 1800) * 1.0 / 1800 * Math.PI) val timeBoundFilter: Long ⇒ Double ⇒ Boolean = t ⇒ prob ⇒ (sinTime(t) + 0.5) > prob val timeFilterUdf = udf((ts: java.sql.Timestamp, rng: Double) ⇒ timeBoundFilter(ts.getTime)(rng)) val zeroTimestampUdf = udf { (ts: java.sql.Timestamp, rng: Double) ⇒ if (rng < ZeroTimestampProb) { TS0 } else { ts } } val rateStream = session.readStream .format("rate") .option("rowsPerSecond", recordsPerSecond) .load() .as[Rate] val randomDataset = rateStream.withColumn("rng", rand()).withColumn("tsRng", rand()) val sampledData = randomDataset .where(timeFilterUdf($"timestamp", $"rng")) .withColumn("user", usersUdf()) .withColumn("other", usersUdf()) .withColumn("direction", directionUdf()) .withColumn("duration", (round(abs(rand()) * MaxTime)).cast(LongType)) .withColumn("updatedTimestamp", zeroTimestampUdf($"timestamp", $"tsRng")) .select($"user", $"other", $"direction", $"duration", $"updatedTimestamp".as("timestamp")) .as[CallRecord] sampledData } }
Example 30
Source File: CassandraSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package cassandra.StreamSinkProvider

import cassandra.{CassandraDriver, CassandraKafkaMetadata}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.functions.max
import spark.SparkHelper
import com.datastax.spark.connector._
import kafka.KafkaMetadata
import log.LazyLogger
import org.apache.spark.sql.types.LongType
import radio.SimpleSongAggregation

// Excerpt: this method is a member of the project's Sink implementation. The enclosing class
// declaration, its other members, and the implicits that provide the $-interpolator and the
// KafkaMetadata encoder are omitted here; the trailing brace closes that class.
  private def saveKafkaMetaData(df: DataFrame) = {
    val kafkaMetadata = df
      .groupBy($"partition")
      .agg(max($"offset").cast(LongType).as("offset"))
      .as[KafkaMetadata]

    log.warn("Saving Kafka Metadata (partition and offset per topic (only one in our example))")
    kafkaMetadata.show()

    kafkaMetadata.rdd.saveToCassandra(CassandraDriver.namespace,
      CassandraDriver.kafkaMetadata,
      SomeColumns("partition", "offset")
    )

    // Other way to save the offset inside Cassandra:
    // kafkaMetadata.collect().foreach(CassandraKafkaMetadata.save)
  }
}
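The core of saveKafkaMetaData is an ordinary Spark aggregation: the highest offset per Kafka partition, cast to LongType. A minimal, self-contained sketch of just that aggregation (without the Cassandra connector or the project's KafkaMetadata case class; the sample rows are assumptions) might look like this:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.max
import org.apache.spark.sql.types.LongType

object MaxOffsetSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("max-offset").getOrCreate()
    import spark.implicits._

    // Hypothetical progress records: (partition, offset).
    val progress = Seq((0, 41L), (0, 42L), (1, 7L), (1, 9L)).toDF("partition", "offset")

    // Same shape as the aggregation above: latest offset per partition as a long.
    val latest = progress
      .groupBy($"partition")
      .agg(max($"offset").cast(LongType).as("offset"))

    latest.show()
    spark.stop()
  }
}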
Example 31
Source File: GroupedIteratorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{LongType, StringType, IntegerType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema) val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema) val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema) val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 32
Source File: MySQLDialect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, LongType, DataType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } }
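Dialects like the one above are looked up through JdbcDialects. As a hedged sketch of the same mechanism, a user-defined dialect that widens an unsigned 32-bit integer column to LongType could be registered the same way; the dialect name and the exact typeName check are illustrative assumptions, not part of the example above.

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

// Illustrative dialect: map unsigned 32-bit integers to LongType so they cannot overflow Int.
case object UnsignedIntAsLongDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.INTEGER && typeName.toUpperCase.contains("UNSIGNED")) {
      Some(LongType)
    } else {
      None
    }
  }
}

object DialectRegistration {
  def main(args: Array[String]): Unit = {
    // Registered dialects take precedence over the built-in ones for matching JDBC URLs.
    JdbcDialects.registerDialect(UnsignedIntAsLongDialect)
  }
}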
Example 33
Source File: DecimalExpressionSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{LongType, DecimalType, Decimal} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
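The UnscaledValue/MakeDecimal pair exercised above round-trips between a Decimal and its unscaled long representation. A small sketch using the public Decimal type directly shows the same relationship without going through Catalyst expressions; the values are the ones from the test.

import org.apache.spark.sql.types.Decimal

object UnscaledValueSketch {
  def main(args: Array[String]): Unit = {
    // 10.1 with scale 1 is stored as the unscaled long 101.
    val d = Decimal("10.1")
    assert(d.toUnscaledLong == 101L)

    // Rebuilding the decimal from (unscaled, precision, scale) mirrors MakeDecimal(101L, 3, 1).
    val rebuilt = Decimal(101L, 3, 1)
    assert(rebuilt == d)
    println(s"unscaled=${d.toUnscaledLong}, rebuilt=$rebuilt")
  }
}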
Example 34
Source File: EdgeListDataSource.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.api.io.edgelist import org.apache.spark.sql.functions import org.apache.spark.sql.types.{LongType, StructField, StructType} import org.opencypher.morpheus.api.MorpheusSession import org.opencypher.morpheus.api.io.GraphElement.sourceIdKey import org.opencypher.morpheus.api.io.Relationship.{sourceEndNodeKey, sourceStartNodeKey} import org.opencypher.morpheus.api.io.edgelist.EdgeListDataSource._ import org.opencypher.morpheus.api.io.{MorpheusNodeTable, MorpheusRelationshipTable} import org.opencypher.morpheus.schema.MorpheusSchema import org.opencypher.okapi.api.graph.{GraphName, PropertyGraph} import org.opencypher.okapi.api.io.PropertyGraphDataSource import org.opencypher.okapi.api.schema.{PropertyGraphSchema, PropertyKeys} import org.opencypher.okapi.impl.exception.UnsupportedOperationException object EdgeListDataSource { val NODE_LABEL = "V" val REL_TYPE = "E" val GRAPH_NAME = GraphName("graph") val SCHEMA: PropertyGraphSchema = MorpheusSchema.empty .withNodePropertyKeys(Set(NODE_LABEL), PropertyKeys.empty) .withRelationshipPropertyKeys(REL_TYPE, PropertyKeys.empty) } case class EdgeListDataSource(path: String, options: Map[String, String] = Map.empty)(implicit morpheus: MorpheusSession) extends PropertyGraphDataSource { override def hasGraph(name: GraphName): Boolean = name == GRAPH_NAME override def graph(name: GraphName): PropertyGraph = { val reader = options.foldLeft(morpheus.sparkSession.read) { case (current, (key, value)) => current.option(key, value) } val rawRels = reader .schema(StructType(Seq( StructField(sourceStartNodeKey, LongType), StructField(sourceEndNodeKey, LongType)))) .csv(path) .withColumn(sourceIdKey, functions.monotonically_increasing_id()) .select(sourceIdKey, sourceStartNodeKey, sourceEndNodeKey) val rawNodes = rawRels .select(rawRels.col(sourceStartNodeKey).as(sourceIdKey)) .union(rawRels.select(rawRels.col(sourceEndNodeKey).as(sourceIdKey))) .distinct() morpheus.graphs.create(MorpheusNodeTable(Set(NODE_LABEL), rawNodes), MorpheusRelationshipTable(REL_TYPE, rawRels)) } override def schema(name: GraphName): Option[PropertyGraphSchema] = Some(SCHEMA) override def store(name: GraphName, graph: PropertyGraph): Unit = throw UnsupportedOperationException("Storing an edge list is not supported") override def delete(name: GraphName): Unit = throw UnsupportedOperationException("Deleting an edge list is not supported") override val graphNames: Set[GraphName] = Set(GRAPH_NAME) }
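The data source above reads an edge list with two LongType columns (start node, end node) and derives node ids from the distinct union of both endpoints. A hedged, standalone sketch of just that reading step, outside morpheus, could look like the following; the file path and the space separator are assumptions for illustration.

import org.apache.spark.sql.{SparkSession, functions}
import org.apache.spark.sql.types.{LongType, StructField, StructType}

object EdgeListReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("edge-list").getOrCreate()

    // Two LongType columns, mirroring sourceStartNodeKey / sourceEndNodeKey above.
    val edgeSchema = StructType(Seq(
      StructField("start", LongType),
      StructField("end", LongType)))

    val rawRels = spark.read
      .schema(edgeSchema)
      .option("delimiter", " ")            // assumed: whitespace-separated edge list
      .csv("/tmp/edges.txt")               // assumed path
      .withColumn("id", functions.monotonically_increasing_id())

    // Node ids are the distinct union of both endpoint columns, as in the data source above.
    val rawNodes = rawRels.select(rawRels.col("start").as("id"))
      .union(rawRels.select(rawRels.col("end").as("id")))
      .distinct()

    println(s"relationships: ${rawRels.count()}, nodes: ${rawNodes.count()}")
    spark.stop()
  }
}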
Example 35
Source File: HashSetManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage import edu.ucla.cs.wis.bigdatalog.spark.SchemaInfo import edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset._ import org.apache.spark.TaskContext import org.apache.spark.sql.types.{IntegerType, LongType} object HashSetManager { def determineKeyType(schemaInfo: SchemaInfo): Int = { schemaInfo.arity match { case 1 => { schemaInfo.schema(0).dataType match { case IntegerType => 1 case LongType => 2 case other => 3 } } case 2 => { val bytesPerKey = schemaInfo.schema.map(_.dataType.defaultSize).sum if (bytesPerKey == 8) 2 else 3 } case other => 3 } } def create(schemaInfo: SchemaInfo): HashSet = { determineKeyType(schemaInfo) match { case 1 => new IntKeysHashSet() case 2 => new LongKeysHashSet(schemaInfo) case _ => new ObjectHashSet() } } }
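The arity-2 branch above picks the long-keyed hash set only when the default byte widths of the two key types sum to 8 (for example two IntegerType keys). As a quick, hedged illustration of where those numbers come from, defaultSize on the Spark type objects can be inspected directly:

import org.apache.spark.sql.types.{IntegerType, LongType, ShortType, StringType}

object KeyWidthSketch {
  def main(args: Array[String]): Unit = {
    // defaultSize is the estimated byte width Spark assigns to each data type.
    println(IntegerType.defaultSize)                               // 4
    println(LongType.defaultSize)                                  // 8
    println(Seq(IntegerType, IntegerType).map(_.defaultSize).sum)  // 8  -> LongKeysHashSet branch
    println(Seq(IntegerType, StringType).map(_.defaultSize).sum)   // > 8 -> ObjectHashSet branch
    println(Seq(ShortType, ShortType).map(_.defaultSize).sum)      // 4  -> ObjectHashSet branch
  }
}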
Example 36
Source File: GroupBitwiseOr.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseOr } case class GroupBitwiseOr(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = LongType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_or") private lazy val group_bitwise_or = AttributeReference("group_bitwise_or", LongType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_or :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(0, LongType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( BitwiseOr(group_bitwise_or, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( BitwiseOr(group_bitwise_or.left, group_bitwise_or.right) ) } override lazy val evaluateExpression: AttributeReference = group_bitwise_or }
Example 37
Source File: GroupBitwiseAnd.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseAnd } case class GroupBitwiseAnd(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = LongType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_and") private lazy val group_bitwise_and = AttributeReference("group_bitwise_and", LongType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_and :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(0xffffffffffffffffl, LongType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( BitwiseAnd(group_bitwise_and, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( BitwiseAnd(group_bitwise_and.left, group_bitwise_and.right) ) } override lazy val evaluateExpression: AttributeReference = group_bitwise_and }
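Both GroupBitwiseOr and GroupBitwiseAnd are DeclarativeAggregates over LongType, so they can be exposed to the DataFrame API by wrapping the aggregate expression in a Column. A hedged usage sketch (assuming the two case classes above are on the classpath; the flag data is hypothetical) might be:

import mimir.exec.spark.udf.{GroupBitwiseAnd, GroupBitwiseOr}
import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col

object BitwiseAggSketch {
  // Wrap the Catalyst aggregate functions so they can be used in .agg(...).
  def groupBitwiseAnd(c: Column): Column =
    new Column(GroupBitwiseAnd(c.expr).toAggregateExpression())

  def groupBitwiseOr(c: Column): Column =
    new Column(GroupBitwiseOr(c.expr).toAggregateExpression())

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("bitwise-agg").getOrCreate()
    import spark.implicits._

    // Hypothetical bitmask column: each row carries a set of flags encoded in a long.
    val flags = Seq(("a", 0x3L), ("a", 0x6L), ("b", 0x1L)).toDF("key", "mask")

    flags.groupBy($"key")
      .agg(groupBitwiseAnd(col("mask")).as("common_flags"),
           groupBitwiseOr(col("mask")).as("any_flags"))
      .show()

    spark.stop()
  }
}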
Example 38
Source File: MimirSparkRuntimeUtils.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{ spark_partition_id, monotonically_increasing_id, count, sum, first, lit, col } object MimirSparkRuntimeUtils { def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType:DataType = LongType): DataFrame = { val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id()) val partitionOffsets = dfWithPartitionId .groupBy("partition_id") .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id") .orderBy("partition_id") .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt" ) .collect() .map(row => (row.getInt(0), row.getLong(1))) .toMap val theUdf = org.apache.spark.sql.functions.udf( (partitionId: Int) => partitionOffsets(partitionId), LongType ) dfWithPartitionId .withColumn("partition_offset", theUdf(col("partition_id"))) .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType)) .drop("partition_id", "partition_offset", "inc_id") } def writeDataSink(dataframe:DataFrame, format:String, options:Map[String, String], save:Option[String]) = { val dsFormat = dataframe.write.format(format) val dsOptions = options.toSeq.foldLeft(dsFormat)( (ds, opt) => opt._1 match { case "mode" => ds.mode(opt._2) case _ => ds.option(opt._1, opt._2) }) save match { case None => dsOptions.save case Some(outputFile) => { if(format.equals("com.github.potix2.spark.google.spreadsheets")){ val gsldfparts = outputFile.split("\\/") val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}" dsOptions.save(gsldf) } else{ dsOptions.save(outputFile) } } } } }
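zipWithIndex above assigns a contiguous LongType row index by combining per-partition counts with monotonically_increasing_id. A short, hedged usage sketch (assuming MimirSparkRuntimeUtils is on the classpath) is:

import mimir.exec.spark.MimirSparkRuntimeUtils
import org.apache.spark.sql.SparkSession

object ZipWithIndexSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("zip-with-index").getOrCreate()

    // A small frame spread over several partitions, so the per-partition offsets matter.
    val df = spark.range(0, 10, 1, numPartitions = 4).toDF("value")

    // Adds a contiguous ROWIDX column starting at 1, regardless of the partitioning.
    val indexed = MimirSparkRuntimeUtils.zipWithIndex(df)
    indexed.orderBy("ROWIDX").show()

    spark.stop()
  }
}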
Example 39
Source File: GroupedDatasetSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.api.python.PythonEvalType import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier import org.apache.spark.sql.execution.python.PythonUDF import org.apache.spark.sql.functions.udf import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types.{LongType, StructField, StructType} class GroupedDatasetSuite extends QueryTest with SharedSQLContext { import testImplicits._ private val scalaUDF = udf((x: Long) => { x + 1 }) private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s")) private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = { assert(atLevel >= 0) var children = Seq(ds.queryExecution.logical) (1 to atLevel).foreach { _ => children = children.flatMap(_.children) } val barriers = children.collect { case ab: AnalysisBarrier => ab } assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" + ds.queryExecution.logical) } test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") { val groupByDataset = datasetWithUDF.groupBy() val rollupDataset = datasetWithUDF.rollup("s") val cubeDataset = datasetWithUDF.cube("s") val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2)) datasetWithUDF.cache() Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS => val df = rgDS.count() assertContainsAnalysisBarrier(df) assertCached(df) } val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR( Array.emptyByteArray, Array.emptyByteArray, Array.empty, StructType(Seq(StructField("s", LongType)))) val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF( "pyUDF", null, StructType(Seq(StructField("s", LongType))), Seq.empty, PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF, true)) Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df => assertContainsAnalysisBarrier(df, 2) assertCached(df) } datasetWithUDF.unpersist(true) } test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") { val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0)) datasetWithUDF.cache() val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _) val keysKVDataset = kvDasaset.keys val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k)) val aggKVDataset = kvDasaset.count() val otherKVDataset = spark.range(1).groupByKey(_ + 1) val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k)) Seq((mapValuesKVDataset, 1), (keysKVDataset, 2), (flatMapGroupsKVDataset, 2), (aggKVDataset, 1), (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) => assertContainsAnalysisBarrier(df, analysisBarrierDepth) assertCached(df) } datasetWithUDF.unpersist(true) } }
Example 40
Source File: GroupedIteratorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 41
Source File: MySQLDialect.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } override def isCascadingTruncateTable(): Option[Boolean] = Some(false) }
Example 42
Source File: ResolveInlineTablesSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.logical.LocalRelation import org.apache.spark.sql.types.{LongType, NullType, TimestampType} class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables(conf).validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables(conf).validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(conf)(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables(conf).convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("convert TimeZoneAwareExpression") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType)))) val withTimeZone = ResolveTimeZone(conf).apply(table) val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone) val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType) .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long] assert(output.map(_.dataType) == Seq(TimestampType)) assert(data.size == 1) assert(data.head.getLong(0) == correct) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables(conf).convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables(conf).convert(table2) assert(converted2.schema.fields(0).nullable) } }
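The convert tests above exercise type widening for inline tables: mixing an Int literal and a Long literal yields a single LongType column. The same behaviour is visible from plain SQL, which can be an easier way to sanity-check it; this sketch assumes nothing beyond a local SparkSession.

import org.apache.spark.sql.SparkSession

object InlineTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("inline-table").getOrCreate()

    // An inline table mixing an integer and a long literal: the column is widened to bigint.
    val df = spark.sql("SELECT * FROM VALUES (1), (2L) AS t(c1)")
    df.printSchema()   // c1: long (nullable = false)
    df.show()

    spark.stop()
  }
}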
Example 43
Source File: RandomSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.scalatest.Matchers._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{IntegerType, LongType} class RandomSuite extends SparkFunSuite with ExpressionEvalHelper { test("random") { checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001) checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001) checkDoubleEvaluation( new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001) checkDoubleEvaluation( new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001) } test("SPARK-9127 codegen with long seed") { checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001) checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001) } }
Example 44
Source File: DecimalExpressionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Decimal, DecimalType, LongType} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
Example 45
Source File: TestMetadataConstructor.scala From spark-salesforce with Apache License 2.0 | 5 votes |
package com.springml.spark.salesforce.metadata import org.apache.spark.sql.types.{StructType, StringType, IntegerType, LongType, FloatType, DateType, TimestampType, BooleanType, StructField} import org.scalatest.FunSuite import com.springml.spark.salesforce.Utils class TestMetadataConstructor extends FunSuite { test("Test Metadata generation") { val columnNames = List("c1", "c2", "c3", "c4") val columnStruct = columnNames.map(colName => StructField(colName, StringType, true)) val schema = StructType(columnStruct) val schemaString = MetadataConstructor.generateMetaString(schema,"sampleDataSet", Utils.metadataConfig(null)) assert(schemaString.length > 0) assert(schemaString.contains("sampleDataSet")) } test("Test Metadata generation With Custom MetadataConfig") { val columnNames = List("c1", "c2", "c3", "c4") val intField = StructField("intCol", IntegerType, true) val longField = StructField("longCol", LongType, true) val floatField = StructField("floatCol", FloatType, true) val dateField = StructField("dateCol", DateType, true) val timestampField = StructField("timestampCol", TimestampType, true) val stringField = StructField("stringCol", StringType, true) val someTypeField = StructField("someTypeCol", BooleanType, true) val columnStruct = Array[StructField] (intField, longField, floatField, dateField, timestampField, stringField, someTypeField) val schema = StructType(columnStruct) var metadataConfig = Map("string" -> Map("wave_type" -> "Text")) metadataConfig += ("integer" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "0", "defaultValue" -> "100")) metadataConfig += ("float" -> Map("wave_type" -> "Numeric", "precision" -> "10", "scale" -> "2")) metadataConfig += ("long" -> Map("wave_type" -> "Numeric", "precision" -> "18", "scale" -> "0")) metadataConfig += ("date" -> Map("wave_type" -> "Date", "format" -> "yyyy/MM/dd")) metadataConfig += ("timestamp" -> Map("wave_type" -> "Date", "format" -> "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")) val schemaString = MetadataConstructor.generateMetaString(schema, "sampleDataSet", metadataConfig) assert(schemaString.length > 0) assert(schemaString.contains("sampleDataSet")) assert(schemaString.contains("Numeric")) assert(schemaString.contains("precision")) assert(schemaString.contains("scale")) assert(schemaString.contains("18")) assert(schemaString.contains("Text")) assert(schemaString.contains("Date")) assert(schemaString.contains("format")) assert(schemaString.contains("defaultValue")) assert(schemaString.contains("100")) assert(schemaString.contains("yyyy/MM/dd")) assert(schemaString.contains("yyyy-MM-dd'T'HH:mm:ss.SSS'Z'")) } }
Example 46
Source File: MonotonicallyIncreasingID.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType}

// The class declaration was cut off in this excerpt; in upstream Spark 2.4 (which XSQL tracks)
// this body belongs to the following leaf expression:
case class MonotonicallyIncreasingID() extends LeafExpression with Stateful {

  @transient private[this] var count: Long = _

  @transient private[this] var partitionMask: Long = _

  override protected def initializeInternal(partitionIndex: Int): Unit = {
    count = 0L
    partitionMask = partitionIndex.toLong << 33
  }

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override protected def evalInternal(input: InternalRow): Long = {
    val currentCount = count
    count += 1
    partitionMask + currentCount
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val countTerm = ctx.addMutableState(CodeGenerator.JAVA_LONG, "count")
    val partitionMaskTerm = "partitionMask"
    ctx.addImmutableStateIfNotExists(CodeGenerator.JAVA_LONG, partitionMaskTerm)
    ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
    ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")
    ev.copy(code = code"""
      final ${CodeGenerator.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
      $countTerm++;""", isNull = FalseLiteral)
  }

  override def prettyName: String = "monotonically_increasing_id"

  override def sql: String = s"$prettyName()"

  override def freshCopy(): MonotonicallyIncreasingID = MonotonicallyIncreasingID()
}
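The generated value is the partition index shifted into the upper bits (partitionIndex << 33) plus a per-partition counter, which is why the ids are unique and increasing but not consecutive. From the DataFrame API the same expression is exposed as monotonically_increasing_id(); a small sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{monotonically_increasing_id, spark_partition_id}

object MonotonicIdSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("mono-id").getOrCreate()

    // With several partitions, ids jump by the partition mask (partitionIndex << 33)
    // while the low bits count records within each partition.
    spark.range(0, 6, 1, numPartitions = 3)
      .withColumn("partition", spark_partition_id())
      .withColumn("mono_id", monotonically_increasing_id())
      .show(false)

    spark.stop()
  }
}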
Example 47
Source File: inputFileBlock.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.rdd.InputFileBlockHolder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types.{DataType, LongType, StringType} import org.apache.spark.unsafe.types.UTF8String @ExpressionDescription( usage = "_FUNC_() - Returns the name of the file being read, or empty string if not available.") case class InputFileName() extends LeafExpression with Nondeterministic { override def nullable: Boolean = false override def dataType: DataType = StringType override def prettyName: String = "input_file_name" override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): UTF8String = { InputFileBlockHolder.getInputFilePath } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val className = InputFileBlockHolder.getClass.getName.stripSuffix("$") val typeDef = s"final ${CodeGenerator.javaType(dataType)}" ev.copy(code = code"$typeDef ${ev.value} = $className.getInputFilePath();", isNull = FalseLiteral) } } @ExpressionDescription( usage = "_FUNC_() - Returns the start offset of the block being read, or -1 if not available.") case class InputFileBlockStart() extends LeafExpression with Nondeterministic { override def nullable: Boolean = false override def dataType: DataType = LongType override def prettyName: String = "input_file_block_start" override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): Long = { InputFileBlockHolder.getStartOffset } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val className = InputFileBlockHolder.getClass.getName.stripSuffix("$") val typeDef = s"final ${CodeGenerator.javaType(dataType)}" ev.copy(code = code"$typeDef ${ev.value} = $className.getStartOffset();", isNull = FalseLiteral) } } @ExpressionDescription( usage = "_FUNC_() - Returns the length of the block being read, or -1 if not available.") case class InputFileBlockLength() extends LeafExpression with Nondeterministic { override def nullable: Boolean = false override def dataType: DataType = LongType override def prettyName: String = "input_file_block_length" override protected def initializeInternal(partitionIndex: Int): Unit = {} override protected def evalInternal(input: InternalRow): Long = { InputFileBlockHolder.getLength } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val className = InputFileBlockHolder.getClass.getName.stripSuffix("$") val typeDef = s"final ${CodeGenerator.javaType(dataType)}" ev.copy(code = code"$typeDef ${ev.value} = $className.getLength();", isNull = FalseLiteral) } }
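These three expressions are registered as the SQL functions input_file_name(), input_file_block_start() and input_file_block_length(). A hedged usage sketch over a file-based source (the path is an assumption; block start/length are invoked via expr so the sketch does not depend on them being present in the functions object of this Spark version) could be:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{expr, input_file_name}

object InputFileSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("input-file").getOrCreate()

    // For non-file sources these return "" / -1, as described in the usage strings above.
    spark.read.text("/tmp/some-logs")                       // assumed path to a text dataset
      .withColumn("file", input_file_name())
      .withColumn("block_start", expr("input_file_block_start()"))
      .withColumn("block_length", expr("input_file_block_length()"))
      .show(truncate = false)

    spark.stop()
  }
}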
Example 48
Source File: TiSparkTypeSuite.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark.datasource import org.apache.spark.sql.Row import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} class TiSparkTypeSuite extends BaseDataSourceTest("type_test") { private val row1 = Row(null, "Hello") private val row2 = Row(2L, "TiDB") private val row3 = Row(3L, "Spark") private val row5 = Row(Long.MaxValue, "Duplicate") private val schema = StructType(List(StructField("i", LongType), StructField("s", StringType))) test("bigint test") { if (!supportBatchWrite) { cancel } dropTable() jdbcUpdate(s"create table $dbtable(i bigint, s varchar(128))") jdbcUpdate(s"insert into $dbtable values(null, 'Hello'), (2, 'TiDB')") tidbWrite(List(row3, row5), schema) testTiDBSelect(List(row1, row2, row3, row5)) } }
Example 49
Source File: SummarizeIntervalsSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.types.{ DoubleType, LongType, IntegerType } class SummarizeIntervalsSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite { override val defaultResourceDir: String = "/timeseries/summarizeintervals" "SummarizeInterval" should "pass `SummarizeSingleColumn` test." in { withAllTimeType { val volumeTSRdd = fromCSV( "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) volumeTSRdd.toDF.show() val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType)) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume")) summarizedVolumeTSRdd.toDF.show() assert(summarizedVolumeTSRdd.collect().deep == resultTSRdd.collect().deep) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in { withAllTimeType { val volumeTSRdd = fromCSV( "Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV( "SummarizeSingleColumnPerKey.results", Schema("id" -> IntegerType, "volume_sum" -> DoubleType) ) val result2TSRdd = fromCSV( "SummarizeV2PerKey.results", Schema("id" -> IntegerType, "v2_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("volume"), Seq("id")) assertEquals(summarizedVolumeTSRdd, resultTSRdd) val summarizedV2TSRdd = rdd.summarizeIntervals(clockTSRdd, Summarizers.sum("v2"), Seq("id")) assertEquals(summarizedV2TSRdd, result2TSRdd) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in { withAllTimeType { val volumeTSRdd = fromCSV( "VolumeWithIndustryGroup.csv", Schema("id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) ) val clockTSRdd = fromCSV("Clock.csv", Schema()) val resultTSRdd = fromCSV( "SummarizeSingleColumnPerSeqOfKeys.results", Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeIntervals( clockTSRdd, Summarizers.sum("volume"), Seq("id", "group") ) assertEquals(summarizedVolumeTSRdd, resultTSRdd) } withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } }
Example 50
Source File: QuantileSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer.subtractable import com.twosigma.flint.rdd.function.summarize.summarizer.subtractable.SequentialArrayQueue import com.twosigma.flint.timeseries.summarize.SummarizerSuite import com.twosigma.flint.timeseries.{ Clocks, Summarizers, TimeSeriesRDD } import org.apache.commons.math3.stat.descriptive.rank.Percentile import org.apache.spark.sql.Row import org.apache.spark.sql.types.LongType class QuantileSummarizerSpec extends SummarizerSuite { var clockTSRdd: TimeSeriesRDD = _ private lazy val init = { clockTSRdd = Clocks.uniform( sc, frequency = "1d", offset = "0d", beginDateTime = "1970-01-01", endDateTime = "1980-01-01" ) } "SequentialArrayQueue" should "resize up correctly" in { val queue = new SequentialArrayQueue[Double]() (1 to 32).map{ i => queue.add(i.toDouble) } assert(queue.view()._3.length == 32) queue.add(0.0) assert(queue.view()._3.length == 64) } it should "shift down correctly" in { val queue = new SequentialArrayQueue[Double]() (1 to 64).map{ i => queue.add(i.toDouble) } assert(queue.view()._3.length == 64) (1 to 32).map{ _ => queue.remove() } assert(queue.view()._1 == 0) } it should "addAll and preserve order" in { val queue1 = new SequentialArrayQueue[Double]() val queue2 = new SequentialArrayQueue[Double]() // Move the begin index (1 to 5).map{ i => queue1.add(i.toDouble) queue1.remove() } (1 to 3).map{ i => queue1.add(i.toDouble) } (4 to 10).map{ i => queue2.add(i.toDouble) } queue1.addAll(queue2) var index = queue1.view()._1 for (i <- 1 to 10) { assert(queue1.view()._3(index) == i) index += 1 } } "QuantileSummarizer" should "compute `quantile` correctly" in { init val p = (1 to 100).map(_ / 100.0) val results = clockTSRdd.summarize(Summarizers.quantile("time", p)).first() val percentileEstimator = new Percentile().withEstimationType(Percentile.EstimationType.R_7) percentileEstimator.setData(clockTSRdd.collect().map(_.getAs[Long]("time").toDouble)) val expectedResults = p.map { i => percentileEstimator.evaluate(i * 100.0) } (1 to 100).foreach { i => assert(results.getAs[Double](s"time_${i / 100.0}quantile") === expectedResults(i - 1)) } } it should "ignore null values" in { init val input = clockTSRdd.addColumns("v" -> LongType -> { row: Row => row.getAs[Long]("time") }) assertEquals( input.summarize(Summarizers.quantile("v", Seq(0.25, 0.5, 0.75, 0.9, 0.95))), insertNullRows(input, "v").summarize(Summarizers.quantile("v", Seq(0.25, 0.5, 0.75, 0.9, 0.95))) ) } it should "pass summarizer property test" in { summarizerPropertyTest(AllPropertiesAndSubtractable)(Summarizers.quantile("x1", Seq(0.25, 0.5, 0.75, 0.9, 0.95))) } }
Example 51
Source File: ExtremeSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite } import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite } import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType } import java.util.Random import org.apache.spark.sql.Row class ExtremeSummarizerSpec extends SummarizerSuite { override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer" private def test[T]( dataType: DataType, randValue: Row => Any, summarizer: String => SummarizerFactory, reduceFn: (T, T) => T, inputColumn: String, outputColumn: String ): Unit = { val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns( inputColumn -> dataType -> randValue ) val data = priceTSRdd.collect().map{ row => row.getAs[T](inputColumn) } val trueExtreme = data.reduceLeft[T]{ case (x, y) => reduceFn(x, y) } val result = priceTSRdd.summarize(summarizer(inputColumn)) val extreme = result.first().getAs[T](outputColumn) val outputType = result.schema(outputColumn).dataType assert(outputType == dataType, s"$outputType") assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}") } "MaxSummarizer" should "compute double max correctly" in { val rand = new Random() test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max") } it should "compute long max correctly" in { val rand = new Random() test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max") } it should "compute float max correctly" in { val rand = new Random() test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max") } it should "compute int max correctly" in { val rand = new Random() test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max") } "MinSummarizer" should "compute double min correctly" in { val rand = new Random() test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min") } it should "compute long min correctly" in { val rand = new Random() test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min") } it should "compute float min correctly" in { val rand = new Random() test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min") } it should "compute int min correctly" in { val rand = new Random() test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min") } it should "pass summarizer property test" in { summarizerPropertyTest(AllProperties)(Summarizers.max("x1")) summarizerPropertyTest(AllProperties)(Summarizers.min("x2")) } it should "ignore null values" in { val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)) val inputWithNull = insertNullRows(input, "price") assertEquals( input.summarize(Summarizers.min("price")), inputWithNull.summarize(Summarizers.min("price")) ) } }
Example 52
Source File: SummarizeCyclesSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.types.{ DoubleType, IntegerType, LongType } class SummarizeCyclesSpec extends MultiPartitionSuite with TimeSeriesTestData with TimeTypeSuite { override val defaultResourceDir: String = "/timeseries/summarizecycles" private val volumeSchema = Schema("id" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType) private val volume2Schema = Schema("id" -> IntegerType, "volume" -> LongType) private val volumeWithGroupSchema = Schema( "id" -> IntegerType, "group" -> IntegerType, "volume" -> LongType, "v2" -> DoubleType ) "SummarizeCycles" should "pass `SummarizeSingleColumn` test." in { withAllTimeType { val resultTSRdd = fromCSV("SummarizeSingleColumn.results", Schema("volume_sum" -> DoubleType)) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume")) assertEquals(summarizedVolumeTSRdd, resultTSRdd) } val volumeTSRdd = fromCSV("Volume.csv", volumeSchema) withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerKey` test, i.e. with additional a single key." in { withAllTimeType { val resultTSRdd = fromCSV( "SummarizeSingleColumnPerKey.results", Schema("id" -> IntegerType, "volume_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id")) assertEquals(summarizedVolumeTSRdd, resultTSRdd) } val volumeTSRdd = fromCSV("Volume2.csv", volume2Schema) withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass `SummarizeSingleColumnPerSeqOfKeys` test, i.e. with additional a sequence of keys." in { withAllTimeType { val resultTSRdd = fromCSV( "SummarizeSingleColumnPerSeqOfKeys.results", Schema("id" -> IntegerType, "group" -> IntegerType, "volume_sum" -> DoubleType) ) def test(rdd: TimeSeriesRDD): Unit = { val summarizedVolumeTSRdd = rdd.summarizeCycles(Summarizers.sum("volume"), Seq("id", "group")) assertEquals(summarizedVolumeTSRdd, resultTSRdd) } val volumeTSRdd = fromCSV("VolumeWithIndustryGroup.csv", volumeWithGroupSchema) withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } it should "pass generated cycle data test" in { // TODO: The way cycleData works now doesn't support changing time type. val testData = cycleData1 def sum(rdd: TimeSeriesRDD): TimeSeriesRDD = { rdd.summarizeCycles(Summarizers.compose(Summarizers.count(), Summarizers.sum("v1"))) } withPartitionStrategyCompare(testData)(DEFAULT)(sum) } }
Example 53
Source File: SummarizeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries import com.twosigma.flint.timeseries.row.Schema import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{ LongType, IntegerType, DoubleType } class SummarizeSpec extends MultiPartitionSuite { override val defaultResourceDir: String = "/timeseries/summarize" it should "`summarize` correctly" in { val expectedSchema = Schema("volume_sum" -> DoubleType) val expectedResults = Array[Row](new GenericRowWithSchema(Array(0L, 7800.0), expectedSchema)) def test(rdd: TimeSeriesRDD): Unit = { val results = rdd.summarize(Summarizers.sum("volume")) assert(results.schema == expectedSchema) assert(results.collect().deep == expectedResults.deep) } { val volumeRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType)) withPartitionStrategy(volumeRdd)(DEFAULT)(test) } } it should "`summarize` per key correctly" in { val expectedSchema = Schema("id" -> IntegerType, "volume_sum" -> DoubleType) val expectedResults = Array[Row]( new GenericRowWithSchema(Array(0L, 7, 4100.0), expectedSchema), new GenericRowWithSchema(Array(0L, 3, 3700.0), expectedSchema) ) def test(rdd: TimeSeriesRDD): Unit = { val results = rdd.summarize(Summarizers.sum("volume"), Seq("id")) assert(results.schema == expectedSchema) assert(results.collect().sortBy(_.getAs[Int]("id")).deep == expectedResults.sortBy(_.getAs[Int]("id")).deep) } { val volumeTSRdd = fromCSV("Volume.csv", Schema("id" -> IntegerType, "volume" -> LongType)) withPartitionStrategy(volumeTSRdd)(DEFAULT)(test) } } }
Example 54
Source File: TimestampCast.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block } import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression } import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types.{ DataType, LongType, TimestampType } case class TimestampToNanos(child: Expression) extends TimestampCast { val dataType: DataType = LongType protected def cast(childPrim: String): String = s"$childPrim * 1000L" override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] * 1000L } case class NanosToTimestamp(child: Expression) extends TimestampCast { val dataType: DataType = TimestampType protected def cast(childPrim: String): String = s"$childPrim / 1000L" override protected def nullSafeEval(input: Any): Any = input.asInstanceOf[Long] / 1000L } object TimestampToNanos { private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String, resultPrim: String, resultNull: String, resultType: DataType): Block = { code""" boolean $resultNull = $childNull; ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)}; if (!${childNull}) { $resultPrim = (long) ${cast(childPrim)}; } """ } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val eval = child.genCode(ctx) ev.copy(code = eval.code + castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType)) } }
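TimestampToNanos and NanosToTimestamp are ordinary unary expressions, so they can be applied by wrapping them in a Column. The shared TimestampCast trait they extend is only partially visible in the excerpt above, so treat the following as a hedged sketch against the full flint source; the sample timestamp is an assumption.

import java.sql.Timestamp

import org.apache.spark.sql.{Column, NanosToTimestamp, SparkSession, TimestampToNanos}
import org.apache.spark.sql.functions.col

object TimestampCastSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("timestamp-cast").getOrCreate()
    import spark.implicits._

    val df = Seq(Timestamp.valueOf("1970-01-01 00:00:01")).toDF("ts")

    // TimestampType is microseconds internally, so * 1000L yields nanoseconds since the epoch.
    val withNanos = df.withColumn("nanos", new Column(TimestampToNanos(col("ts").expr)))
    // The inverse expression maps the long back to a TimestampType column.
    val roundTrip = withNanos.withColumn("ts2", new Column(NanosToTimestamp(col("nanos").expr)))

    roundTrip.show(truncate = false)
    spark.stop()
  }
}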
Example 55
Source File: Schema.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object Schema {
  val GraphElementSchema = StructType(CommonFields ++ Seq(
    StructField("id", StringType, nullable = true),
    StructField("service", StringType, nullable = true),
    StructField("column", StringType, nullable = true),
    StructField("from", StringType, nullable = true),
    StructField("to", StringType, nullable = true),
    StructField("label", StringType, nullable = true),
    StructField("props", StringType, nullable = true)
  ))
}
Example 56
Source File: ResolveTableValuedFunctions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import java.util.Locale import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => // The whole resolution is somewhat difficult to understand here due to too much abstractions. // We should probably rewrite the following at some point. Reynold was just here to improve // error messages and didn't have time to do a proper rewrite. val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match { case Some(tvf) => def failAnalysis(): Nothing = { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: ($argTypes)""".stripMargin) } val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => try { Some(resolver(casted.map(_.eval()))) } catch { case e: AnalysisException => failAnalysis() } case _ => None } } resolved.headOption.getOrElse { failAnalysis() } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } // If alias names assigned, add `Project` with the aliases if (u.outputNames.nonEmpty) { val outputAttrs = resolvedFunc.output // Checks if the number of the aliases is equal to expected one if (u.outputNames.size != outputAttrs.size) { u.failAnalysis(s"Number of given aliases does not match number of output columns. " + s"Function name: ${u.functionName}; number of aliases: " + s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.") } val aliases = outputAttrs.zip(u.outputNames).map { case (attr, name) => Alias(attr, name)() } Project(aliases, resolvedFunc) } else { resolvedFunc } } }
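The rule above is what backs the built-in range table-valued function, whose output column id is LongType. A minimal usage sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.LongType

object RangeTvfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("range-tvf").getOrCreate()

    // range(start, end, step, numPartitions) is resolved by ResolveTableValuedFunctions.
    val df = spark.sql("SELECT * FROM range(0, 10, 2, 1)")

    // The resulting "id" column is LongType.
    assert(df.schema("id").dataType == LongType)
    df.show()

    spark.stop()
  }
}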
Example 57
Source File: GroupedIteratorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 58
Source File: MySQLDialect.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
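MySQLDialect ships with Spark, but the same pattern works for other databases: a custom JdbcDialect can map a vendor type to LongType and be registered through JdbcDialects.registerDialect. The URL prefix and type mapping below are made up purely for illustration:

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

// A hypothetical dialect for a fictional "jdbc:mydb" database.
object MyDbDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    // Map the vendor's (hypothetical) unsigned 32-bit integer type to LongType so it cannot overflow.
    if (sqlType == Types.INTEGER && typeName.equalsIgnoreCase("UNSIGNED INT")) Some(LongType) else None
  }
}

object RegisterDialectSketch {
  def main(args: Array[String]): Unit = {
    // After registration, spark.read.jdbc(...) against a matching URL uses this dialect.
    JdbcDialects.registerDialect(MyDbDialect)
  }
}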
Example 59
Source File: ResolveInlineTablesSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.types.{LongType, NullType} class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables.convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables.convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables.convert(table2) assert(converted2.schema.fields(0).nullable) } }
Example 60
Source File: DecimalExpressionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Decimal, DecimalType, LongType} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
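UnscaledValue and MakeDecimal are internal Catalyst expressions, but the same Long-to-Decimal relationship they test is visible on org.apache.spark.sql.types.Decimal itself. A small sketch:

import org.apache.spark.sql.types.Decimal

object DecimalUnscaledSketch {
  def main(args: Array[String]): Unit = {
    // Decimal(unscaled, precision, scale): unscaled 101 with scale 1 represents 10.1.
    val d = Decimal(101, 3, 1)
    assert(d.toUnscaledLong == 101L)
    assert(d.toString == "10.1")

    // Going the other way: parse the string form and read back the unscaled Long.
    val d2 = Decimal("10.1")
    assert(d2.toUnscaledLong == 101L)
  }
}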
Example 61
Source File: ResolveTableValuedFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans._ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range} import org.apache.spark.sql.catalyst.rules._ import org.apache.spark.sql.types.{DataType, IntegerType, LongType} tvf("start" -> LongType, "end" -> LongType, "step" -> LongType, "numPartitions" -> IntegerType) { case Seq(start: Long, end: Long, step: Long, numPartitions: Int) => Range(start, end, step, Some(numPartitions)) }) ) override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators { case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) => builtinFunctions.get(u.functionName) match { case Some(tvf) => val resolved = tvf.flatMap { case (argList, resolver) => argList.implicitCast(u.functionArgs) match { case Some(casted) => Some(resolver(casted.map(_.eval()))) case _ => None } } resolved.headOption.getOrElse { val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ") u.failAnalysis( s"""error: table-valued function ${u.functionName} with alternatives: |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")} |cannot be applied to: (${argTypes})""".stripMargin) } case _ => u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function") } } }
Example 62
Source File: MonotonicallyIncreasingID.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
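This expression is exposed in the DataFrame API as monotonically_increasing_id(), which produces a non-nullable LongType column: the upper 31 bits carry the partition id and the lower 33 bits a per-partition counter. A usage sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.monotonically_increasing_id
import org.apache.spark.sql.types.LongType

object MonotonicIdSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[4]").appName("mono-id").getOrCreate()
    import spark.implicits._

    val df = Seq("a", "b", "c", "d").toDF("value")
      .withColumn("row_id", monotonically_increasing_id())

    // The generated ids are unique and increasing within each partition, but not consecutive globally.
    assert(df.schema("row_id").dataType == LongType)
    df.show()

    spark.stop()
  }
}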
Example 63
Source File: SQLTransformerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.{LongType, StructField, StructType} class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val result = sqlTrans.transform(original) val resultSchema = sqlTrans.transformSchema(original.schema) val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } }
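The transformSchema test works because spark.range produces a single non-nullable LongType column named id. A standalone sketch of the same transformer outside the test harness:

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.LongType

object SqlTransformerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sql-transformer").getOrCreate()

    // spark.range(n) yields a non-nullable LongType column named "id".
    val df = spark.range(10).toDF()

    val transformer = new SQLTransformer()
      .setStatement("SELECT id + 1 AS id1 FROM __THIS__")

    val out = transformer.transform(df)
    assert(out.schema("id1").dataType == LongType)
    out.show()

    spark.stop()
  }
}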
Example 64
Source File: TestTableStatsSinglePathMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats import org.apache.spark.{SparkContext, SparkConf} import org.apache.spark.sql.Row import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType} import org.scalatest.{FunSuite, BeforeAndAfterEach, BeforeAndAfterAll} class TestTableStatsSinglePathMain extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ test("run table stats on sample data") { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") var sc = new SparkContext("local", "test", sparkConfig) try { val sqlContext = new org.apache.spark.sql.SQLContext(sc) val schema = StructType( Array( StructField("id", LongType, true), StructField("name", StringType, true), StructField("age", LongType, true), StructField("gender", StringType, true), StructField("height", LongType, true), StructField("job_title", StringType, true) ) ) val rowRDD = sc.parallelize(Array( Row(1l, "Name.1", 20l, "M", 6l, "dad"), Row(2l, "Name.2", 20l, "F", 5l, "mom"), Row(3l, "Name.3", 20l, "F", 5l, "mom"), Row(4l, "Name.4", 20l, "M", 5l, "mom"), Row(5l, "Name.5", 10l, "M", 4l, "kid"), Row(6l, "Name.6", 8l, "M", 3l, "kid"))) val df = sqlContext.createDataFrame(rowRDD, schema) val firstPassStats = TableStatsSinglePathMain.getFirstPassStat(df) assertResult(6l)(firstPassStats.columnStatsMap(0).maxLong) assertResult(1l)(firstPassStats.columnStatsMap(0).minLong) assertResult(21l)(firstPassStats.columnStatsMap(0).sumLong) assertResult(3l)(firstPassStats.columnStatsMap(0).avgLong) assertResult(2)(firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.length) firstPassStats.columnStatsMap(3).topNValues.topNCountsForColumnArray.foreach { r => if (r._1.equals("M")) { assertResult(4l)(r._2) } else if (r._1.equals("F")) { assertResult(2l)(r._2) } else { throw new RuntimeException("Unknown gender: " + r._1) } } } finally { sc.stop() } } }
Example 65
Source File: ConfigurableDataGeneratorMain.scala From Spark.TableStatsExample with Apache License 2.0 | 5 votes |
package com.cloudera.sa.examples.tablestats import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.{StringType, LongType, StructField, StructType} import org.apache.spark.{SparkContext, SparkConf} import scala.collection.mutable import scala.util.Random object ConfigurableDataGeneratorMain { def main(args: Array[String]): Unit = { if (args.length == 0) { println("ConfigurableDataGeneratorMain <outputPath> <numberOfColumns> <numberOfRecords> <numberOfPartitions> <local>") return } val outputPath = args(0) val numberOfColumns = args(1).toInt val numberOfRecords = args(2).toInt val numberOfPartitions = args(3).toInt val runLocal = (args.length == 5 && args(4).equals("L")) var sc: SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "test", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("ConfigurableDataGeneratorMain") sc = new SparkContext(sparkConfig) } val sqlContext = new org.apache.spark.sql.SQLContext(sc) //Part A val rowRDD = sc.parallelize( (0 until numberOfPartitions).map( i => i), numberOfPartitions) //Part B val megaDataRDD = rowRDD.flatMap( r => { val random = new Random() val dataRange = (0 until numberOfRecords/numberOfPartitions).iterator dataRange.map[Row]( x => { val values = new mutable.ArrayBuffer[Any] for (i <- 0 until numberOfColumns) { if (i % 2 == 0) { values.+=(random.nextInt(100).toLong) } else { values.+=(random.nextInt(100).toString) } } new GenericRow(values.toArray) }) }) //Part C val schema = StructType( (0 until numberOfColumns).map( i => { if (i % 2 == 0) { StructField("longColumn_" + i, LongType, true) } else { StructField("stringColumn_" + i, StringType, true) } }) ) val df = sqlContext.createDataFrame(megaDataRDD, schema) df.saveAsParquetFile(outputPath) //Part D sc.stop() } }
Example 66
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction("toTs", myTimestampCast)
}
Example 67
Source File: GraphDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration import org.apache.spark.sql.{DataFrame, Row, SparkSession} import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{any2Long, getOrDefault, getOrThrow, time} import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} import org.apache.spark.graphx.util.GraphGenerators object GraphDataGen extends WorkloadDefaults { val name = "graph-data-generator" val defaultMu = 4.0 val defaultSigma = 1.3 val defaultSeed = -1L val defaultNumOfPartitions = 0 override def apply(m: Map[String, Any]): GraphDataGen = { val numVertices = getOrThrow(m, "vertices").asInstanceOf[Int] val mu = getOrDefault[Double](m, "mu", defaultMu) val sigma = getOrDefault[Double](m, "sigma", defaultSigma) val numPartitions = getOrDefault[Int](m, "partitions", defaultNumOfPartitions) val seed = getOrDefault[Long](m, "seed", defaultSeed, any2Long) val output = { val str = getOrThrow(m, "output").asInstanceOf[String] val s = verifySuitabilityOfOutputFileFormat(str) Some(s) } val saveMode = getOrDefault[String](m, "save-mode", SaveModes.error) new GraphDataGen( numVertices = numVertices, input = None, output = output, saveMode = saveMode, mu = mu, sigma = sigma, seed = seed, numPartitions = numPartitions ) } private[datageneration] def verifySuitabilityOfOutputFileFormat(str: String): String = { val strArr: Array[String] = str.split('.') (strArr.length, strArr.last) match { case (1, _) => throw SparkBenchException("Output file for GraphDataGen must have \".txt\" as the file extension." + "Please modify your config file.") case (2, "txt") => str case (_, _) => throw SparkBenchException("Due to limitations of the GraphX GraphLoader, " + "the graph data generators may only save files as \".txt\"." + "Please modify your config file.") } } } case class GraphDataGen ( numVertices: Int, input: Option[String] = None, output: Option[String], saveMode: String, mu: Double = 4.0, sigma: Double = 1.3, seed: Long = 1, numPartitions: Int = 0 ) extends Workload { override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val timestamp = System.currentTimeMillis() val (generateTime, graph) = time(GraphGenerators.logNormalGraph(spark.sparkContext, numVertices, numPartitions, mu, sigma)) val (convertTime, out) = time(graph.edges.map(e => s"${e.srcId.toString} ${e.dstId}")) val (saveTime, _) = time(out.saveAsTextFile(output.get)) val timeResultSchema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("generate", LongType, nullable = true), StructField("convert", LongType, nullable = true), StructField("save", LongType, nullable = true), StructField("total_runtime", LongType, nullable = false) ) ) val total = generateTime + convertTime + saveTime val timeList = spark.sparkContext.parallelize(Seq(Row(GraphDataGen.name, timestamp, generateTime, convertTime, saveTime, total))) spark.createDataFrame(timeList, timeResultSchema) } }
Example 68
Source File: ExtAggregatesSpec.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import com.collective.TestSparkContext import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} import org.scalatest.FlatSpec import org.apache.spark.sql.functions._ import org.apache.spark.sql.ext.functions._ import scala.collection.mutable class ExtAggregatesSpec extends FlatSpec with TestSparkContext { val schema = StructType(Seq( StructField("cookie_id", StringType), StructField("site", StringType), StructField("impressions", LongType) )) val cookie1 = "cookie1" val cookie2 = "cookie2" val cookie3 = "cookie3" val impressionLog = sqlContext.createDataFrame(sc.parallelize(Seq( Row(cookie1, "google.com", 10L), Row(cookie1, "cnn.com", 14L), Row(cookie1, "google.com", 2L), Row(cookie2, "bbc.com", 20L), Row(cookie2, "auto.com", null), Row(cookie2, "auto.com", 1L), Row(cookie3, "sport.com", 100L) )), schema) "Ext Aggregates" should "collect column values as array" in { val cookies = impressionLog .select(collectArray(col("cookie_id"))) .first().getAs[mutable.WrappedArray[String]](0) assert(cookies.length == 7) assert(cookies.toSet.size == 3) } it should "collect distinct values as array" in { val distinctCookies = impressionLog.select(col("cookie_id")) .distinct() .select(collectArray(col("cookie_id"))) .first().getAs[mutable.WrappedArray[String]](0) assert(distinctCookies.length == 3) } it should "collect values after group by" in { val result = impressionLog .groupBy(col("cookie_id")) .agg(collectArray(col("site"))) val cookieSites = result.collect().map { case Row(cookie: String, sites: mutable.WrappedArray[_]) => cookie -> sites.toSeq }.toMap assert(cookieSites(cookie1).length == 3) assert(cookieSites(cookie2).length == 3) assert(cookieSites(cookie3).length == 1) } }
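collectArray here is a spark-ext aggregate; for reference, current Spark versions ship a comparable built-in, collect_list, which can be combined with sum over a LongType column. A sketch under that assumption (it does not use the spark-ext API):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{collect_list, sum}

object CollectListSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("collect-list").getOrCreate()
    import spark.implicits._

    val impressions = Seq(
      ("cookie1", "google.com", 10L),
      ("cookie1", "cnn.com", 14L),
      ("cookie2", "bbc.com", 20L)).toDF("cookie_id", "site", "impressions")

    // Built-in aggregates: collect the sites per cookie and sum the LongType impressions.
    impressions.groupBy("cookie_id")
      .agg(collect_list("site").as("sites"), sum("impressions").as("total_impressions"))
      .show(truncate = false)

    spark.stop()
  }
}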
Example 69
Source File: SparkEsBulkWriterSpec.scala From Spark2Elasticsearch with Apache License 2.0 | 5 votes |
package com.github.jparkie.spark.elasticsearch

import com.github.jparkie.spark.elasticsearch.conf.{ SparkEsMapperConf, SparkEsWriteConf }
import com.github.jparkie.spark.elasticsearch.sql.{ SparkEsDataFrameMapper, SparkEsDataFrameSerializer }
import com.holdenkarau.spark.testing.SharedSparkContext
import org.apache.spark.sql.types.{ LongType, StringType, StructField, StructType }
import org.apache.spark.sql.{ Row, SQLContext }
import org.scalatest.{ MustMatchers, WordSpec }

class SparkEsBulkWriterSpec extends WordSpec with MustMatchers with SharedSparkContext {
  val esServer = new ElasticSearchServer()

  override def beforeAll(): Unit = {
    super.beforeAll()

    esServer.start()
  }

  override def afterAll(): Unit = {
    esServer.stop()

    super.afterAll()
  }

  "SparkEsBulkWriter" must {
    "execute write() successfully" in {
      esServer.createAndWaitForIndex("test_index")

      val sqlContext = new SQLContext(sc)

      val inputSparkEsWriteConf = SparkEsWriteConf(
        bulkActions = 10,
        bulkSizeInMB = 1,
        concurrentRequests = 0,
        flushTimeoutInSeconds = 1
      )
      val inputMapperConf = SparkEsMapperConf(
        esMappingId = Some("id"),
        esMappingParent = None,
        esMappingVersion = None,
        esMappingVersionType = None,
        esMappingRouting = None,
        esMappingTTLInMillis = None,
        esMappingTimestamp = None
      )
      val inputSchema = StructType(
        Array(
          StructField("id", StringType, true),
          StructField("parent", StringType, true),
          StructField("version", LongType, true),
          StructField("routing", StringType, true),
          StructField("ttl", LongType, true),
          StructField("timestamp", StringType, true),
          StructField("value", LongType, true)
        )
      )
      val inputData = sc.parallelize {
        Array(
          Row("TEST_ID_1", "TEST_PARENT_1", 1L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 1L),
          Row("TEST_ID_1", "TEST_PARENT_2", 2L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 2L),
          Row("TEST_ID_1", "TEST_PARENT_3", 3L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 3L),
          Row("TEST_ID_1", "TEST_PARENT_4", 4L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 4L),
          Row("TEST_ID_1", "TEST_PARENT_5", 5L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 5L),
          Row("TEST_ID_5", "TEST_PARENT_6", 6L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 6L),
          Row("TEST_ID_6", "TEST_PARENT_7", 7L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 7L),
          Row("TEST_ID_7", "TEST_PARENT_8", 8L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 8L),
          Row("TEST_ID_8", "TEST_PARENT_9", 9L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 9L),
          Row("TEST_ID_9", "TEST_PARENT_10", 10L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 10L),
          Row("TEST_ID_10", "TEST_PARENT_11", 11L, "TEST_ROUTING_1", 86400000L, "TEST_TIMESTAMP_1", 11L)
        )
      }
      val inputDataFrame = sqlContext.createDataFrame(inputData, inputSchema)
      val inputDataIterator = inputDataFrame.rdd.toLocalIterator
      val inputSparkEsBulkWriter = new SparkEsBulkWriter[Row](
        esIndex = "test_index",
        esType = "test_type",
        esClient = () => esServer.client,
        sparkEsSerializer = new SparkEsDataFrameSerializer(inputSchema),
        sparkEsMapper = new SparkEsDataFrameMapper(inputMapperConf),
        sparkEsWriteConf = inputSparkEsWriteConf
      )

      inputSparkEsBulkWriter.write(null, inputDataIterator)

      val outputGetResponse = esServer.client.prepareGet("test_index", "test_type", "TEST_ID_1").get()

      outputGetResponse.isExists mustEqual true
      outputGetResponse.getSource.get("parent").asInstanceOf[String] mustEqual "TEST_PARENT_5"
      outputGetResponse.getSource.get("version").asInstanceOf[Integer] mustEqual 5
      outputGetResponse.getSource.get("routing").asInstanceOf[String] mustEqual "TEST_ROUTING_1"
      outputGetResponse.getSource.get("ttl").asInstanceOf[Integer] mustEqual 86400000
      outputGetResponse.getSource.get("timestamp").asInstanceOf[String] mustEqual "TEST_TIMESTAMP_1"
      outputGetResponse.getSource.get("value").asInstanceOf[Integer] mustEqual 5
    }
  }
}
Example 70
Source File: GroupedIteratorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 71
Source File: MySQLDialect.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
Example 72
Source File: ResolveInlineTablesSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis import org.scalatest.BeforeAndAfter import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions.{Literal, Rand} import org.apache.spark.sql.catalyst.expressions.aggregate.Count import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.types.{LongType, NullType} class ResolveInlineTablesSuite extends PlanTest with BeforeAndAfter { private def lit(v: Any): Literal = Literal(v) test("validate inputs are foldable") { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1))))) // nondeterministic (rand) should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1))))) } // aggregate should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1)))))) } // unresolved attribute should not work intercept[AnalysisException] { ResolveInlineTables.validateInputEvaluable( UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A"))))) } } test("validate input dimensions") { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2))))) // num alias != data dimension intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2))))) } // num alias == data dimension, but data themselves are inconsistent intercept[AnalysisException] { ResolveInlineTables.validateInputDimension( UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22))))) } } test("do not fire the rule if not all expressions are resolved") { val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A")))) assert(ResolveInlineTables(table) == table) } test("convert") { val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted = ResolveInlineTables.convert(table) assert(converted.output.map(_.dataType) == Seq(LongType)) assert(converted.data.size == 2) assert(converted.data(0).getLong(0) == 1L) assert(converted.data(1).getLong(0) == 2L) } test("nullability inference in convert") { val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L)))) val converted1 = ResolveInlineTables.convert(table1) assert(!converted1.schema.fields(0).nullable) val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType)))) val converted2 = ResolveInlineTables.convert(table2) assert(converted2.schema.fields(0).nullable) } }
Example 73
Source File: RandomSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
}
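Rand and Randn take a Long seed; the same seeding is available through the DataFrame functions rand(seed: Long) and randn(seed: Long). A small sketch:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{rand, randn}

object SeededRandomSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("seeded-random").getOrCreate()

    // A Long seed, as exercised by the "codegen with long seed" test above.
    val seed = 5419823303878592871L

    spark.range(5)
      .withColumn("u", rand(seed))   // uniform in [0, 1)
      .withColumn("n", randn(seed))  // standard normal
      .show(truncate = false)

    spark.stop()
  }
}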
Example 74
Source File: DecimalExpressionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Decimal, DecimalType, LongType} class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper { test("UnscaledValue") { val d1 = Decimal("10.1") checkEvaluation(UnscaledValue(Literal(d1)), 101L) val d2 = Decimal(101, 3, 1) checkEvaluation(UnscaledValue(Literal(d2)), 101L) checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null) } test("MakeDecimal") { checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1")) checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null) } test("PromotePrecision") { val d1 = Decimal("10.1") checkEvaluation(PromotePrecision(Literal(d1)), d1) val d2 = Decimal(101, 3, 1) checkEvaluation(PromotePrecision(Literal(d2)), d2) checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null) } test("CheckOverflow") { val d1 = Decimal("10.1") checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1) checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null) val d2 = Decimal(101, 3, 1) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10")) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2) checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null) checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null) } }
Example 75
Source File: MonotonicallyIncreasingID.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, "") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "") ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
Example 76
Source File: SQLTransformerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.types.{LongType, StructField, StructType} class SQLTransformerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new SQLTransformer()) } test("transform numeric data") { val original = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2") val sqlTrans = new SQLTransformer().setStatement( "SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__") val result = sqlTrans.transform(original) val resultSchema = sqlTrans.transformSchema(original.schema) val expected = Seq((0, 1.0, 3.0, 4.0, 3.0), (2, 2.0, 5.0, 7.0, 10.0)) .toDF("id", "v1", "v2", "v3", "v4") assert(result.schema.toString == resultSchema.toString) assert(resultSchema == expected.schema) assert(result.collect().toSeq == expected.collect().toSeq) assert(original.sparkSession.catalog.listTables().count() == 0) } test("read/write") { val t = new SQLTransformer() .setStatement("select * from __THIS__") testDefaultReadWrite(t) } test("transformSchema") { val df = spark.range(10) val outputSchema = new SQLTransformer() .setStatement("SELECT id + 1 AS id1 FROM __THIS__") .transformSchema(df.schema) val expected = StructType(Seq(StructField("id1", LongType, nullable = false))) assert(outputSchema === expected) } }
Example 77
Source File: LinearRegressionDataGen.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.datageneration.mlgenerator import org.apache.spark.mllib.util.LinearDataGenerator import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row, SparkSession} import com.ibm.sparktc.sparkbench.utils.{SaveModes, SparkBenchException} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions.{getOrDefault, getOrThrow, time} import com.ibm.sparktc.sparkbench.utils.SparkFuncs.writeToDisk import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} object LinearRegressionDataGen extends WorkloadDefaults { val name = "data-generation-lr" // Application parameters #1million points have 200M data size val numOfExamples: Int = 40000 val numOfFeatures: Int = 4 val eps: Double = 0.5 val intercepts: Double = 0.1 val numOfPartitions: Int = 10 val maxIteration: Int = 3 override def apply(m: Map[String, Any]) = new LinearRegressionDataGen( numRows = getOrThrow(m, "rows").asInstanceOf[Int], numCols = getOrThrow(m, "cols").asInstanceOf[Int], output = Some(getOrThrow(m, "output").asInstanceOf[String]), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), eps = getOrDefault[Double](m, "eps", eps), intercepts = getOrDefault[Double](m, "intercepts", intercepts), numPartitions = getOrDefault[Int](m, "partitions", numOfPartitions) ) } case class LinearRegressionDataGen ( numRows: Int, numCols: Int, input: Option[String] = None, output: Option[String], saveMode: String, eps: Double, intercepts: Double, numPartitions: Int ) extends Workload { override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val timestamp = System.currentTimeMillis() val (generateTime, data): (Long, RDD[LabeledPoint]) = time { LinearDataGenerator.generateLinearRDD( spark.sparkContext, numRows, numCols, eps, numPartitions, intercepts ) } import spark.implicits._ val (convertTime, dataDF) = time { data.toDF } val (saveTime, _) = time { val outputstr = output.get if(outputstr.endsWith(".csv")) throw SparkBenchException("LabeledPoints cannot be saved to CSV. Please try outputting to Parquet instead.") writeToDisk(output.get, saveMode, dataDF, spark) }//TODO you can't output this to CSV. Parquet is fine val timeResultSchema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("generate", LongType, nullable = true), StructField("convert", LongType, nullable = true), StructField("save", LongType, nullable = true), StructField("total_runtime", LongType, nullable = false) ) ) val total = generateTime + convertTime + saveTime val timeList = spark.sparkContext.parallelize(Seq(Row("kmeans", timestamp, generateTime, convertTime, saveTime, total))) spark.createDataFrame(timeList, timeResultSchema) } }
Example 78
Source File: MyUDF.scala From spark-tools with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.FunctionIdentifier
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.TimestampType

object MyUDF {

  private def myTimestampCast(xs: Seq[Expression]): Expression = {
    val expSource = xs.head
    expSource.dataType match {
      case LongType =>
        new Column(expSource).divide(Literal(1000)).cast(TimestampType).expr
      case TimestampType =>
        expSource
    }
  }

  def register(sparkSession: SparkSession): Unit =
    sparkSession.sessionState.functionRegistry
      .registerFunction(FunctionIdentifier("toTs", None), myTimestampCast)
}
Example 79
Source File: PartitionAndSleepWorkload.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.exercise import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.sql.{DataFrame, Row, SparkSession} import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._ import com.ibm.sparktc.sparkbench.utils.SaveModes import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} object PartitionAndSleepWorkload extends WorkloadDefaults { val name = "timedsleep" val partitions: Int = 48 val sleepms: Long = 12000L def apply(m: Map[String, Any]) = new PartitionAndSleepWorkload( input = None, output = None, partitions = getOrDefault[Int](m, "partitions", partitions), sleepMS = getOrDefault[Long](m, "sleepms", sleepms, any2Long)) } case class PartitionAndSleepWorkload(input: Option[String] = None, output: Option[String] = None, saveMode: String = SaveModes.error, partitions: Int, sleepMS: Long) extends Workload { def doStuff(spark: SparkSession): (Long, Unit) = time { val ms = sleepMS val stuff: RDD[Int] = spark.sparkContext.parallelize(0 until partitions * 100, partitions) val cool: RDD[(Int, Int)] = stuff.map { i => Thread.sleep(ms) (i % 10, i + 42) } val yeah = cool.reduceByKey(_ + _) yeah.collect() } override def doWorkload(df: Option[DataFrame] = None, spark: SparkSession): DataFrame = { val (t, _) = doStuff(spark) val schema = StructType( List( StructField("name", StringType, nullable = false), StructField("timestamp", LongType, nullable = false), StructField("runtime", LongType, nullable = false) ) ) val timeList = spark.sparkContext.parallelize(Seq(Row("timedsleep", System.currentTimeMillis(), t))) spark.createDataFrame(timeList, schema) } }
Example 80
Source File: CubeMakerTest.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.test.cube import java.sql.Timestamp import com.github.nscala_time.time.Imports._ import com.stratio.sparta.driver.step.{Cube, CubeOperations, Trigger} import com.stratio.sparta.driver.writer.WriterOptions import com.stratio.sparta.plugin.default.DefaultField import com.stratio.sparta.plugin.cube.field.datetime.DateTimeField import com.stratio.sparta.plugin.cube.operator.count.CountOperator import com.stratio.sparta.sdk.pipeline.aggregation.cube.{Dimension, DimensionValue, DimensionValuesTime, InputFields} import com.stratio.sparta.sdk.pipeline.schema.TypeOp import com.stratio.sparta.sdk.utils.AggregationTime import org.apache.spark.sql.Row import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType, TimestampType} import org.apache.spark.streaming.TestSuiteBase import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class CubeMakerTest extends TestSuiteBase { val PreserverOrder = false def getEventOutput(timestamp: Timestamp, millis: Long): Seq[Seq[(DimensionValuesTime, InputFields)]] = { val dimensionString = Dimension("dim1", "eventKey", "identity", new DefaultField) val dimensionTime = Dimension("minute", "minute", "minute", new DateTimeField) val dimensionValueString1 = DimensionValue(dimensionString, "value1") val dimensionValueString2 = dimensionValueString1.copy(value = "value2") val dimensionValueString3 = dimensionValueString1.copy(value = "value3") val dimensionValueTs = DimensionValue(dimensionTime, timestamp) val tsMap = Row(timestamp) val valuesMap1 = InputFields(Row("value1", timestamp), 1) val valuesMap2 = InputFields(Row("value2", timestamp), 1) val valuesMap3 = InputFields(Row("value3", timestamp), 1) Seq(Seq( (DimensionValuesTime("cubeName", Seq(dimensionValueString1, dimensionValueTs)), valuesMap1), (DimensionValuesTime("cubeName", Seq(dimensionValueString2, dimensionValueTs)), valuesMap2), (DimensionValuesTime("cubeName", Seq(dimensionValueString3, dimensionValueTs)), valuesMap3) )) } }
Example 81
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License | 5 votes |
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException

class HttpStreamServerClientTest {
  val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
    Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
    Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

  val ROWS2 = Array(Row("hello"),
    Row("world"),
    Row("bye"),
    Row("world"));

  @Test
  def testHttpStreamIO() {
    //starts a http server
    val kryoSerializer = new KryoSerializer(new SparkConf());
    val server = HttpStreamServer.start("/xxxx", 8080);

    val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
      .getOrCreate();
    spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

    val sqlContext = spark.sqlContext;
    import spark.implicits._

    //add a local message buffer to server, with 2 topics registered
    server.withBuffer()
      .addListener(new StreamPrinter())
      .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
      .createTopic[String]("topic-2");

    val client = HttpStreamClient.connect("http://localhost:8080/xxxx");

    //tests schema of topics
    val schema1 = client.fetchSchema("topic-1");
    Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
      schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    val schema2 = client.fetchSchema("topic-2");
    Assert.assertArrayEquals(Array[Object](StringType),
      schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    //prepare to consume messages
    val sid1 = client.subscribe("topic-1")._1;
    val sid2 = client.subscribe("topic-2")._1;

    //produces some data
    client.sendRows("topic-1", 1, ROWS1);

    val sid4 = client.subscribe("topic-1")._1;
    val sid5 = client.subscribe("topic-2")._1;

    client.sendRows("topic-2", 1, ROWS2);

    //consumes data
    val fetched = client.fetchStream(sid1).map(_.originalRow);
    Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
    //it is empty now
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

    client.unsubscribe(sid4);
    try {
      client.fetchStream(sid4);
      //exception should be thrown, because subscriber id is invalidated
      Assert.assertTrue(false);
    } catch {
      case e: Throwable ⇒
        e.printStackTrace();
        Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
    }

    server.stop();
  }
}
Example 82
Source File: hierarchyGen.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.test

import org.apache.spark.sql.types.{LongType, Node}
import org.scalacheck.{Arbitrary, Gen}

import scala.util.Random
import scalaz._
import Scalaz._
import scalaz.scalacheck.ScalazArbitrary._

// scalastyle:off file.size.limit

object HierarchyGen {

  val MIN_SIZE_TREE = 6
  val MAX_SIZE_TREE = 100

  def next(): Long = {
    synchronized {
      if (currentSeq == Long.MaxValue) {
        currentSeq = Long.MinValue
      }
      val result = currentSeq
      currentSeq += 1
      result
    }
  }

  def arb: Arbitrary[Long] = Arbitrary { gen }

  def gen: Gen[Long] = Gen.resultOf[Int, Long] { x => next() }
}
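The generator above threads a mutable counter through ScalaCheck. As an alternative, self-contained sketch (independent of the HierarchyGen helper and its elided currentSeq state), stock ScalaCheck generators can produce arbitrary Long values directly:

import org.scalacheck.{Arbitrary, Gen, Prop}

object LongGenSketch {
  // Uniformly chosen Long values across the full range.
  val fullRangeLong: Gen[Long] = Gen.choose(Long.MinValue, Long.MaxValue)

  // An Arbitrary instance so properties can pull Long values implicitly, mirroring arb above.
  implicit val arbLong: Arbitrary[Long] = Arbitrary(fullRangeLong)

  def main(args: Array[String]): Unit = {
    // A trivial property over generated Long values.
    val prop = Prop.forAll(fullRangeLong) { x => x <= Long.MaxValue }
    prop.check()
  }
}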
Example 83
Source File: GroupedIteratorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructType} class GroupedIteratorSuite extends SparkFunSuite { test("basic") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 1) key.getInt(0) -> data.map(encoder.fromRow).toSeq }.toSeq assert(result == 1 -> Seq(input(0), input(1)) :: 2 -> Seq(input(2)) :: Nil) } test("group by 2 columns") { val schema = new StructType().add("i", IntegerType).add("l", LongType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq( Row(1, 2L, "a"), Row(1, 2L, "b"), Row(1, 3L, "c"), Row(2, 1L, "d"), Row(3, 2L, "e")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0), 'l.long.at(1)), schema.toAttributes) val result = grouped.map { case (key, data) => assert(key.numFields == 2) (key.getInt(0), key.getLong(1), data.map(encoder.fromRow).toSeq) }.toSeq assert(result == (1, 2L, Seq(input(0), input(1))) :: (1, 3L, Seq(input(2))) :: (2, 1L, Seq(input(3))) :: (3, 2L, Seq(input(4))) :: Nil) } test("do nothing to the value iterator") { val schema = new StructType().add("i", IntegerType).add("s", StringType) val encoder = RowEncoder(schema).resolveAndBind() val input = Seq(Row(1, "a"), Row(1, "b"), Row(2, "c")) val grouped = GroupedIterator(input.iterator.map(encoder.toRow), Seq('i.int.at(0)), schema.toAttributes) assert(grouped.length == 2) } }
Example 84
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
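The operator stores its running row count as a single LongType value in the state store. A small sketch of just the row-packing part, using the same Catalyst helpers (these are internal APIs and may change between Spark versions):

import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{LongType, StructField, StructType}

object LongStateRowSketch {
  def main(args: Array[String]): Unit = {
    // Same value schema as StreamingGlobalLimitExec: a single LongType column.
    val valueSchema = StructType(Array(StructField("value", LongType)))
    val projection = UnsafeProjection.create(valueSchema)

    // Pack a running count into an UnsafeRow, then read it back.
    val row = projection(new GenericInternalRow(Array[Any](42L)))
    assert(row.getLong(0) == 42L)
    println(s"packed count = ${row.getLong(0)}")
  }
}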
Example 85
Source File: MySQLDialect.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
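This dialect ships with Spark, but the same JdbcDialect hooks are public developer API, so a user-defined dialect can be registered for other databases. A hedged sketch follows; the JDBC URL prefix and the INT UNSIGNED to LongType mapping are made up for illustration, not documented behavior of any real driver.

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

// Hypothetical dialect for a fictional "exampledb" JDBC driver.
object ExampleDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:exampledb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    // Surface unsigned 32-bit integers as LongType so large values cannot overflow IntegerType.
    if (sqlType == Types.INTEGER && typeName.equalsIgnoreCase("INT UNSIGNED")) Some(LongType)
    else None
  }
}

object RegisterExampleDialect {
  def main(args: Array[String]): Unit = {
    // After registration, Spark consults ExampleDialect for any JDBC URL it can handle.
    JdbcDialects.registerDialect(ExampleDialect)
  }
}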
Example 86
Source File: ResolveInlineTablesSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Cast, Literal, Rand}
import org.apache.spark.sql.catalyst.expressions.aggregate.Count
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.{LongType, NullType, TimestampType}

class ResolveInlineTablesSuite extends AnalysisTest with BeforeAndAfter {

  private def lit(v: Any): Literal = Literal(v)

  test("validate inputs are foldable") {
    ResolveInlineTables(conf).validateInputEvaluable(
      UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)))))

    // nondeterministic (rand) should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Rand(1)))))
    }

    // aggregate should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(Count(lit(1))))))
    }

    // unresolved attribute should not work
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputEvaluable(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(UnresolvedAttribute("A")))))
    }
  }

  test("validate input dimensions") {
    ResolveInlineTables(conf).validateInputDimension(
      UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2)))))

    // num alias != data dimension
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(lit(1)), Seq(lit(2)))))
    }

    // num alias == data dimension, but data themselves are inconsistent
    intercept[AnalysisException] {
      ResolveInlineTables(conf).validateInputDimension(
        UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(21), lit(22)))))
    }
  }

  test("do not fire the rule if not all expressions are resolved") {
    val table = UnresolvedInlineTable(Seq("c1", "c2"), Seq(Seq(UnresolvedAttribute("A"))))
    assert(ResolveInlineTables(conf)(table) == table)
  }

  test("convert") {
    val table = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted = ResolveInlineTables(conf).convert(table)

    assert(converted.output.map(_.dataType) == Seq(LongType))
    assert(converted.data.size == 2)
    assert(converted.data(0).getLong(0) == 1L)
    assert(converted.data(1).getLong(0) == 2L)
  }

  test("convert TimeZoneAwareExpression") {
    val table = UnresolvedInlineTable(Seq("c1"),
      Seq(Seq(Cast(lit("1991-12-06 00:00:00.0"), TimestampType))))
    val withTimeZone = ResolveTimeZone(conf).apply(table)
    val LocalRelation(output, data, _) = ResolveInlineTables(conf).apply(withTimeZone)
    val correct = Cast(lit("1991-12-06 00:00:00.0"), TimestampType)
      .withTimeZone(conf.sessionLocalTimeZone).eval().asInstanceOf[Long]
    assert(output.map(_.dataType) == Seq(TimestampType))
    assert(data.size == 1)
    assert(data.head.getLong(0) == correct)
  }

  test("nullability inference in convert") {
    val table1 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(lit(2L))))
    val converted1 = ResolveInlineTables(conf).convert(table1)
    assert(!converted1.schema.fields(0).nullable)

    val table2 = UnresolvedInlineTable(Seq("c1"), Seq(Seq(lit(1)), Seq(Literal(null, NullType))))
    val converted2 = ResolveInlineTables(conf).convert(table2)
    assert(converted2.schema.fields(0).nullable)
  }
}
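At the SQL surface, the coercion exercised by the "convert" test shows up when a VALUES clause mixes integer widths: the analyzer widens the shared column to bigint (LongType). A small illustrative program, assuming only a local SparkSession; the object name is arbitrary.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.LongType

object InlineTableSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("inline-table").getOrCreate()
    // One row is an INT literal, the other a BIGINT; the inline table's column is widened.
    val df = spark.sql("SELECT * FROM VALUES (1), (CAST(2 AS BIGINT)) AS t(c1)")
    assert(df.schema("c1").dataType == LongType)
    df.show()
    spark.stop()
  }
}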
Example 87
Source File: RandomSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.scalatest.Matchers._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{IntegerType, LongType}

class RandomSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("random") {
    checkDoubleEvaluation(Rand(30), 0.31429268272540556 +- 0.001)
    checkDoubleEvaluation(Randn(30), -0.4798519469521663 +- 0.001)

    checkDoubleEvaluation(
      new Rand(Literal.create(null, LongType)), 0.8446490682263027 +- 0.001)
    checkDoubleEvaluation(
      new Randn(Literal.create(null, IntegerType)), 1.1164209726833079 +- 0.001)
  }

  test("SPARK-9127 codegen with long seed") {
    checkDoubleEvaluation(Rand(5419823303878592871L), 0.2304755080444375 +- 0.001)
    checkDoubleEvaluation(Randn(5419823303878592871L), -1.2824262718225607 +- 0.001)
  }
}
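The Rand and Randn expressions tested above back the public rand(seed) and randn(seed) functions, both of which take a Long seed; that is why the SPARK-9127 test uses a seed larger than Int.MaxValue. A brief usage sketch, assuming a local SparkSession; column and app names are arbitrary.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{rand, randn}

object RandomSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("random-sketch").getOrCreate()
    // A Long seed keeps results reproducible across runs for the same data layout.
    val df = spark.range(5).select(
      rand(5419823303878592871L).as("uniform"),
      randn(5419823303878592871L).as("gaussian"))
    df.show()
    spark.stop()
  }
}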
Example 88
Source File: DecimalExpressionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Decimal, DecimalType, LongType}

class DecimalExpressionSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("UnscaledValue") {
    val d1 = Decimal("10.1")
    checkEvaluation(UnscaledValue(Literal(d1)), 101L)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(UnscaledValue(Literal(d2)), 101L)
    checkEvaluation(UnscaledValue(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("MakeDecimal") {
    checkEvaluation(MakeDecimal(Literal(101L), 3, 1), Decimal("10.1"))
    checkEvaluation(MakeDecimal(Literal.create(null, LongType), 3, 1), null)
  }

  test("PromotePrecision") {
    val d1 = Decimal("10.1")
    checkEvaluation(PromotePrecision(Literal(d1)), d1)
    val d2 = Decimal(101, 3, 1)
    checkEvaluation(PromotePrecision(Literal(d2)), d2)
    checkEvaluation(PromotePrecision(Literal.create(null, DecimalType(2, 1))), null)
  }

  test("CheckOverflow") {
    val d1 = Decimal("10.1")
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 1)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 2)), d1)
    checkEvaluation(CheckOverflow(Literal(d1), DecimalType(4, 3)), null)

    val d2 = Decimal(101, 3, 1)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 0)), Decimal("10"))
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 1)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 2)), d2)
    checkEvaluation(CheckOverflow(Literal(d2), DecimalType(4, 3)), null)

    checkEvaluation(CheckOverflow(Literal.create(null, DecimalType(2, 1)), DecimalType(3, 2)), null)
  }
}
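UnscaledValue and MakeDecimal convert between a Decimal and its unscaled long representation, while CheckOverflow nulls out values that no longer fit the target precision and scale. A small sketch of the underlying Decimal operations, assuming only the public org.apache.spark.sql.types.Decimal API; the object name is arbitrary.

import org.apache.spark.sql.types.Decimal

object DecimalSketch {
  def main(args: Array[String]): Unit = {
    // UnscaledValue exposes this: 10.1 at scale 1 is stored as the unscaled long 101.
    println(Decimal("10.1").toUnscaledLong)          // 101

    // MakeDecimal goes the other way: unscaled long plus precision/scale back to a Decimal.
    println(Decimal(101L, 3, 1))                     // 10.1

    // changePrecision mirrors CheckOverflow: it reports false (null in SQL) when the
    // value cannot be represented at the requested precision and scale.
    println(Decimal("10.1").changePrecision(4, 1))   // true  -- fits DecimalType(4, 1)
    println(Decimal("10.1").changePrecision(4, 3))   // false -- 10.100 needs precision 5
  }
}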