org.apache.spark.sql.types.DataType Scala Examples
The following examples show how to use org.apache.spark.sql.types.DataType.
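As a quick orientation before the project excerpts, here is a small self-contained sketch (not taken from any of the projects below; the object and field names are illustrative only) of the DataType operations that recur throughout these examples: building a StructType, pattern matching on a field's type, rendering a type as a string, and round-tripping a type through JSON.

import org.apache.spark.sql.types._

object DataTypeBasics {
  def main(args: Array[String]): Unit = {
    // Build a schema programmatically; StructType is itself a DataType.
    val schema: StructType = StructType(Seq(
      StructField("id", LongType, nullable = false),
      StructField("price", DoubleType, nullable = true)
    ))

    // Inspect a field's DataType and pattern match on it.
    val priceType: DataType = schema("price").dataType
    val label = priceType match {
      case DoubleType => "double column"
      case _: NumericType => "some other numeric column"
      case _ => "non-numeric column"
    }
    println(label)

    // Render types the way several examples below do (simpleString / catalogString).
    println(schema.simpleString)
    println(priceType.catalogString)

    // Round-trip a type through its JSON representation, as the Grok UDF example does.
    val parsed: DataType = DataType.fromJson(schema.json)
    assert(parsed == schema)
  }
}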
Example 1
Source File: AvroDataToCatalyst.scala From spark-schema-registry with Apache License 2.0 | 6 votes |
package com.hortonworks.spark.registry.avro

import java.io.ByteArrayInputStream

import com.hortonworks.registries.schemaregistry.{SchemaVersionInfo, SchemaVersionKey}
import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient
import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotDeserializer
import org.apache.avro.Schema
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{BinaryType, DataType}

import scala.collection.JavaConverters._

case class AvroDataToCatalyst(child: Expression, schemaName: String, version: Option[Int], config: Map[String, Object])
  extends UnaryExpression with ExpectsInputTypes {

  override def inputTypes = Seq(BinaryType)

  @transient private lazy val srDeser: AvroSnapshotDeserializer = {
    val obj = new AvroSnapshotDeserializer()
    obj.init(config.asJava)
    obj
  }

  @transient private lazy val srSchema = fetchSchemaVersionInfo(schemaName, version)

  @transient private lazy val avroSchema = new Schema.Parser().parse(srSchema.getSchemaText)

  override lazy val dataType: DataType = SchemaConverters.toSqlType(avroSchema).dataType

  @transient private lazy val avroDeser = new AvroDeserializer(avroSchema, dataType)

  override def nullable: Boolean = true

  override def nullSafeEval(input: Any): Any = {
    val binary = input.asInstanceOf[Array[Byte]]
    val row = avroDeser.deserialize(srDeser.deserialize(new ByteArrayInputStream(binary), srSchema.getVersion))
    val result = row match {
      case r: InternalRow => r.copy()
      case _ => row
    }
    result
  }

  override def simpleString: String = {
    s"from_sr(${child.sql}, ${dataType.simpleString})"
  }

  override def sql: String = {
    s"from_sr(${child.sql}, ${dataType.catalogString})"
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val expr = ctx.addReferenceObj("this", this)
    defineCodeGen(ctx, ev, input => s"(${ctx.boxedType(dataType)})$expr.nullSafeEval($input)")
  }

  private def fetchSchemaVersionInfo(schemaName: String, version: Option[Int]): SchemaVersionInfo = {
    val srClient = new SchemaRegistryClient(config.asJava)
    version.map(v => srClient.getSchemaVersionInfo(new SchemaVersionKey(schemaName, v)))
      .getOrElse(srClient.getLatestSchemaVersionInfo(schemaName))
  }
}
Example 2
Source File: DCT.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

// NOTE: the enclosing DCT class declaration and its `inverse` parameter definition
// are elided in this excerpt.
@Since("1.5.0")
def getInverse: Boolean = $(inverse)

setDefault(inverse -> false)

override protected def createTransformFunc: Vector => Vector = { vec =>
  val result = vec.toArray
  val jTransformer = new DoubleDCT_1D(result.length)
  if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
  Vectors.dense(result)
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
}

override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 3
Source File: SparkNarrow.scala From spark-tools with Apache License 2.0 | 5 votes |
package io.univalence.centrifuge

import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType

sealed trait SType {
  def typeName: String
}

case class SOption(arg: SType) extends SType {
  override def typeName: String = s"Option[${arg.typeName}]"
}

case class SClass(name: String) extends SType {
  override def typeName: String = name
}

case class SCC(names: Seq[String], args: Seq[(String, SType)]) extends SType {
  def classDef: String =
    s"case class ${names.last}(${args.map({ case (n, t) => s"$n:${t.typeName}" }).mkString(",")} )"

  override def typeName: String = names.mkString(".")
}

object Sparknarrow {

  def dataTypeToTypeName(dataType: DataType): String =
    dataType.simpleString.capitalize match {
      case "Date" => "java.sql.Date"
      case "Int" => "scala.Int"
      case x => s"java.lang.$x"
    }

  def basicCC(schema: StructType, pck: Option[String] = None, name: String = "_Cc"): SCC =
    SCC(
      names = pck.toSeq ++ List(name),
      schema.map(strucField => {
        strucField.name -> SOption(SClass(dataTypeToTypeName(strucField.dataType)))
      })
    )
}
Example 4
Source File: Tokenizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}

// NOTE: the enclosing RegexTokenizer class declaration and its parameter definitions
// are elided in this excerpt.
@Since("1.6.0")
def getToLowercase: Boolean = $(toLowercase)

setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+", toLowercase -> true)

override protected def createTransformFunc: String => Seq[String] = { originStr =>
  val re = $(pattern).r
  val str = if ($(toLowercase)) originStr.toLowerCase() else originStr
  val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq
  val minLength = $(minTokenLength)
  tokens.filter(_.length >= minLength)
}

override protected def validateInputType(inputType: DataType): Unit = {
  require(inputType == StringType, s"Input type must be string type but got $inputType.")
}

override protected def outputDataType: DataType = new ArrayType(StringType, true)

@Since("1.4.1")
override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra)
}

@Since("1.6.0")
object RegexTokenizer extends DefaultParamsReadable[RegexTokenizer] {

  @Since("1.6.0")
  override def load(path: String): RegexTokenizer = super.load(path)
}
Example 5
Source File: ElementwiseProduct.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

// NOTE: the enclosing ElementwiseProduct class declaration is elided in this excerpt.
@Since("2.0.0")
def getScalingVec: Vector = getOrDefault(scalingVec)

override protected def createTransformFunc: Vector => Vector = {
  require(params.contains(scalingVec), s"transformation requires a weight vector")
  val elemScaler = new feature.ElementwiseProduct($(scalingVec))
  v => elemScaler.transform(v)
}

override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 6
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.sql.types.DataType

// NOTE: the enclosing Normalizer class declaration is elided in this excerpt.
@Since("1.4.0")
def setP(value: Double): this.type = set(p, value)

override protected def createTransformFunc: Vector => Vector = {
  val normalizer = new feature.Normalizer($(p))
  vector => normalizer.transform(OldVectors.fromML(vector)).asML
}

override protected def outputDataType: DataType = new VectorUDT()
}

@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
Example 7
Source File: NGram.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 8
Source File: InputFileName.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.rdd.InputFileNameHolder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, StringType}
import org.apache.spark.unsafe.types.UTF8String

@ExpressionDescription(
  usage = "_FUNC_() - Returns the name of the current file being read if available",
  extended = "> SELECT _FUNC_();\n ''")
case class InputFileName() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = true

  override def dataType: DataType = StringType

  override def prettyName: String = "input_file_name"

  override protected def initInternal(): Unit = {}

  override protected def evalInternal(input: InternalRow): UTF8String = {
    InputFileNameHolder.getInputFileName()
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code = s"final ${ctx.javaType(dataType)} ${ev.value} = " +
      "org.apache.spark.rdd.InputFileNameHolder.getInputFileName();", isNull = "false")
  }
}
Example 9
Source File: MonotonicallyIncreasingID.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, LongType}

// NOTE: the enclosing MonotonicallyIncreasingID class declaration is elided in this excerpt.
@transient private[this] var count: Long = _

@transient private[this] var partitionMask: Long = _

override protected def initInternal(): Unit = {
  count = 0L
  partitionMask = TaskContext.getPartitionId().toLong << 33
}

override def nullable: Boolean = false

override def dataType: DataType = LongType

override protected def evalInternal(input: InternalRow): Long = {
  val currentCount = count
  count += 1
  partitionMask + currentCount
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
  val countTerm = ctx.freshName("count")
  val partitionMaskTerm = ctx.freshName("partitionMask")
  ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;")
  ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm,
    s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;")
  ev.copy(code = s"""
    final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
    $countTerm++;""", isNull = "false")
}

override def prettyName: String = "monotonically_increasing_id"

override def sql: String = s"$prettyName()"
}
Example 10
Source File: randomExpressions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

@ExpressionDescription(
  usage = "_FUNC_(a) - Returns a random column with i.i.d. gaussian random distribution.")
case class Randn(seed: Long) extends RDG {

  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to randn must be an integer literal.")
  })

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""", isNull = "false")
  }
}
Example 11
Source File: ReferenceToExpressions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable
import org.apache.spark.sql.types.DataType

case class ReferenceToExpressions(result: Expression, children: Seq[Expression])
  extends Expression {

  override def nullable: Boolean = result.nullable

  override def dataType: DataType = result.dataType

  override def checkInputDataTypes(): TypeCheckResult = {
    if (result.references.nonEmpty) {
      return TypeCheckFailure("The result expression cannot reference to any attributes.")
    }

    var maxOrdinal = -1
    result foreach {
      case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal
      case _ =>
    }
    if (maxOrdinal > children.length) {
      return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " +
        s"there are only ${children.length} inputs.")
    }

    TypeCheckSuccess
  }

  private lazy val projection = UnsafeProjection.create(children)

  override def eval(input: InternalRow): Any = {
    result.eval(projection(input))
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val childrenGen = children.map(_.genCode(ctx))
    val childrenVars = childrenGen.zip(children).map {
      case (childGen, child) => LambdaVariable(childGen.value, childGen.isNull, child.dataType)
    }

    val resultGen = result.transform {
      case b: BoundReference => childrenVars(b.ordinal)
    }.genCode(ctx)

    ExprCode(code = childrenGen.map(_.code).mkString("\n") + "\n" + resultGen.code,
      isNull = resultGen.isNull, value = resultGen.value)
  }
}
Example 12
Source File: ResolveTableValuedFunctions.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.catalyst.expressions.Expression
import org.apache.spark.sql.catalyst.plans._
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}

// NOTE: this excerpt starts inside the builtinFunctions map of the
// ResolveTableValuedFunctions rule; the surrounding declarations are elided.
    tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
        "numPartitions" -> IntegerType) {
      case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
        Range(start, end, step, Some(numPartitions))
    })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      builtinFunctions.get(u.functionName) match {
        case Some(tvf) =>
          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                Some(resolver(casted.map(_.eval())))
              case _ =>
                None
            }
          }
          resolved.headOption.getOrElse {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                |cannot be applied to: (${argTypes})""".stripMargin)
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }
  }
}
Example 13
Source File: MapDataSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.util.ArrayBasedMapData
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class MapDataSuite extends SparkFunSuite {

  test("inequality tests") {
    def u(str: String): UTF8String = UTF8String.fromString(str)

    // test data
    val testMap1 = Map(u("key1") -> 1)
    val testMap2 = Map(u("key1") -> 1, u("key2") -> 2)
    val testMap3 = Map(u("key1") -> 1)
    val testMap4 = Map(u("key1") -> 1, u("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }
}
Example 14
Source File: ExpressionEvalHelperSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, IntegerType}

case class BadCodegenExpression() extends LeafExpression {
  override def nullable: Boolean = false
  override def eval(input: InternalRow): Any = 10
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code =
      s"""
        |int some_variable = 11;
        |int ${ev.value} = 10;
      """.stripMargin)
  }
  override def dataType: DataType = IntegerType
}
Example 15
Source File: MySQLDialect.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
Example 16
Source File: subquery.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{BooleanType, DataType, StructType}

case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of exchanges to avoid O(N*N) sameResult calls.
    val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]()
    plan transformAllExpressions {
      case sub: ExecSubqueryExpression =>
        val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]())
        val sameResult = sameSchema.find(_.sameResult(sub.plan))
        if (sameResult.isDefined) {
          sub.withNewPlan(sameResult.get)
        } else {
          sameSchema += sub.plan
          sub
        }
    }
  }
}
Example 17
Source File: ExistingRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {

  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
}

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsInternal { iter =>
      val proj = UnsafeProjection.create(schema)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
}
Example 18
Source File: Grok.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.s2jobs.udfs

import org.apache.s2graph.s2jobs.utils.GrokHelper
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, StructType}
import play.api.libs.json.{JsValue, Json}

class Grok extends Udf {
  import org.apache.spark.sql.functions.udf

  def register(ss: SparkSession, name: String, options: Map[String, String]) = {
    // grok
    val patternDir = options.getOrElse("patternDir", "/tmp")
    val patternFiles = options.getOrElse("patternFiles", "").split(",").toSeq
    val patterns = Json.parse(options.getOrElse("patterns", "{}")).asOpt[Map[String, String]].getOrElse(Map.empty)
    val compilePattern = options("compilePattern")
    val schemaOpt = options.get("schema")

    patternFiles.foreach { patternFile =>
      ss.sparkContext.addFile(s"${patternDir}/${patternFile}")
    }

    implicit val grok = GrokHelper.getGrok(name, patternFiles, patterns, compilePattern)

    val f = if (schemaOpt.isDefined) {
      val schema = DataType.fromJson(schemaOpt.get)
      implicit val keys: Array[String] = schema.asInstanceOf[StructType].fieldNames
      udf(GrokHelper.grokMatchWithSchema _, schema)
    } else {
      udf(GrokHelper.grokMatch _)
    }

    ss.udf.register(name, f)
  }
}
Example 19
Source File: Deserializer.scala From almaren-framework with Apache License 2.0 | 5 votes |
package com.github.music.of.the.ainur.almaren.state.core

import com.github.music.of.the.ainur.almaren.State
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{DataType, StructType}

import scala.language.implicitConversions

import com.github.music.of.the.ainur.almaren.Almaren
import com.github.music.of.the.ainur.almaren.util.Constants
import org.apache.spark.sql.Dataset

abstract class Deserializer() extends State {

  override def executor(df: DataFrame): DataFrame = deserializer(df)

  def deserializer(df: DataFrame): DataFrame

  implicit def string2Schema(schema: String): DataType =
    StructType.fromDDL(schema)
}

case class AvroDeserializer(columnName: String, schema: String) extends Deserializer {
  import org.apache.spark.sql.avro._
  import org.apache.spark.sql.functions._

  override def deserializer(df: DataFrame): DataFrame = {
    logger.info(s"columnName:{$columnName}, schema:{$schema}")
    df.withColumn(columnName, from_avro(col(columnName), schema))
      .select("*", columnName.concat(".*")).drop(columnName)
  }
}

case class JsonDeserializer(columnName: String, schema: Option[String]) extends Deserializer {
  import org.apache.spark.sql.functions._

  override def deserializer(df: DataFrame): DataFrame = {
    import df.sparkSession.implicits._
    logger.info(s"columnName:{$columnName}, schema:{$schema}")
    df.withColumn(columnName,
      from_json(col(columnName), schema.getOrElse(getSchemaDDL(df.selectExpr(columnName).as[(String)]))))
      .select("*", columnName.concat(".*"))
      .drop(columnName)
  }

  private def getSchemaDDL(df: Dataset[String]): String =
    Almaren.spark.getOrCreate().read.json(df.sample(Constants.sampleDeserializer)).schema.toDDL
}

case class XMLDeserializer(columnName: String) extends Deserializer {
  import com.databricks.spark.xml.XmlReader

  override def deserializer(df: DataFrame): DataFrame = {
    logger.info(s"columnName:{$columnName}")
    new XmlReader().xmlRdd(df.sparkSession, df.select(columnName).rdd.map(r => r(0).asInstanceOf[String])).toDF
  }
}
Example 20
Source File: TimestampCast.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode, CodeGenerator, JavaCode, Block }
import org.apache.spark.sql.catalyst.expressions.{ Expression, NullIntolerant, UnaryExpression }
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{ DataType, LongType, TimestampType }

case class TimestampToNanos(child: Expression) extends TimestampCast {
  val dataType: DataType = LongType
  protected def cast(childPrim: String): String =
    s"$childPrim * 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] * 1000L
}

case class NanosToTimestamp(child: Expression) extends TimestampCast {
  val dataType: DataType = TimestampType
  protected def cast(childPrim: String): String =
    s"$childPrim / 1000L"
  override protected def nullSafeEval(input: Any): Any =
    input.asInstanceOf[Long] / 1000L
}

object TimestampToNanos {
  // NOTE: the two members below use `cast`, `child` and `override`, so in the original
  // source they belong to the TimestampCast trait/abstract class that both case classes
  // extend; its declaration is elided in this excerpt.
  private[this] def castCode(ctx: CodegenContext, childPrim: String, childNull: String,
      resultPrim: String, resultNull: String, resultType: DataType): Block = {
    code"""
      boolean $resultNull = $childNull;
      ${CodeGenerator.javaType(resultType)} $resultPrim = ${CodeGenerator.defaultValue(resultType)};
      if (!${childNull}) {
        $resultPrim = (long) ${cast(childPrim)};
      }
    """
  }

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val eval = child.genCode(ctx)
    ev.copy(code = eval.code + castCode(ctx, eval.value, eval.isNull, ev.value, ev.isNull, dataType))
  }
}
Example 21
Source File: CatalystTypeConvertersWrapper.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.DataType

object CatalystTypeConvertersWrapper {

  def toCatalystRowConverter(dataType: DataType): Row => InternalRow = {
    CatalystTypeConverters.createToCatalystConverter(dataType)(_).asInstanceOf[InternalRow]
  }

  def toScalaRowConverter(dataType: DataType): InternalRow => GenericRowWithSchema = {
    CatalystTypeConverters.createToScalaConverter(dataType)(_).asInstanceOf[GenericRowWithSchema]
  }

  def toCatalystConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToCatalystConverter(dataType)

  def toScalaConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToScalaConverter(dataType)
}
Example 22
Source File: PredicateSummarizerFactory.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.timeseries.summarize.{ ColumnList, FilterNullInput, Summarizer, SummarizerFactory }
import org.apache.spark.sql.CatalystTypeConvertersWrapper
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{ DataType, StructType }

class PredicateSummarizerFactory(
  factory: SummarizerFactory,
  f: AnyRef,
  inputColumns: Seq[(String, DataType)]
) extends SummarizerFactory {

  override val requiredColumns: ColumnList =
    factory.requiredColumns ++ ColumnList.Sequence(inputColumns.map(_._1))

  def apply(inputSchema: StructType): Summarizer = {
    inputColumns.foreach {
      case (column, dataType) =>
        require(inputSchema.fieldNames.contains(column),
          s"Input schema $inputSchema doesn't contain $column column.")
        require(
          inputSchema(column).dataType == dataType,
          s"Input type: ${inputSchema(column).dataType} isn't equal to $dataType"
        )
    }

    val filterFunction = UDFConverter.udfToFilter(f, inputSchema, inputColumns.map(_._1))
    val innerSummarizer = factory(inputSchema)
    new PredicateSummarizer(inputSchema, prefixOpt, requiredColumns, innerSummarizer, filterFunction)
  }
}

class PredicateSummarizer(
  override val inputSchema: StructType,
  override val prefixOpt: Option[String],
  override val requiredColumns: ColumnList,
  val innerSummarizer: Summarizer,
  val predicate: InternalRow => Boolean
) extends Summarizer with FilterNullInput {

  override val schema: StructType = innerSummarizer.schema

  override val summarizer = innerSummarizer.summarizer

  override type T = innerSummarizer.T
  override type U = innerSummarizer.U
  override type V = innerSummarizer.V

  override def isValid(r: InternalRow): Boolean = super.isValid(r) && predicate(r)

  override def toT(r: InternalRow): T = innerSummarizer.toT(r)

  override def fromV(v: V): InternalRow = innerSummarizer.fromV(v)
}

private object UDFConverter {

  def udfToFilter(function: AnyRef, inputSchema: StructType, columns: Seq[String]): InternalRow => Boolean = {
    val fieldIndices = columns.map(inputSchema.fieldIndex)
    val columnPairs = fieldIndices.map(index => index -> inputSchema.fields(index).dataType)
    buildFilterFunction(function, columnPairs)
  }

  private def buildFilterFunction(function: AnyRef, columns: Seq[(Int, DataType)]): InternalRow => Boolean = {
    val extractors = columns.map {
      case (index, dataType) =>
        val converter = CatalystTypeConvertersWrapper.toScalaConverter(dataType)
        row: InternalRow => converter(row.get(index, dataType))
    }

    columns.size match {
      case 1 =>
        val func = function.asInstanceOf[(Any) => Boolean]
        (input: InternalRow) => { func(extractors(0)(input)) }
      case 2 =>
        val func = function.asInstanceOf[(Any, Any) => Boolean]
        (input: InternalRow) => { func(extractors(0)(input), extractors(1)(input)) }
      case 3 =>
        val func = function.asInstanceOf[(Any, Any, Any) => Boolean]
        (input: InternalRow) => { func(extractors(0)(input), extractors(1)(input), extractors(2)(input)) }
      case 4 =>
        val func = function.asInstanceOf[(Any, Any, Any, Any) => Boolean]
        (input: InternalRow) => {
          func(extractors(0)(input), extractors(1)(input), extractors(2)(input), extractors(3)(input))
        }
      case _ => throw new UnsupportedOperationException("Cannot build function with more than four arguments")
    }
  }
}
Example 23
Source File: ExtremeSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random
import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue
    )

    val data = priceTSRdd.collect().map { row => row.getAs[T](inputColumn) }
    val trueExtreme = data.reduceLeft[T] { case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")
  }

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")
  }

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.max("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.min("x2"))
  }

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")
    assertEquals(
      input.summarize(Summarizers.min("price")),
      inputWithNull.summarize(Summarizers.min("price"))
    )
  }
}
Example 24
Source File: UDFTransformer.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package org.apache.spark.ml

import java.io._

import org.apache.hadoop.fs.Path
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

class UDFTransformer[T, U](override val uid: String, f: T => U, inType: DataType, outType: DataType)
  extends UnaryTransformer[T, U, UDFTransformer[T, U]] with MLWritable {

  def this() = this("", null, null, null)

  override protected def createTransformFunc: T => U = f

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == inType)

  override protected def outputDataType: DataType = outType

  override def write: MLWriter = new UDFWriter(this)
}

object UDFTransformer extends MLReadable[UDFTransformer[_, _]] {
  override def read: MLReader[UDFTransformer[_, _]] = new UDFReader
}

class UDFReader extends MLReader[UDFTransformer[_, _]] {

  override def load(path: String): UDFTransformer[_, _] = {
    val metadata = DefaultParamsReader.loadMetadata(path, sc)
    val modelPath = new Path(path, "model").toString
    val model = sc.objectFile[UDFTransformer[_, _]](modelPath, 1).first()
    model
  }
}

class UDFWriter(instance: UDFTransformer[_, _]) extends MLWriter with Serializable {

  override protected def saveImpl(path: String): Unit = {
    DefaultParamsWriter.saveMetadata(instance, path, sc)
    val modelPath = new Path(path, "model").toString
    sc.parallelize(Seq(instance), 1).saveAsObjectFile(modelPath)
  }
}
Example 25
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.3"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
}
Example 26
Source File: SparkWrapper.scala From tispark with Apache License 2.0 | 5 votes |
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.4"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
}
Example 27
Source File: parser.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.extensions

import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation
import org.apache.spark.sql.catalyst.expressions.{Expression, SubqueryExpression}
import org.apache.spark.sql.catalyst.parser._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.execution.SparkSqlParser
import org.apache.spark.sql.execution.command.{
  CacheTableCommand,
  CreateViewCommand,
  ExplainCommand,
  UncacheTableCommand
}
import org.apache.spark.sql.types.{DataType, StructType}
import org.apache.spark.sql.{SparkSession, TiContext}

case class TiParser(getOrCreateTiContext: SparkSession => TiContext)(
    sparkSession: SparkSession,
    delegate: ParserInterface)
  extends ParserInterface {

  private lazy val tiContext = getOrCreateTiContext(sparkSession)
  private lazy val internal = new SparkSqlParser(sparkSession.sqlContext.conf)

  private def needQualify(tableIdentifier: TableIdentifier) =
    tableIdentifier.database.isEmpty && tiContext.sessionCatalog
      .getTempView(tableIdentifier.table)
      .isEmpty

  // NOTE: the ParserInterface method implementations are elided in this excerpt.
}
Example 28
Source File: inputFileBlock.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.rdd.InputFileBlockHolder
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String

@ExpressionDescription(
  usage = "_FUNC_() - Returns the name of the file being read, or empty string if not available.")
case class InputFileName() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = false

  override def dataType: DataType = StringType

  override def prettyName: String = "input_file_name"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): UTF8String = {
    InputFileBlockHolder.getInputFilePath
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getInputFilePath();", isNull = FalseLiteral)
  }
}

@ExpressionDescription(
  usage = "_FUNC_() - Returns the start offset of the block being read, or -1 if not available.")
case class InputFileBlockStart() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_start"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getStartOffset
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getStartOffset();", isNull = FalseLiteral)
  }
}

@ExpressionDescription(
  usage = "_FUNC_() - Returns the length of the block being read, or -1 if not available.")
case class InputFileBlockLength() extends LeafExpression with Nondeterministic {

  override def nullable: Boolean = false

  override def dataType: DataType = LongType

  override def prettyName: String = "input_file_block_length"

  override protected def initializeInternal(partitionIndex: Int): Unit = {}

  override protected def evalInternal(input: InternalRow): Long = {
    InputFileBlockHolder.getLength
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = InputFileBlockHolder.getClass.getName.stripSuffix("$")
    val typeDef = s"final ${CodeGenerator.javaType(dataType)}"
    ev.copy(code = code"$typeDef ${ev.value} = $className.getLength();", isNull = FalseLiteral)
  }
}
Example 29
Source File: MonotonicallyIncreasingID.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, LongType}

// NOTE: the enclosing MonotonicallyIncreasingID class declaration is elided in this excerpt.
@transient private[this] var count: Long = _

@transient private[this] var partitionMask: Long = _

override protected def initializeInternal(partitionIndex: Int): Unit = {
  count = 0L
  partitionMask = partitionIndex.toLong << 33
}

override def nullable: Boolean = false

override def dataType: DataType = LongType

override protected def evalInternal(input: InternalRow): Long = {
  val currentCount = count
  count += 1
  partitionMask + currentCount
}

override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
  val countTerm = ctx.addMutableState(CodeGenerator.JAVA_LONG, "count")
  val partitionMaskTerm = "partitionMask"
  ctx.addImmutableStateIfNotExists(CodeGenerator.JAVA_LONG, partitionMaskTerm)
  ctx.addPartitionInitializationStatement(s"$countTerm = 0L;")
  ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;")

  ev.copy(code = code"""
    final ${CodeGenerator.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm;
    $countTerm++;""", isNull = FalseLiteral)
}

override def prettyName: String = "monotonically_increasing_id"

override def sql: String = s"$prettyName()"

override def freshCopy(): MonotonicallyIncreasingID = MonotonicallyIncreasingID()
}
Example 30
Source File: constraintExpressions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode, FalseLiteral}
import org.apache.spark.sql.types.DataType

case class KnownNotNull(child: Expression) extends UnaryExpression {

  override def nullable: Boolean = false

  override def dataType: DataType = child.dataType

  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    child.genCode(ctx).copy(isNull = FalseLiteral)
  }

  override def eval(input: InternalRow): Any = {
    child.eval(input)
  }
}
Example 31
Source File: PythonUDF.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.api.python.{PythonEvalType, PythonFunction}
import org.apache.spark.sql.catalyst.util.toPrettySQL
import org.apache.spark.sql.types.DataType

case class PythonUDF(
    name: String,
    func: PythonFunction,
    dataType: DataType,
    children: Seq[Expression],
    evalType: Int,
    udfDeterministic: Boolean,
    resultId: ExprId = NamedExpression.newExprId)
  extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression {

  override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic)

  override def toString: String = s"$name(${children.mkString(", ")})"

  lazy val resultAttribute: Attribute = AttributeReference(toPrettySQL(this), dataType, nullable)(
    exprId = resultId)

  override def nullable: Boolean = true
}
Example 32
Source File: ResolveTableValuedFunctions.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.analysis

import java.util.Locale

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.expressions.{Alias, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Project, Range}
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.{DataType, IntegerType, LongType}

// NOTE: this excerpt starts inside the builtinFunctions map of the
// ResolveTableValuedFunctions rule; the surrounding declarations are elided.
    tvf("start" -> LongType, "end" -> LongType, "step" -> LongType,
        "numPartitions" -> IntegerType) {
      case Seq(start: Long, end: Long, step: Long, numPartitions: Int) =>
        Range(start, end, step, Some(numPartitions))
    })
  )

  override def apply(plan: LogicalPlan): LogicalPlan = plan resolveOperators {
    case u: UnresolvedTableValuedFunction if u.functionArgs.forall(_.resolved) =>
      // The whole resolution is somewhat difficult to understand here due to too much abstractions.
      // We should probably rewrite the following at some point. Reynold was just here to improve
      // error messages and didn't have time to do a proper rewrite.
      val resolvedFunc = builtinFunctions.get(u.functionName.toLowerCase(Locale.ROOT)) match {
        case Some(tvf) =>

          def failAnalysis(): Nothing = {
            val argTypes = u.functionArgs.map(_.dataType.typeName).mkString(", ")
            u.failAnalysis(
              s"""error: table-valued function ${u.functionName} with alternatives:
                |${tvf.keys.map(_.toString).toSeq.sorted.map(x => s" ($x)").mkString("\n")}
                |cannot be applied to: ($argTypes)""".stripMargin)
          }

          val resolved = tvf.flatMap { case (argList, resolver) =>
            argList.implicitCast(u.functionArgs) match {
              case Some(casted) =>
                try {
                  Some(resolver(casted.map(_.eval())))
                } catch {
                  case e: AnalysisException =>
                    failAnalysis()
                }
              case _ => None
            }
          }
          resolved.headOption.getOrElse {
            failAnalysis()
          }
        case _ =>
          u.failAnalysis(s"could not resolve `${u.functionName}` to a table-valued function")
      }

      // If alias names assigned, add `Project` with the aliases
      if (u.outputNames.nonEmpty) {
        val outputAttrs = resolvedFunc.output
        // Checks if the number of the aliases is equal to expected one
        if (u.outputNames.size != outputAttrs.size) {
          u.failAnalysis(s"Number of given aliases does not match number of output columns. " +
            s"Function name: ${u.functionName}; number of aliases: " +
            s"${u.outputNames.size}; number of output columns: ${outputAttrs.size}.")
        }
        val aliases = outputAttrs.zip(u.outputNames).map { case (attr, name) => Alias(attr, name)() }
        Project(aliases, resolvedFunc)
      } else {
        resolvedFunc
      }
  }
}
Example 33
Source File: ExpressionEvalHelperSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types.{DataType, IntegerType}

case class BadCodegenExpression() extends LeafExpression {
  override def nullable: Boolean = false
  override def eval(input: InternalRow): Any = 10
  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    ev.copy(code =
      code"""
        |int some_variable = 11;
        |int ${ev.value} = 10;
      """.stripMargin)
  }
  override def dataType: DataType = IntegerType
}
Example 34
Source File: PythonSQLUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.api.python

import java.io.InputStream
import java.nio.channels.Channels

import org.apache.spark.api.java.JavaRDD
import org.apache.spark.api.python.PythonRDDServer
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.catalyst.analysis.FunctionRegistry
import org.apache.spark.sql.catalyst.expressions.ExpressionInfo
import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.execution.arrow.ArrowConverters
import org.apache.spark.sql.types.DataType

private[sql] object PythonSQLUtils {

  def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText)

  // This is needed when generating SQL documentation for built-in functions.
  def listBuiltinFunctionInfos(): Array[ExpressionInfo] = {
    FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray
  }
}

private[sql] class ArrowRDDServer(sqlContext: SQLContext) extends PythonRDDServer {

  override protected def streamToRDD(input: InputStream): RDD[Array[Byte]] = {
    // Create array to consume iterator so that we can safely close the inputStream
    val batches = ArrowConverters.getBatchesFromStream(Channels.newChannel(input)).toArray
    // Parallelize the record batches to create an RDD
    JavaRDD.fromRDD(sqlContext.sparkContext.parallelize(batches, batches.length))
  }
}
Example 35
Source File: MySQLDialect.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
Example 36
Source File: UserDefinedFunction.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions

import org.apache.spark.annotation.InterfaceStability
import org.apache.spark.sql.Column
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.expressions.ScalaUDF
import org.apache.spark.sql.types.DataType

// NOTE: the enclosing UserDefinedFunction class declaration is elided in this excerpt.
def asNondeterministic(): UserDefinedFunction = {
  if (!_deterministic) {
    this
  } else {
    val udf = copyAll()
    udf._deterministic = false
    udf
  }
}
}

// We have to use a name different than `UserDefinedFunction` here, to avoid breaking the binary
// compatibility of the auto-generate UserDefinedFunction object.
private[sql] object SparkUserDefinedFunction {

  def create(
      f: AnyRef,
      dataType: DataType,
      inputSchemas: Seq[Option[ScalaReflection.Schema]]): UserDefinedFunction = {
    val inputTypes = if (inputSchemas.contains(None)) {
      None
    } else {
      Some(inputSchemas.map(_.get.dataType))
    }
    val udf = new UserDefinedFunction(f, dataType, inputTypes)
    udf.nullableTypes = Some(inputSchemas.map(_.map(_.nullable).getOrElse(true)))
    udf
  }
}
Example 37
Source File: EvalPythonExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python

import java.io.File

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.api.python.ChainedPythonFunctions
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.util.Utils

abstract class EvalPythonExec(udfs: Seq[PythonUDF], output: Seq[Attribute], child: SparkPlan)
  extends SparkPlan {

  def children: Seq[SparkPlan] = child :: Nil

  override def producedAttributes: AttributeSet = AttributeSet(output.drop(child.output.length))

  private def collectFunctions(udf: PythonUDF): (ChainedPythonFunctions, Seq[Expression]) = {
    udf.children match {
      case Seq(u: PythonUDF) =>
        val (chained, children) = collectFunctions(u)
        (ChainedPythonFunctions(chained.funcs ++ Seq(udf.func)), children)
      case children =>
        // There should not be any other UDFs, or the children can't be evaluated directly.
        assert(children.forall(_.find(_.isInstanceOf[PythonUDF]).isEmpty))
        (ChainedPythonFunctions(Seq(udf.func)), udf.children)
    }
  }

  protected def evaluate(
      funcs: Seq[ChainedPythonFunctions],
      argOffsets: Array[Array[Int]],
      iter: Iterator[InternalRow],
      schema: StructType,
      context: TaskContext): Iterator[InternalRow]

  protected override def doExecute(): RDD[InternalRow] = {
    val inputRDD = child.execute().map(_.copy())

    inputRDD.mapPartitions { iter =>
      val context = TaskContext.get()

      // The queue used to buffer input rows so we can drain it to
      // combine input with output from Python.
      val queue = HybridRowQueue(context.taskMemoryManager(),
        new File(Utils.getLocalDir(SparkEnv.get.conf)), child.output.length)
      context.addTaskCompletionListener[Unit] { ctx =>
        queue.close()
      }

      val (pyFuncs, inputs) = udfs.map(collectFunctions).unzip

      // flatten all the arguments
      val allInputs = new ArrayBuffer[Expression]
      val dataTypes = new ArrayBuffer[DataType]
      val argOffsets = inputs.map { input =>
        input.map { e =>
          if (allInputs.exists(_.semanticEquals(e))) {
            allInputs.indexWhere(_.semanticEquals(e))
          } else {
            allInputs += e
            dataTypes += e.dataType
            allInputs.length - 1
          }
        }.toArray
      }.toArray
      val projection = newMutableProjection(allInputs, child.output)
      val schema = StructType(dataTypes.zipWithIndex.map { case (dt, i) =>
        StructField(s"_$i", dt)
      })

      // Add rows to queue to join later with the result.
      val projectedRowIter = iter.map { inputRow =>
        queue.add(inputRow.asInstanceOf[UnsafeRow])
        projection(inputRow)
      }

      val outputRowIterator = evaluate(
        pyFuncs, argOffsets, projectedRowIter, schema, context)

      val joined = new JoinedRow
      val resultProj = UnsafeProjection.create(output, output)

      outputRowIterator.map { outputRow =>
        resultProj(joined(queue.remove(), outputRow))
      }
    }
  }
}
Example 38
Source File: subquery.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.{expressions, InternalRow}
import org.apache.spark.sql.catalyst.expressions.{Expression, ExprId, InSet, Literal, PlanExpression}
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{BooleanType, DataType, StructType}

case class ReuseSubquery(conf: SQLConf) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    if (!conf.exchangeReuseEnabled) {
      return plan
    }
    // Build a hash map using schema of subqueries to avoid O(N*N) sameResult calls.
    val subqueries = mutable.HashMap[StructType, ArrayBuffer[SubqueryExec]]()
    plan transformAllExpressions {
      case sub: ExecSubqueryExpression =>
        val sameSchema = subqueries.getOrElseUpdate(sub.plan.schema, ArrayBuffer[SubqueryExec]())
        val sameResult = sameSchema.find(_.sameResult(sub.plan))
        if (sameResult.isDefined) {
          sub.withNewPlan(sameResult.get)
        } else {
          sameSchema += sub.plan
          sub
        }
    }
  }
}
Example 39
Source File: ExistingRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Encoder, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericInternalRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } case class RDDScanExec( output: Seq[Attribute], rdd: RDD[InternalRow], name: String, override val outputPartitioning: Partitioning = UnknownPartitioning(0), override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode { private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("") override val nodeName: String = s"Scan $name$rddName" override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } override def simpleString: String = { s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}" } }
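A short usage sketch for RDDConversions.productToRowRdd, assuming an existing SparkContext named sc; the Person case class and its values are illustrative only and not part of the example above:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RDDConversions
import org.apache.spark.sql.types.{DataType, IntegerType, StringType}

case class Person(id: Int, name: String)

// one DataType per column of the Product, in declaration order
val outputTypes: Seq[DataType] = Seq(IntegerType, StringType)
val people = sc.parallelize(Seq(Person(1, "alice"), Person(2, "bob")))
val internalRows: RDD[InternalRow] = RDDConversions.productToRowRdd(people, outputTypes)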
Example 40
Source File: TestCompressibleColumnBuilder.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.types.{AtomicType, DataType} class TestCompressibleColumnBuilder[T <: AtomicType]( override val columnStats: ColumnStats, override val columnType: NativeColumnType[T], override val schemes: Seq[CompressionScheme]) extends NativeColumnBuilder(columnStats, columnType) with NullableColumnBuilder with CompressibleColumnBuilder[T] { override protected def isWorthCompressing(encoder: Encoder[T]) = true } object TestCompressibleColumnBuilder { def apply[T <: AtomicType]( columnStats: ColumnStats, columnType: NativeColumnType[T], scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = { val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme)) builder.initialize(0, "", useCompression = true) builder } } object ColumnBuilderHelper { def apply( dataType: DataType, batchSize: Int, name: String, useCompression: Boolean): ColumnBuilder = { ColumnBuilder(dataType, batchSize, name, useCompression) } }
Example 41
Source File: FieldExtractor.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.tablefunctions import org.apache.spark.sql.types.{DataType, Metadata} def apply(index: Int, tableName: String, originalTableName: String, name: String, originalName: String, dataType: DataType, metadata: Metadata, isNullable: Boolean, checkStar: Boolean): FieldExtractor = new FieldExtractor( index, tableName, originalTableName, name, originalName, DataTypeExtractor(dataType), AnnotationsExtractor(metadata, checkStar), isNullable) }
Example 42
Source File: SqlBuilderSuiteBase.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.sql import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.DataType import org.scalatest.FunSuite import scala.util.matching.Regex trait SqlBuilderSuiteBase { self: FunSuite => val sqlBuilder: SqlBuilder // scalastyle:ignore def testExpressionToSql(sql: String)(expr: Expression): Unit = { val cleanSql = cleanUpSql(sql) test(s"expressionToSql: $cleanSql | with $expr") { assertResult(cleanSql)(sqlBuilder.expressionToSql(expr)) } } def testBuildSelect(sql: String) (i1: SqlLikeRelation, i2: Seq[String], i3: Seq[Filter]): Unit = { val cleanSql = cleanUpSql(sql) test(s"buildSelect: $cleanSql | with $i1 $i2 $i3") { assertResult(cleanSql)(sqlBuilder.buildSelect(i1, i2, i3)) } } def testLogicalPlan(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan: $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.logicalPlanToSql(plan)) } } def testLogicalPlanInternal(sql: String)(plan: LogicalPlan): Unit = { val cleanSql = cleanUpSql(sql) test(s"logical plan (internal): $cleanSql | with $plan") { assertResult(cleanSql)(sqlBuilder.internalLogicalPlanToSql(plan, noProject = true)) } } def testUnsupportedLogicalPlan(plan: LogicalPlan): Unit = { test(s"invalid logical plan: $plan") { intercept[RuntimeException] { sqlBuilder.logicalPlanToSql(plan) } } } private def cleanUpSql(q: String): String = q.replaceAll("\\s+", " ").trim def testUnsupportedLogicalPlanInternal(plan: LogicalPlan): Unit = { test(s"invalid logical plan (internal): $plan") { intercept[RuntimeException] { sqlBuilder.internalLogicalPlanToSql(plan) } } } def testGeneratedSqlDataType(expected: String)(dataType: DataType): Unit = { test(s"The generated sql type for ${dataType.simpleString} is $expected") { val generated = sqlBuilder.typeToSql(dataType) assertResult(expected)(generated) } } }
Example 43
Source File: ExcelRelation.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import scala.collection.JavaConversions._ import org.apache.spark.sql.sources.{ BaseRelation, TableScan } import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.ArrayType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.types.StructField import org.apache.spark.sql.types.StructType import org.apache.spark.sql.SQLContext import org.apache.spark.sql._ import org.apache.spark.rdd.RDD import org.apache.hadoop.conf._ import org.apache.hadoop.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.dao._ import org.zuinnote.hadoop.office.format.mapreduce._ import org.zuinnote.spark.office.excel.util.ExcelFile override def buildScan: RDD[Row] = { // read ExcelRows val excelRowsRDD = ExcelFile.load(sqlContext, location, hadoopParams) // map to schema val schemaFields = schema.fields excelRowsRDD.flatMap(excelKeyValueTuple => { // map the Excel row data structure to a Spark SQL schema val rowArray = new Array[Any](excelKeyValueTuple._2.get.length) var i = 0; for (x <- excelKeyValueTuple._2.get) { // parse through the SpreadSheetCellDAO val spreadSheetCellDAOStructArray = new Array[String](schemaFields.length) val currentSpreadSheetCellDAO: Array[SpreadSheetCellDAO] = excelKeyValueTuple._2.get.asInstanceOf[Array[SpreadSheetCellDAO]] spreadSheetCellDAOStructArray(0) = currentSpreadSheetCellDAO(i).getFormattedValue spreadSheetCellDAOStructArray(1) = currentSpreadSheetCellDAO(i).getComment spreadSheetCellDAOStructArray(2) = currentSpreadSheetCellDAO(i).getFormula spreadSheetCellDAOStructArray(3) = currentSpreadSheetCellDAO(i).getAddress spreadSheetCellDAOStructArray(4) = currentSpreadSheetCellDAO(i).getSheetName // add row representing one Excel row rowArray(i) = spreadSheetCellDAOStructArray i += 1 } Some(Row.fromSeq(rowArray)) }) } }
Example 44
Source File: L8-4DataFrameCreationSchema.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.sql.SQLContext import org.apache.spark.sql.functions.desc import org.apache.spark.sql.types.DataType import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataframeCreationApp2 { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: CdrDataframeApp2 <appname> <batchInterval> <hostname> <port> <schemaPath>") System.exit(1) } val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) val schemaJson = scala.io.Source.fromFile(schemaFile).mkString val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType] val cdrStream = ssc.socketTextStream(hostname, port.toInt) .map(_.split("\\t", -1)) .foreachRDD(rdd => { val cdrs = sqlC.createDataFrame(rdd.map(c => Row(c: _*)), schema) cdrs.groupBy("countryCode").count().orderBy(desc("count")).show(5) }) ssc.start() ssc.awaitTermination() } }
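The schema file read above is expected to contain the JSON that StructType.json produces. A minimal sketch of that round trip, with illustrative field names:

import org.apache.spark.sql.types.{DataType, StructType}

// serialize a schema to JSON, then restore it with DataType.fromJson
val original = new StructType().add("squareId", "int").add("countryCode", "string")
val schemaJson = original.json
val restored = DataType.fromJson(schemaJson).asInstanceOf[StructType]
assert(restored == original)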
Example 45
Source File: L8-35DataFrameExamplesRDD.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import scala.reflect.runtime.universe

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.json4s.DefaultFormats

object CdrDataframeExamplesRDDApp {

  case class Cdr(squareId: Int, timeInterval: Long, countryCode: Int,
    smsInActivity: Float, smsOutActivity: Float, callInActivity: Float,
    callOutActivity: Float, internetTrafficActivity: Float)

  def main(args: Array[String]) {
    if (args.length != 5) {
      System.err.println(
        "Usage: CdrDataframeExamplesRDDApp <appname> <batchInterval> <hostname> <port> <schemaPath>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port, schemaFile) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))
    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._
    implicit val formats = DefaultFormats

    val schemaJson = scala.io.Source.fromFile(schemaFile).mkString
    val schema = DataType.fromJson(schemaJson).asInstanceOf[StructType]

    val cdrStream = ssc.socketTextStream(hostname, port.toInt)
      .map(_.split("\\t", -1))
      .foreachRDD(rdd => {
        val cdrs = seqToCdr(rdd).toDF()

        val highInternet = sqlC.createDataFrame(cdrs.rdd.filter(r => r.getFloat(3) + r.getFloat(4) >= r.getFloat(5) + r.getFloat(6)), schema)
        val highOther = cdrs.except(highInternet)
        val highInternetGrid = highInternet.select("squareId", "countryCode").dropDuplicates()
        val highOtherGrid = highOther.select("squareId", "countryCode").dropDuplicates()
        highOtherGrid.except(highInternetGrid).show()
        highInternetGrid.except(highOtherGrid).show()
      })

    ssc.start()
    ssc.awaitTermination()
  }

  def seqToCdr(rdd: RDD[Array[String]]): RDD[Cdr] = {
    rdd.map(c => c.map(f => f match {
      case x if x.isEmpty() => "0"
      case x => x
    })).map(c => Cdr(c(0).toInt, c(1).toLong, c(2).toInt, c(3).toFloat,
      c(4).toFloat, c(5).toFloat, c(6).toFloat, c(7).toFloat))
  }
}
Example 46
Source File: DataTypeMapping.scala From azure-kusto-spark with Apache License 2.0 | 5 votes |
package com.microsoft.kusto.spark.utils import org.apache.spark.sql.types.DataTypes._ import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, MapType, StructType} object DataTypeMapping { val kustoTypeToSparkTypeMap: Map[String, DataType] = Map( "string" -> StringType, "long" -> LongType, "datetime" -> TimestampType,// Kusto datetime is equivalent to TimestampType "timespan" -> StringType, "bool" -> BooleanType, "real" -> DoubleType, // Can be partitioned differently between precision and scale, total must be 34 to match .Net SqlDecimal "decimal" -> DataTypes.createDecimalType(20,14), "guid" -> StringType, "int" -> IntegerType, "dynamic" -> StringType ) val kustoJavaTypeToSparkTypeMap: Map[String, DataType] = Map( "string" -> StringType, "int64" -> LongType, "datetime" -> TimestampType, "timespan" -> StringType, "sbyte" -> BooleanType, "double" -> DoubleType, "sqldecimal" -> DataTypes.createDecimalType(20,14), "guid" -> StringType, "int32" -> IntegerType, "object" -> StringType ) val sparkTypeToKustoTypeMap: Map[DataType, String] = Map( StringType -> "string", BooleanType -> "bool", DateType -> "datetime", TimestampType -> "datetime", DataTypes.createDecimalType() -> "decimal", DoubleType -> "real", FloatType -> "real", ByteType -> "int", IntegerType -> "int", LongType -> "long", ShortType -> "int" ) def getSparkTypeToKustoTypeMap(fieldType: DataType): String ={ if(fieldType.isInstanceOf[DecimalType]) "decimal" else if (fieldType.isInstanceOf[ArrayType] || fieldType.isInstanceOf[StructType] || fieldType.isInstanceOf[MapType]) "dynamic" else DataTypeMapping.sparkTypeToKustoTypeMap.getOrElse(fieldType, "string") } }
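A short, illustrative sketch of how the Spark-to-Kusto mapping above behaves; the expected results follow from the code shown, assuming the DataTypeMapping object is on the classpath:

import com.microsoft.kusto.spark.utils.DataTypeMapping
import org.apache.spark.sql.types._

DataTypeMapping.getSparkTypeToKustoTypeMap(StringType)                      // "string"
DataTypeMapping.getSparkTypeToKustoTypeMap(TimestampType)                   // "datetime"
DataTypeMapping.getSparkTypeToKustoTypeMap(DecimalType(20, 14))             // "decimal"
DataTypeMapping.getSparkTypeToKustoTypeMap(ArrayType(IntegerType))          // "dynamic"
DataTypeMapping.getSparkTypeToKustoTypeMap(MapType(StringType, StringType)) // "dynamic"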
Example 47
Source File: SparkExtension.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.sql import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier} import org.apache.spark.sql.catalyst.expressions.Expression import org.apache.spark.sql.catalyst.parser.ParserInterface import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{SparkSession, SparkSessionExtensions} class SparkExtension extends (SparkSessionExtensions => Unit) { def apply(e: SparkSessionExtensions): Unit = { e.injectParser(SparkAtlasConnectorParser) } } case class SparkAtlasConnectorParser(spark: SparkSession, delegate: ParserInterface) extends ParserInterface { override def parsePlan(sqlText: String): LogicalPlan = { SQLQuery.set(sqlText) delegate.parsePlan(sqlText) } override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText) override def parseTableIdentifier(sqlText: String): TableIdentifier = delegate.parseTableIdentifier(sqlText) override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = delegate.parseFunctionIdentifier(sqlText) override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText) override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText) } object SQLQuery { private[this] val sqlQuery = new ThreadLocal[String] def get(): String = sqlQuery.get def set(s: String): Unit = sqlQuery.set(s) }
Example 48
Source File: DataFrameInfo.scala From tensorframes with Apache License 2.0 | 5 votes |
package org.tensorframes import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{DataType, StructType} class DataFrameInfo private (cs: Array[ColumnInformation]) extends Serializable { def cols: Seq[ColumnInformation] = cs def explain: String = { val els = cols.map { c => c.stf.map { i => s"${i.dataType.toString}${i.shape.toString}" } .getOrElse { "??" + DataFrameInfo.pprint(c.field.dataType) } } els.mkString("DataFrame[", ", ", "]") } def merged: StructType = { StructType(cs.map(_.merged)) } override def toString = explain } object DataFrameInfo { def pprint(s: DataType) = s.toString def apply(d: Seq[ColumnInformation]): DataFrameInfo = new DataFrameInfo(d.toArray) def get(df: DataFrame): DataFrameInfo = { new DataFrameInfo(df.schema.map(ColumnInformation.apply).toArray) } }
Example 49
Source File: MLUserDefinedType.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.sql.types.DataType import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.scalacheck.{Arbitrary, Gen} object MLUserDefinedType { def unapply(dataType: DataType): Option[Gen[Any]] = dataType match { case MatrixType => { val dense = for { rows <- Gen.choose(0, 20) cols <- Gen.choose(0, 20) values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double]) } yield new DenseMatrix(rows, cols, values) val sparse = dense.map(_.toSparse) Some(Gen.oneOf(dense, sparse)) } case VectorType => { val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense) val sparse = for { indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1)) values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double]) } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values)) Some(Gen.oneOf(dense, sparse)) } case _ => None } }
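A minimal sketch of using this extractor with ScalaCheck, assuming spark-testing-base and the Spark ML artifacts are on the classpath:

import com.holdenkarau.spark.testing.MLUserDefinedType
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

// ask the extractor for a ScalaCheck generator of arbitrary ml Vectors
VectorType match {
  case MLUserDefinedType(gen) => gen.sample.foreach(v => println(s"sample vector: $v"))
  case other                  => println(s"no generator for ${other.simpleString}")
}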
Example 50
Source File: ElementwiseProduct.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 51
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
Example 52
Source File: DCT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 53
Source File: NGram.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 54
Source File: MonotonicallyIncreasingID.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, "") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "") ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
Example 55
Source File: ReferenceToExpressions.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable import org.apache.spark.sql.types.DataType case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) extends Expression { override def nullable: Boolean = result.nullable override def dataType: DataType = result.dataType override def checkInputDataTypes(): TypeCheckResult = { if (result.references.nonEmpty) { return TypeCheckFailure("The result expression cannot reference to any attributes.") } var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + s"there are only ${children.length} inputs.") } TypeCheckSuccess } private lazy val projection = UnsafeProjection.create(children) override def eval(input: InternalRow): Any = { result.eval(projection(input)) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val childrenGen = children.map(_.genCode(ctx)) val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map { case (childGen, child) => // SPARK-18125: The children vars are local variables. If the result expression uses // splitExpression, those variables cannot be accessed so compilation fails. // To fix it, we use class variables to hold those local variables. val classChildVarName = ctx.freshName("classChildVar") val classChildVarIsNull = ctx.freshName("classChildVarIsNull") ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "") ctx.addMutableState("boolean", classChildVarIsNull, "") val classChildVar = LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType) val initCode = s"${classChildVar.value} = ${childGen.value};\n" + s"${classChildVar.isNull} = ${childGen.isNull};" (classChildVar, initCode) }.unzip val resultGen = result.transform { case b: BoundReference => classChildrenVars(b.ordinal) }.genCode(ctx) ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\n") + resultGen.code, isNull = resultGen.isNull, value = resultGen.value) } }
Example 56
Source File: MapDataSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import scala.collection._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.ArrayBasedMapData import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} import org.apache.spark.unsafe.types.UTF8String class MapDataSuite extends SparkFunSuite { test("inequality tests") { def u(str: String): UTF8String = UTF8String.fromString(str) // test data val testMap1 = Map(u("key1") -> 1) val testMap2 = Map(u("key1") -> 1, u("key2") -> 2) val testMap3 = Map(u("key1") -> 1) val testMap4 = Map(u("key1") -> 1, u("key2") -> 2) // ArrayBasedMapData val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) assert(testArrayMap1 !== testArrayMap3) assert(testArrayMap2 !== testArrayMap4) // UnsafeMapData val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) val row = new GenericInternalRow(1) def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { row.update(0, map) val unsafeRow = unsafeConverter.apply(row) unsafeRow.getMap(0).copy } assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) } }
Example 57
Source File: ExpressionEvalHelperSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, IntegerType} case class BadCodegenExpression() extends LeafExpression { override def nullable: Boolean = false override def eval(input: InternalRow): Any = 10 override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { ev.copy(code = s""" |int some_variable = 11; |int ${ev.value} = 10; """.stripMargin) } override def dataType: DataType = IntegerType }
Example 58
Source File: MySQLDialect.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } override def isCascadingTruncateTable(): Option[Boolean] = Some(false) }
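Custom dialects hook into the same mechanism. A minimal sketch of a hypothetical dialect registered via JdbcDialects, so the JDBC data source consults getCatalystType when mapping database column types to Spark DataTypes; the URL prefix and the JSONB type name are assumptions, not part of the example above:

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType}

object MyJsonDialect extends JdbcDialect {
  // hypothetical URL prefix for illustration
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  // map a hypothetical JSONB column type to Spark's StringType
  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (typeName.equalsIgnoreCase("JSONB")) Some(StringType) else None
  }
}

JdbcDialects.registerDialect(MyJsonDialect)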
Example 59
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier) } def createAlias( child: Expression, name: String, exprId: ExprId = NamedExpression.newExprId, qualifier: Option[String] = None, explicitMetadata: Option[Metadata] = None, namedExpr : Option[NamedExpression] = None ) : Alias = { Alias(child, name)(exprId, qualifier, explicitMetadata) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.head } }
Example 60
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.mv.plans.modular import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression} import org.apache.spark.sql.types.{DataType, Metadata} object ExpressionHelper { def createReference( name: String, dataType: DataType, nullable: Boolean, metadata: Metadata, exprId: ExprId, qualifier: Option[String], attrRef : NamedExpression = null): AttributeReference = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty AttributeReference(name, dataType, nullable, metadata)(exprId, qf) } def createAlias( child: Expression, name: String, exprId: ExprId, qualifier: Option[String]) : Alias = { val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty Alias(child, name)(exprId, qf, None) } def getTheLastQualifier(reference: AttributeReference): String = { reference.qualifier.reverse.head } }
Example 61
Source File: CarbonHiveMetastoreListener.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.hive import scala.collection.JavaConverters._ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.hive.metastore.MetaStorePreEventListener import org.apache.hadoop.hive.metastore.api.{FieldSchema, MetaException} import org.apache.hadoop.hive.metastore.events._ import org.apache.hadoop.hive.metastore.events.PreEventContext.PreEventType._ import org.apache.spark.sql.types.{DataType, StructField, StructType} class CarbonHiveMetastoreListener(conf: Configuration) extends MetaStorePreEventListener(conf) { override def onEvent(preEventContext: PreEventContext): Unit = { preEventContext.getEventType match { case CREATE_TABLE => val table = preEventContext.asInstanceOf[PreCreateTableEvent].getTable val tableProps = table.getParameters if (tableProps != null && (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource" || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) { val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts") if (numSchemaParts != null && !numSchemaParts.isEmpty) { val parts = (0 until numSchemaParts.toInt).map { index => val part = tableProps.get(s"spark.sql.sources.schema.part.${index}") if (part == null) { throw new MetaException(s"spark.sql.sources.schema.part.${index} is missing!") } part } // Stick all parts back to a single schema string. val schema = DataType.fromJson(parts.mkString).asInstanceOf[StructType] val hiveSchema = schema.map(toHiveColumn).asJava table.getSd.setCols(hiveSchema) table.getSd.setInputFormat("org.apache.carbondata.hive.MapredCarbonInputFormat") table.getSd.setOutputFormat("org.apache.carbondata.hive.MapredCarbonOutputFormat") val serdeInfo = table.getSd.getSerdeInfo serdeInfo.setSerializationLib("org.apache.carbondata.hive.CarbonHiveSerDe") val tablePath = serdeInfo.getParameters.get("tablePath") if (tablePath != null) { table.getSd.setLocation(tablePath) } } } case ALTER_TABLE => val table = preEventContext.asInstanceOf[PreAlterTableEvent].getNewTable val tableProps = table.getParameters if (tableProps != null && (tableProps.get("spark.sql.sources.provider") == "org.apache.spark.sql.CarbonSource" || tableProps.get("spark.sql.sources.provider").equalsIgnoreCase("carbondata"))) { val numSchemaParts = tableProps.get("spark.sql.sources.schema.numParts") if (numSchemaParts != null && !numSchemaParts.isEmpty) { val schemaParts = (0 until numSchemaParts.toInt).map { index => val schemaPart = tableProps.get(s"spark.sql.sources.schema.part.$index") if (schemaPart == null) { throw new MetaException(s"spark.sql.sources.schema.part.$index is missing!") } schemaPart } // Stick all schemaParts back to a single schema string. val schema = DataType.fromJson(schemaParts.mkString).asInstanceOf[StructType] val hiveSchema = schema.map(toHiveColumn).asJava table.getSd.setCols(hiveSchema) } } case _ => // do nothing } } private def toHiveColumn(c: StructField): FieldSchema = { val typeString = if (c.metadata.contains("HIVE_TYPE_STRING")) { c.metadata.getString("HIVE_TYPE_STRING") } else { c.dataType.catalogString } new FieldSchema(c.name, typeString, c.getComment().orNull) } }
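The listener above relies on Spark's convention of splitting a large schema's JSON across several spark.sql.sources.schema.part.N table properties and concatenating them back before calling DataType.fromJson. A small sketch of that split-and-reassemble round trip (the 4000-character chunk size is illustrative):

import org.apache.spark.sql.types.{DataType, StructType}

val schema = new StructType().add("id", "int").add("payload", "string")
val json = schema.json

// split into fixed-size parts, as Spark does for table properties,
// then stick them back together and parse
val parts: Seq[String] = json.grouped(4000).toSeq
val restored = DataType.fromJson(parts.mkString).asInstanceOf[StructType]
assert(restored == schema)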
Example 62
Source File: CarbonExpressions.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.TableIdentifier import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, ScalaUDF} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias} import org.apache.spark.sql.execution.command.DescribeTableCommand import org.apache.spark.sql.types.DataType object CarbonScalaUDF { def unapply(expression: Expression): Option[(ScalaUDF)] = { expression match { case a: ScalaUDF => Some(a) case _ => None } } } }
Example 63
Source File: DataLoader.scala From variantsdwh with Apache License 2.0 | 5 votes |
package pl.edu.pw.ii.zsibio.dwh.benchmark import com.typesafe.config.ConfigFactory import org.apache.kudu.spark.kudu.KuduContext import org.apache.spark.sql.{Row, SQLContext} import org.apache.spark.sql.hive.HiveContext import org.apache.spark.{SparkConf, SparkContext} import org.rogach.scallop.ScallopConf import org.apache.kudu.spark.kudu._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.types.{DataType, StructField, StructType} object DataLoader { class RunConf(args:Array[String]) extends ScallopConf(args){ val csvFile =opt[String]("csvFile",required = true, descr = "A CSV file to load" ) val tableName =opt[String]("tableName",required = true, descr = "A table to load" ) val storageType = opt[String]("storageType",required = true, descr = "Storage type parquet|orc|kudu|carbon") val dbName =opt[String]("dbName",required = true, descr = "Database name" ) verify() } def main(args: Array[String]): Unit = { val runConf = new RunConf(args) val scConf = new SparkConf() .setAppName("DataLoader") val sc = new SparkContext(scConf) val sqlContext = new HiveContext(sc) if(runConf.storageType().toLowerCase() == "orc" || runConf.storageType().toLowerCase() == "parquet") { val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .option("inferSchema", "true") // Automatically infer data types .load(runConf.csvFile()) .repartition(10) df.registerTempTable("temp_csv") sqlContext.sql( s""" |INSERT OVERWRITE TABLE ${runConf.dbName()}.${runConf.tableName()} |SELECT * FROM temp_csv """.stripMargin) } if(runConf.storageType().toLowerCase() == "kudu"){ val confFile = ConfigFactory.load() val kuduMaster = confFile.getString("kudu.master.server") val kuduContext = new KuduContext(kuduMaster) val dfTarget = sqlContext.read.options(Map("kudu.master" -> kuduMaster,"kudu.table" -> runConf.tableName())).kudu val df = sqlContext.read .format("com.databricks.spark.csv") .option("delimiter", "|") .option("nullValue","\\N") .schema(dfTarget.schema) .load(runConf.csvFile()) .repartition(10) kuduContext.upsertRows(df,runConf.tableName()) } } private def synSchemas(inSchema:StructType, outSchema:StructType) = { val size = inSchema.fields.length val structFields = (0 to size - 1).map{ i => StructField(outSchema.fields(i).name,inSchema.fields(i).dataType,outSchema.fields(i).nullable) } new StructType(structFields.toArray) } }
Example 64
Source File: JsonNestedExample.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.nested import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.types.{ArrayType, DataType, StructField, StructType} import scala.collection.mutable object JsonNestedExample { Logger.getLogger("org").setLevel(Level.OFF) Logger.getLogger("akka").setLevel(Level.OFF) def main(args: Array[String]): Unit = { val isLocal = args(0).equalsIgnoreCase("l") val jsonPath = args(1) val outputTableName = args(2) val sparkSession = if (isLocal) { SparkSession.builder .master("local") .appName("my-spark-app") .config("spark.some.config.option", "config-value") .config("spark.driver.host","127.0.0.1") .config("spark.sql.parquet.compression.codec", "gzip") .enableHiveSupport() .getOrCreate() } else { SparkSession.builder .appName("my-spark-app") .config("spark.some.config.option", "config-value") .enableHiveSupport() .getOrCreate() } println("---") val jsonDf = sparkSession.read.json(jsonPath) val localJsonDf = jsonDf.collect() println("--Df") jsonDf.foreach(row => { println("row:" + row) }) println("--local") localJsonDf.foreach(row => { println("row:" + row) }) jsonDf.createOrReplaceTempView("json_table") println("--Tree Schema") jsonDf.schema.printTreeString() println("--") jsonDf.write.saveAsTable(outputTableName) sparkSession.sqlContext.sql("select * from " + outputTableName).take(10).foreach(println) println("--") sparkSession.stop() } def populatedFlattedHashMap(row:Row, schema:StructType, fields:Array[StructField], flattedMap:mutable.HashMap[(String, DataType), mutable.MutableList[Any]], parentFieldName:String): Unit = { fields.foreach(field => { println("field:" + field.dataType) if (field.dataType.isInstanceOf[ArrayType]) { val elementType = field.dataType.asInstanceOf[ArrayType].elementType if (elementType.isInstanceOf[StructType]) { val childSchema = elementType.asInstanceOf[StructType] val childRow = Row.fromSeq(row.getAs[mutable.WrappedArray[Any]](field.name).toSeq) populatedFlattedHashMap(childRow, childSchema, childSchema.fields, flattedMap, parentFieldName + field.name + ".") } } else { val fieldList = flattedMap.getOrElseUpdate((parentFieldName + field.name, field.dataType), new mutable.MutableList[Any]) fieldList.+=:(row.getAs[Any](schema.fieldIndex(field.name))) } }) } }
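A minimal sketch of driving populatedFlattedHashMap, assuming jsonDf is the DataFrame read in main; it only illustrates the (column path, DataType) keys the helper accumulates for a flat document:

import org.apache.spark.sql.types.DataType
import scala.collection.mutable

val flattened = new mutable.HashMap[(String, DataType), mutable.MutableList[Any]]()
jsonDf.collect().foreach { row =>
  JsonNestedExample.populatedFlattedHashMap(row, jsonDf.schema, jsonDf.schema.fields, flattened, "")
}
flattened.keys.foreach { case (path, dt) => println(s"$path -> ${dt.simpleString}") }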
Example 65
Source File: AppendLoadConfiguration.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.config import com.adidas.analytics.algo.core.Algorithm.SafeWriteOperation import com.adidas.analytics.config.shared.{ConfigurationContext, LoadConfiguration, MetadataUpdateStrategy} import com.adidas.analytics.util.DataFormat.ParquetFormat import com.adidas.analytics.util.{LoadMode, OutputWriter} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.util.DropMalformedMode import org.apache.spark.sql.types.{DataType, StructType} import scala.util.parsing.json.JSONObject trait AppendLoadConfiguration extends ConfigurationContext with LoadConfiguration with SafeWriteOperation with MetadataUpdateStrategy { protected def spark: SparkSession private val regexFilename: Seq[String] = configReader.getAsSeq[String]("regex_filename") protected val headerDir: String = configReader.getAs[String]("header_dir") protected val targetTable: Option[String] = configReader.getAsOption[String]("target_table") // This option is used to specify whether the input data schema must be the same as target schema specified in the configuration file // Note: if it is set to True, it will cause input data to be read more than once private val verifySchemaOption: Option[Boolean] = configReader.getAsOption[Boolean]("verify_schema") protected val verifySchema: Boolean = dataType match { case SEMISTRUCTURED => verifySchemaOption.getOrElse(true) case _ => false } protected val columnToRegexPairs: Seq[(String, String)] = targetPartitions zip regexFilename private val jsonSchemaOption: Option[JSONObject] = configReader.getAsOption[JSONObject]("schema") protected val targetSchema: StructType = getTargetSchema private val targetDir: Option[String] = configReader.getAsOption[String]("target_dir") override protected val writer: OutputWriter.AtomicWriter = dataType match { case STRUCTURED if targetTable.isDefined => OutputWriter.newTableLocationWriter( table = targetTable.get, format = ParquetFormat(Some(targetSchema)), targetPartitions = targetPartitions, loadMode = LoadMode.OverwritePartitionsWithAddedColumns, metadataConfiguration = getMetaDataUpdateStrategy(targetTable.get,targetPartitions) ) case SEMISTRUCTURED if targetDir.isDefined => OutputWriter.newFileSystemWriter( location = targetDir.get, format = ParquetFormat(Some(targetSchema)), targetPartitions = targetPartitions, loadMode = LoadMode.OverwritePartitions ) case anotherDataType => throw new RuntimeException(s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed.") } private def getTargetSchemaFromHiveTable: StructType = { targetTable match { case Some(tableName) => spark.table(tableName).schema case None => throw new RuntimeException("No schema definition found.") } } private def getTargetSchema: StructType = { dataType match { case STRUCTURED => getTargetSchemaFromHiveTable case SEMISTRUCTURED if jsonSchemaOption.isDefined => DataType.fromJson(jsonSchemaOption.get.toString()).asInstanceOf[StructType] case anotherDataType => throw new RuntimeException(s"Unsupported data type: $anotherDataType in AppendLoad or the configuration file is malformed.") } } override def loadMode: String = readerModeSetter(DropMalformedMode.name) }
Example 66
Source File: RecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 67
Source File: SparkRecoverPartitionsNativeIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsNativeIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated with native spark.recoverPartitions()") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 68
Source File: SparkRecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class SparkRecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 69
Source File: RecoverPartitionsCustomIntegrationTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.integration import com.adidas.utils.TestUtils._ import com.adidas.analytics.algo.AppendLoad import com.adidas.utils.FileReader import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.apache.spark.sql.{Dataset, Encoders} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ import scala.collection.JavaConverters._ class RecoverPartitionsCustomIntegrationTest extends FeatureSpec with BaseIntegrationTest { feature("Partitions can be updated programmatically using custom logic") { scenario("Using Append Load Algorithm with multiple source files") { val testResourceDir = "multiple_source_files" val headerPath20180101 = new Path(headerDirPath, "year=2018/month=1/day=1/header.json") val targetPath20180101 = new Path(targetDirPath, "year=2018/month=1/day=1") val targetSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/target_schema.json")).asInstanceOf[StructType] val expectedPartitionsSchema = DataType.fromJson(getResourceAsText(s"$testResourceDir/expected_partitions_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val expectedPartitionsDataReader = FileReader.newDSVFileReader(Some(expectedPartitionsSchema)) val targetTable = createTargetTable(testResourceDir, Seq("year", "month", "day"), targetSchema) setupInitialState(targetTable, s"$testResourceDir/lake_data_pre.psv", dataReader) prepareSourceData(testResourceDir, Seq("data_20180101-part-00000.psv", "data_20180101-part-00001.psv")) uploadParameters(testResourceDir) // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 7 targetTable.read().count() shouldBe 19 fs.exists(targetPath20180101) shouldBe false fs.exists(headerPath20180101) shouldBe false // executing load AppendLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource(s"$testResourceDir/lake_data_post.psv", withProtocol = true) val expectedPartitionsLocation = resolveResource(s"$testResourceDir/expected_partitions.txt", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() val producedPartitionsNumber: Dataset[String] = spark .sql(s"SHOW PARTITIONS ${targetDatabase}.${tableName}") .as(Encoders.STRING) // MetaData Specific Tests val expectedPartitions: Dataset[String] = expectedPartitionsDataReader .read(spark, expectedPartitionsLocation) .as(Encoders.STRING) expectedPartitions.collectAsList().asScala.sorted.toSet should equal(producedPartitionsNumber.collectAsList().asScala.sorted.toSet) actualDf.hasDiff(expectedDf) shouldBe false spark .sql(s"DESCRIBE extended ${targetDatabase}.${tableName} PARTITION(year=2018,month=1,day=1)") .filter("col_name == 'Partition Statistics'") .head() .getAs[String]("data_type").contains("6 rows") shouldBe true fs.exists(targetPath20180101) shouldBe true fs.exists(headerPath20180101) shouldBe true } } }
Example 70
Source File: AlgorithmTemplateTest.scala From m3d-engine with Apache License 2.0 | 5 votes |
package com.adidas.analytics.feature import com.adidas.analytics.algo.FullLoad import com.adidas.analytics.util.{DFSWrapper, HiveTableAttributeReader, LoadMode} import com.adidas.utils.TestUtils._ import com.adidas.utils.{BaseAlgorithmTest, FileReader, Table} import org.apache.hadoop.fs.Path import org.apache.spark.sql.types.{DataType, StructType} import org.scalatest.FeatureSpec import org.scalatest.Matchers._ class AlgorithmTemplateTest extends FeatureSpec with BaseAlgorithmTest { private val sourceEnvironmentLocation: String = "test_landing" private val targetDatabase: String = "test_lake" private val tableName: String = "test_table" private val paramsFileName: String = "algorithm_template_params.json" private val paramsFileHdfsPath: Path = new Path(hdfsRootTestPath, paramsFileName) private val sourceDirPath: Path = new Path(hdfsRootTestPath, s"$sourceEnvironmentLocation/test/$tableName/data") private val targetDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data") private val backupDirPath: Path = new Path(hdfsRootTestPath, s"$targetDatabase/test/$tableName/data_backup") feature("Algorithm template successfully loads files to lake") { scenario("when table is not partitioned, load is successful") { copyResourceFileToHdfs(s"$paramsFileName", paramsFileHdfsPath) val targetSchema = DataType.fromJson(getResourceAsText("target_schema.json")).asInstanceOf[StructType] val dataReader = FileReader.newDSVFileReader(Some(targetSchema)) val targetTable = createNonPartitionedTargetTable(targetSchema) setupInitialState(targetTable, "lake_data_pre.psv", dataReader) prepareDefaultSourceData() // checking pre-conditions spark.read.csv(sourceDirPath.toString).count() shouldBe 25 targetTable.read().count() shouldBe 19 FullLoad(spark, dfs, paramsFileHdfsPath.toString).run() // validating result val expectedDataLocation = resolveResource("lake_data_post.psv", withProtocol = true) val expectedDf = dataReader.read(spark, expectedDataLocation) val actualDf = targetTable.read() actualDf.hasDiff(expectedDf) shouldBe false // check the resulting table location is /data folder val tableLocation = HiveTableAttributeReader(targetTable.table, spark).getTableLocation tableLocation shouldBe fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString //check backUp dir is empty fs.listStatus(backupDirPath).length shouldBe 0 } } private def createNonPartitionedTargetTable(targetSchema: StructType): Table = { val targetTableLocation = fs.makeQualified(new Path(hdfsRootTestPath, targetDirPath)).toString Table.newBuilder(tableName, targetDatabase, targetTableLocation, targetSchema) .buildParquetTable(DFSWrapper(fs.getConf), spark, external = true) } private def setupInitialState(targetTable: Table, localDataFile: String, dataReader: FileReader): Unit = { val initialDataLocation = resolveResource(localDataFile, withProtocol = true) targetTable.write(Seq(initialDataLocation), dataReader, LoadMode.OverwritePartitionsWithAddedColumns) } private def prepareDefaultSourceData(): Unit = { Seq("new_data.psv").foreach { file => logger.info(s"copyResourceFileToHdfs $file to ${sourceDirPath.toString}") copyResourceFileToHdfs(s"$file", sourceDirPath) } } override def beforeEach(): Unit = { super.beforeEach() spark.sql(s"DROP DATABASE IF EXISTS $targetDatabase CASCADE") spark.sql(s"CREATE DATABASE $targetDatabase") logger.info(s"Creating ${sourceDirPath.toString}") fs.mkdirs(sourceDirPath) logger.info(s"Creating ${targetDirPath.toString}") fs.mkdirs(targetDirPath) } }
Example 71
Source File: ColumnSchema.scala From spark-dynamodb with Apache License 2.0 | 5 votes |
package com.audienceproject.spark.dynamodb.connector import org.apache.spark.sql.types.{DataType, StructType} private[dynamodb] class ColumnSchema(keySchema: KeySchema, sparkSchema: StructType) { type Attr = (String, Int, DataType) private val columnNames = sparkSchema.map(_.name) private val keyIndices = keySchema match { case KeySchema(hashKey, None) => val hashKeyIndex = columnNames.indexOf(hashKey) val hashKeyType = sparkSchema(hashKey).dataType Left(hashKey, hashKeyIndex, hashKeyType) case KeySchema(hashKey, Some(rangeKey)) => val hashKeyIndex = columnNames.indexOf(hashKey) val rangeKeyIndex = columnNames.indexOf(rangeKey) val hashKeyType = sparkSchema(hashKey).dataType val rangeKeyType = sparkSchema(rangeKey).dataType Right((hashKey, hashKeyIndex, hashKeyType), (rangeKey, rangeKeyIndex, rangeKeyType)) } private val attributeIndices = columnNames.zipWithIndex.filterNot({ case (name, _) => keySchema match { case KeySchema(hashKey, None) => name == hashKey case KeySchema(hashKey, Some(rangeKey)) => name == hashKey || name == rangeKey } }).map({ case (name, index) => (name, index, sparkSchema(name).dataType) }) def keys(): Either[Attr, (Attr, Attr)] = keyIndices def attributes(): Seq[Attr] = attributeIndices }
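A possible way to consume the class above, sketched under two assumptions: KeySchema is the (hashKey, Option[rangeKey]) case class implied by the pattern matches, and the caller lives inside the connector package, since ColumnSchema is package-private. All names are illustrative.

package com.audienceproject.spark.dynamodb.connector

import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

object ColumnSchemaSketch {
  def main(args: Array[String]): Unit = {
    val sparkSchema = StructType(Seq(
      StructField("userId", StringType, nullable = false),
      StructField("eventTime", LongType, nullable = false),
      StructField("payload", StringType, nullable = true)))

    // Assumes KeySchema(hashKey: String, rangeKey: Option[String]), as the matches above suggest.
    val columns = new ColumnSchema(KeySchema("userId", Some("eventTime")), sparkSchema)

    columns.keys() match {
      case Left((name, index, dataType)) =>
        println(s"hash key only: $name at $index ($dataType)")
      case Right(((hash, hi, ht), (range, ri, rt))) =>
        println(s"hash key $hash at $hi ($ht), range key $range at $ri ($rt)")
    }
    columns.attributes().foreach { case (name, index, dataType) =>
      println(s"attribute $name at $index ($dataType)")
    }
  }
}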
Example 72
Source File: functions.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions.{array, col, explode, udf} import org.apache.spark.sql.types.DataType import scala.reflect.runtime.universe._ object functions { implicit class FilterAnnotations(dataset: DataFrame) { def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.filter(func(col(column)).as(column, meta)) } } def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf ( { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) }, outputType) def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } implicit class MapAnnotations(dataset: DataFrame) { def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = { val meta = dataset.schema(column).metadata val func = udf { annotatorProperties: Seq[Row] => function(annotatorProperties.map(Annotation(_))) } dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta)) } } implicit class EachAnnotations(dataset: DataFrame) { import dataset.sparkSession.implicits._ def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = { dataset.select(column).as[Array[Annotation]].foreach(function(_)) } } implicit class ExplodeAnnotations(dataset: DataFrame) { def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = { val meta = dataset.schema(column).metadata dataset. withColumn(outputCol, explode(col(column))). withColumn(outputCol, array(col(outputCol)).as(outputCol, meta)) } } }
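As a usage sketch: mapAnnotations returns a plain Spark UDF whose return type must be supplied explicitly as a DataType. The helper below, with illustrative names, pulls the result strings out of an annotation column and assumes a DataFrame that already carries Spark NLP annotations.

import com.johnsnowlabs.nlp.{Annotation, functions}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{ArrayType, StringType}

object AnnotationResultExtractor {
  // Extracts Annotation.result values from `annotationCol` into a new array<string> column.
  def extractResults(annotated: DataFrame, annotationCol: String, outputCol: String): DataFrame = {
    val resultsUdf = functions.mapAnnotations(
      (annotations: Seq[Annotation]) => annotations.map(_.result),
      ArrayType(StringType))
    annotated.withColumn(outputCol, resultsUdf(col(annotationCol)))
  }
}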
Example 73
Source File: CheckDeltaInvariant.scala From delta with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.delta.schema import org.apache.spark.sql.delta.schema.Invariants.{ArbitraryExpression, NotNull} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{Block, CodegenContext, ExprCode, JavaCode, TrueLiteral} import org.apache.spark.sql.catalyst.expressions.codegen.Block._ import org.apache.spark.sql.types.{DataType, NullType} case class CheckDeltaInvariant( child: Expression, invariant: Invariant) extends UnaryExpression with NonSQLExpression { override def dataType: DataType = NullType override def foldable: Boolean = false override def nullable: Boolean = true override def flatArguments: Iterator[Any] = Iterator(child) private def assertRule(input: InternalRow): Unit = invariant.rule match { case NotNull if child.eval(input) == null => throw InvariantViolationException(invariant, "") case ArbitraryExpression(expr) => val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val result = resolvedExpr.eval(input) if (result == null || result == false) { throw InvariantViolationException( invariant, s"Value ${child.eval(input)} violates requirement.") } } override def eval(input: InternalRow): Any = { assertRule(input) null } private def generateNotNullCode(ctx: CodegenContext): Block = { val childGen = child.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) code"""${childGen.code} | |if (${childGen.isNull}) { | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, ""); |} """.stripMargin } private def generateExpressionValidationCode(expr: Expression, ctx: CodegenContext): Block = { val resolvedExpr = expr.transform { case _: UnresolvedAttribute => child } val elementValue = child.genCode(ctx) val childGen = resolvedExpr.genCode(ctx) val invariantField = ctx.addReferenceObj("errMsg", invariant) val eValue = ctx.freshName("elementResult") code"""${elementValue.code} |${childGen.code} | |if (${childGen.isNull} || ${childGen.value} == false) { | Object $eValue = "null"; | if (!${elementValue.isNull}) { | $eValue = (Object) ${elementValue.value}; | } | throw org.apache.spark.sql.delta.schema.InvariantViolationException.apply( | $invariantField, "Value " + $eValue + " violates requirement."); |} """.stripMargin } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val code = invariant.rule match { case NotNull => generateNotNullCode(ctx) case ArbitraryExpression(expr) => generateExpressionValidationCode(expr, ctx) } ev.copy(code = code, isNull = TrueLiteral, value = JavaCode.literal("null", NullType)) } }
Example 74
Source File: DruidOperatorSchema.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.sources.druid import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, Expression, NamedExpression} import org.apache.spark.sql.types.DataType import org.sparklinedata.druid.{DruidOperatorAttribute, DruidQueryBuilder} lazy val pushedDownExprToDruidAttr : Map[Expression, DruidOperatorAttribute] = buildPushDownDruidAttrsMap private def pushDownExpressionMap : Map[String, (Expression, DataType, DataType, String)] = dqb.outputAttributeMap.filter(t => t._2._1 != null) private def buildPushDownDruidAttrsMap : Map[Expression, DruidOperatorAttribute] = (pushDownExpressionMap map { case (nm, (e, oDT, dDT, tf)) => { (e -> druidAttrMap(nm)) } }) private def buildDruidOpAttr : Map[String, DruidOperatorAttribute] = (dqb.outputAttributeMap map { case (nm, (e, oDT, dDT, tf)) => { val druidEid = e match { case null => NamedExpression.newExprId case n: NamedExpression => n.exprId case _ => NamedExpression.newExprId } (nm -> DruidOperatorAttribute(druidEid, nm, dDT, tf)) } } ) }
Example 75
Source File: DruidRelation.scala From spark-druid-olap with Apache License 2.0 | 5 votes |
package org.sparklinedata.druid import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference, ExprId} import org.apache.spark.sql.sources.{BaseRelation, TableScan} import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Row, SQLContext} import org.joda.time.Interval import org.sparklinedata.druid.metadata.DruidRelationInfo case class DruidOperatorAttribute(exprId : ExprId, name : String, dataType : DataType, tf: String = null) override val needConversion: Boolean = false override def schema: StructType = dQuery.map(_.schema(info)).getOrElse(info.sourceDF(sqlContext).schema) def buildInternalScan : RDD[InternalRow] = dQuery.map(new DruidRDD(sqlContext, info, _)).getOrElse( info.sourceDF(sqlContext).queryExecution.toRdd ) override def buildScan(): RDD[Row] = buildInternalScan.asInstanceOf[RDD[Row]] override def toString : String = { if (dQuery.isDefined) { s"DruidQuery(${System.identityHashCode(dQuery)}): ${Utils.queryToString(dQuery.get)}" } else { info.toString } } }
Example 76
Source File: UDFTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable} import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.serialize.ComplexParam import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.col object UDFTransformer extends ComplexParamsReadable[UDFTransformer] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) if (isSet(inputCol)) { dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol))) } else { dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*)) } } def validateAndTransformSchema(schema: StructType): StructType = { if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*)) schema.add(StructField(getOutputCol, getDataType)) } def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema) def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra) }
Example 77
Source File: SparkSessionExt.scala From spark-fast-tests with MIT License | 5 votes |
package com.github.mrpowers.spark.fast.tests import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.sql.types.{DataType, StructField, StructType} object SparkSessionExt { implicit class SparkSessionMethods(spark: SparkSession) { private def asRows[U](values: List[U]): List[Row] = { values.map { case x: Row => x.asInstanceOf[Row] case y: Product => Row(y.productIterator.toList: _*) case a => Row(a) } } private def asSchema[U](fields: List[U]): List[StructField] = { fields.map { case x: StructField => x.asInstanceOf[StructField] case (name: String, dataType: DataType, nullable: Boolean) => StructField(name, dataType, nullable) } } def createDF[U, T](rowData: List[U], fields: List[T]): DataFrame = { spark.createDataFrame( spark.sparkContext.parallelize(asRows(rowData)), StructType(asSchema(fields)) ) } } }
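For illustration, a typical call to createDF, with rows as plain tuples and fields as (name, DataType, nullable) triples; assumes a local SparkSession.

import com.github.mrpowers.spark.fast.tests.SparkSessionExt._
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{IntegerType, StringType}

object CreateDFSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("createDF-sketch").getOrCreate()

    // Rows are converted by asRows above; fields are (name, DataType, nullable) triples handled by asSchema.
    val df = spark.createDF(
      List(
        (1, "alice"),
        (2, "bob")),
      List(
        ("id", IntegerType, false),
        ("name", StringType, true)))

    df.show()
    spark.stop()
  }
}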
Example 78
Source File: TemporalUdafs.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.temporal import org.apache.logging.log4j.scala.Logging import org.apache.spark.sql.Row import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction} import org.apache.spark.sql.types.{CalendarIntervalType, DataType, LongType, StructField, StructType} import org.apache.spark.unsafe.types.CalendarInterval import org.opencypher.okapi.impl.temporal.TemporalConstants import org.opencypher.morpheus.impl.temporal.TemporalConversions._ object TemporalUdafs extends Logging { abstract class SimpleDurationAggregation(aggrName: String) extends UserDefinedAggregateFunction { override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType))) override def bufferSchema: StructType = StructType(Array(StructField(aggrName, CalendarIntervalType))) override def dataType: DataType = CalendarIntervalType override def deterministic: Boolean = true override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = new CalendarInterval(0, 0L) } override def evaluate(buffer: Row): Any = buffer.getAs[CalendarInterval](0) } class DurationSum extends SimpleDurationAggregation("sum") { override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0)) } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0)) } } class DurationMax extends SimpleDurationAggregation("max") { override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { val currMaxInterval = buffer.getAs[CalendarInterval](0) val inputInterval = input.getAs[CalendarInterval](0) buffer(0) = if (currMaxInterval.toDuration.compare(inputInterval.toDuration) >= 0) currMaxInterval else inputInterval } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { val interval1 = buffer1.getAs[CalendarInterval](0) val interval2 = buffer2.getAs[CalendarInterval](0) buffer1(0) = if (interval1.toDuration.compare(interval2.toDuration) >= 0) interval1 else interval2 } } class DurationMin extends SimpleDurationAggregation("min") { override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = new CalendarInterval(Integer.MAX_VALUE, Long.MaxValue) } override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { val currMinInterval = buffer.getAs[CalendarInterval](0) val inputInterval = input.getAs[CalendarInterval](0) buffer(0) = if (inputInterval.toDuration.compare(currMinInterval.toDuration) >= 0) currMinInterval else inputInterval } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { val interval1 = buffer1.getAs[CalendarInterval](0) val interval2 = buffer2.getAs[CalendarInterval](0) buffer1(0) = if (interval2.toDuration.compare(interval1.toDuration) >= 0) interval1 else interval2 } } class DurationAvg extends UserDefinedAggregateFunction { override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType))) override def bufferSchema: StructType = StructType(Array(StructField("sum", CalendarIntervalType), StructField("cnt", LongType))) override def dataType: DataType = CalendarIntervalType override def deterministic: Boolean = true override def initialize(buffer: MutableAggregationBuffer): Unit = { buffer(0) = new CalendarInterval(0, 0L) buffer(1) = 0L } override def update(buffer: MutableAggregationBuffer, input: Row): Unit = { 
buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0)) buffer(1) = buffer.getLong(1) + 1 } override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0)) buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1) } override def evaluate(buffer: Row): Any = { val sumInterval = buffer.getAs[CalendarInterval](0) val cnt = buffer.getLong(1) new CalendarInterval((sumInterval.months / cnt).toInt, sumInterval.microseconds / cnt) } } val durationSum = new DurationSum() val durationAvg = new DurationAvg() val durationMin = new DurationMin() val durationMax = new DurationMax() }
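A usage sketch for the aggregators above: since they are ordinary UserDefinedAggregateFunction instances, they can be registered by name and invoked from SQL over a CalendarIntervalType column. The column, view, and function names here are illustrative.

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.opencypher.morpheus.impl.temporal.TemporalUdafs

object DurationAggregationSketch {
  // Sums a CalendarIntervalType column named "duration" with the DurationSum UDAF defined above.
  def sumDurations(spark: SparkSession, events: DataFrame): DataFrame = {
    spark.udf.register("duration_sum", TemporalUdafs.durationSum)
    events.createOrReplaceTempView("events")
    spark.sql("SELECT duration_sum(duration) AS total_duration FROM events")
  }
}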
Example 79
Source File: EncodeLong.scala From morpheus with Apache License 2.0 | 5 votes |
package org.opencypher.morpheus.impl.expressions import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, NullIntolerant, UnaryExpression} import org.apache.spark.sql.types.{BinaryType, DataType, LongType} import org.opencypher.morpheus.api.value.MorpheusElement._ case class EncodeLong(child: Expression) extends UnaryExpression with NullIntolerant with ExpectsInputTypes { override val dataType: DataType = BinaryType override val inputTypes: Seq[LongType] = Seq(LongType) override protected def nullSafeEval(input: Any): Any = EncodeLong.encodeLong(input.asInstanceOf[Long]) override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = defineCodeGen(ctx, ev, c => s"(byte[])(${EncodeLong.getClass.getName.dropRight(1)}.encodeLong($c))") } object EncodeLong { private final val moreBytesBitMask: Long = Integer.parseInt("10000000", 2) private final val varLength7BitMask: Long = Integer.parseInt("01111111", 2) private final val otherBitsMask = ~varLength7BitMask private final val maxBytesForLongVarEncoding = 10 // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def encodeLong(l: Long): Array[Byte] = { val tempResult = new Array[Byte](maxBytesForLongVarEncoding) var remainder = l var index = 0 while ((remainder & otherBitsMask) != 0) { tempResult(index) = ((remainder & varLength7BitMask) | moreBytesBitMask).toByte remainder >>>= 7 index += 1 } tempResult(index) = remainder.toByte val result = new Array[Byte](index + 1) System.arraycopy(tempResult, 0, result, 0, index + 1) result } // Same encoding as as Base 128 Varints @ https://developers.google.com/protocol-buffers/docs/encoding @inline final def decodeLong(input: Array[Byte]): Long = { assert(input.nonEmpty, "`decodeLong` requires a non-empty array as its input") var index = 0 var currentByte = input(index) var decoded = currentByte & varLength7BitMask var nextLeftShift = 7 while ((currentByte & moreBytesBitMask) != 0) { index += 1 currentByte = input(index) decoded |= (currentByte & varLength7BitMask) << nextLeftShift nextLeftShift += 7 } assert(index == input.length - 1, s"`decodeLong` received an input array ${input.toSeq.toHex} with extra bytes that could not be decoded.") decoded } implicit class ColumnLongOps(val c: Column) extends AnyVal { def encodeLongAsMorpheusId(name: String): Column = encodeLongAsMorpheusId.as(name) def encodeLongAsMorpheusId: Column = new Column(EncodeLong(c.expr)) } }
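A small round-trip sketch for the variable-length encoding above; the sample values are arbitrary.

import org.opencypher.morpheus.impl.expressions.EncodeLong

object EncodeLongRoundTrip {
  def main(args: Array[String]): Unit = {
    val samples = Seq(0L, 1L, 127L, 128L, 300L, 1L << 35, Long.MaxValue)
    samples.foreach { value =>
      val bytes = EncodeLong.encodeLong(value)
      val decoded = EncodeLong.decodeLong(bytes)
      // Each value must survive the encode/decode round trip.
      assert(decoded == value, s"round trip failed for $value")
      println(s"$value encoded in ${bytes.length} byte(s)")
    }
  }
}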
Example 80
Source File: ElementwiseProduct.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
Example 81
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
Example 82
Source File: DCT.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
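For reference, the usual way to apply the transformer above, closely following the standard Spark ML DCT example; assumes a local SparkSession.

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object DCTSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dct-sketch").getOrCreate()

    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0)).map(Tuple1.apply)
    val df = spark.createDataFrame(data).toDF("features")

    // validateInputType above requires a VectorUDT input column.
    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    dct.transform(df).select("featuresDCT").show(truncate = false)
    spark.stop()
  }
}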
Example 83
Source File: NGram.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 84
Source File: MonotonicallyIncreasingID.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, "") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, "") ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
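This expression backs the DataFrame-level monotonically_increasing_id function; a short sketch of that call follows (the column name is illustrative). The generated ids are unique and increasing per partition, but not consecutive.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.monotonically_increasing_id

object RowIdSketch {
  // Adds a unique, monotonically increasing (but not consecutive) id per row.
  def withRowId(df: DataFrame): DataFrame =
    df.withColumn("row_id", monotonically_increasing_id())
}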
Example 85
Source File: ReferenceToExpressions.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.analysis.TypeCheckResult.{TypeCheckFailure, TypeCheckSuccess} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.objects.LambdaVariable import org.apache.spark.sql.types.DataType case class ReferenceToExpressions(result: Expression, children: Seq[Expression]) extends Expression { override def nullable: Boolean = result.nullable override def dataType: DataType = result.dataType override def checkInputDataTypes(): TypeCheckResult = { if (result.references.nonEmpty) { return TypeCheckFailure("The result expression cannot reference to any attributes.") } var maxOrdinal = -1 result foreach { case b: BoundReference if b.ordinal > maxOrdinal => maxOrdinal = b.ordinal case _ => } if (maxOrdinal > children.length) { return TypeCheckFailure(s"The result expression need $maxOrdinal input expressions, but " + s"there are only ${children.length} inputs.") } TypeCheckSuccess } private lazy val projection = UnsafeProjection.create(children) override def eval(input: InternalRow): Any = { result.eval(projection(input)) } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val childrenGen = children.map(_.genCode(ctx)) val (classChildrenVars, initClassChildrenVars) = childrenGen.zip(children).map { case (childGen, child) => // SPARK-18125: The children vars are local variables. If the result expression uses // splitExpression, those variables cannot be accessed so compilation fails. // To fix it, we use class variables to hold those local variables. val classChildVarName = ctx.freshName("classChildVar") val classChildVarIsNull = ctx.freshName("classChildVarIsNull") ctx.addMutableState(ctx.javaType(child.dataType), classChildVarName, "") ctx.addMutableState("boolean", classChildVarIsNull, "") val classChildVar = LambdaVariable(classChildVarName, classChildVarIsNull, child.dataType) val initCode = s"${classChildVar.value} = ${childGen.value};\n" + s"${classChildVar.isNull} = ${childGen.isNull};" (classChildVar, initCode) }.unzip val resultGen = result.transform { case b: BoundReference => classChildrenVars(b.ordinal) }.genCode(ctx) ExprCode(code = childrenGen.map(_.code).mkString("\n") + initClassChildrenVars.mkString("\n") + resultGen.code, isNull = resultGen.isNull, value = resultGen.value) } }
Example 86
Source File: MapDataSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import scala.collection._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.util.ArrayBasedMapData import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} import org.apache.spark.unsafe.types.UTF8String class MapDataSuite extends SparkFunSuite { test("inequality tests") { def u(str: String): UTF8String = UTF8String.fromString(str) // test data val testMap1 = Map(u("key1") -> 1) val testMap2 = Map(u("key1") -> 1, u("key2") -> 2) val testMap3 = Map(u("key1") -> 1) val testMap4 = Map(u("key1") -> 1, u("key2") -> 2) // ArrayBasedMapData val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) assert(testArrayMap1 !== testArrayMap3) assert(testArrayMap2 !== testArrayMap4) // UnsafeMapData val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) val row = new GenericInternalRow(1) def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { row.update(0, map) val unsafeRow = unsafeConverter.apply(row) unsafeRow.getMap(0).copy } assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) } }
Example 87
Source File: ExpressionEvalHelperSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, IntegerType} case class BadCodegenExpression() extends LeafExpression { override def nullable: Boolean = false override def eval(input: InternalRow): Any = 10 override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { ev.copy(code = s""" |int some_variable = 11; |int ${ev.value} = 10; """.stripMargin) } override def dataType: DataType = IntegerType }
Example 88
Source File: MySQLDialect.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } override def isCascadingTruncateTable(): Option[Boolean] = Some(false) }
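The same extension point works for other engines: implement canHandle and getCatalystType, then register the dialect so JDBC reads map engine-specific types to Catalyst DataTypes. A sketch for a hypothetical source; the URL prefix and type name are illustrative.

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType}

case object ExampleDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:exampledb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    // Surface the engine's JSON type as a plain string column.
    if (sqlType == Types.OTHER && typeName.equalsIgnoreCase("JSON")) Some(StringType) else None
  }
}

object ExampleDialectRegistration {
  def register(): Unit = JdbcDialects.registerDialect(ExampleDialect)
}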
Example 89
Source File: Tokenizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getPattern: String = $(pattern) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") override protected def createTransformFunc: String => Seq[String] = { str => val re = $(pattern).r val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) }
Example 90
Source File: nullFunctions.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.trees
import org.apache.spark.sql.catalyst.analysis.UnresolvedException
import org.apache.spark.sql.types.DataType

// Only the signature of Coalesce survives in this excerpt; its body is elided.
case class Coalesce(children: Seq[Expression]) extends Expression {
  type EvaluatedType = Any
}

case class AtLeastNNonNulls(n: Int, children: Seq[Expression]) extends Predicate {
  override def nullable: Boolean = false
  override def foldable: Boolean = false
  override def toString: String = s"AtLeastNNulls(n, ${children.mkString(",")})"

  private[this] val childrenArray = children.toArray

  override def eval(input: Row): Boolean = {
    var numNonNulls = 0
    var i = 0
    while (i < childrenArray.length && numNonNulls < n) {
      if (childrenArray(i).eval(input) != null) {
        numNonNulls += 1
      }
      i += 1
    }
    numNonNulls >= n
  }
}
Example 91
Source File: ExistingRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} private[sql] case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext) extends LogicalPlan with MultiInstanceRelation { override def children: Seq[LogicalPlan] = Nil override def newInstance(): this.type = LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type] override def sameResult(plan: LogicalPlan): Boolean = plan match { case LogicalRDD(_, otherRDD) => rows == rows case _ => false } @transient override lazy val statistics: Statistics = Statistics( // TODO: Improve the statistics estimation. // This is made small enough so it can be broadcasted. sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1 ) }
Example 92
Source File: NullableColumnAccessorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types.DataType class TestNullableColumnAccessor[T <: DataType, JvmType]( buffer: ByteBuffer, columnType: ColumnType[T, JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[T <: DataType, JvmType](buffer: ByteBuffer, columnType: ColumnType[T, JvmType]) : TestNullableColumnAccessor[T, JvmType] = { // Skips the column type ID buffer.getInt() new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import ColumnarTestUtils._ Seq( INT, LONG, SHORT, BOOLEAN, BYTE, STRING, DOUBLE, FLOAT, FIXED_DECIMAL(15, 10), BINARY, GENERIC, DATE, TIMESTAMP ).foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[T <: DataType, JvmType]( columnType: ColumnType[T, JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => builder.appendFrom(randomRow, 0) builder.appendFrom(nullRow, 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericMutableRow(1) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row(0) === randomRow(0)) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 93
Source File: ColumnarTestUtils.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar import java.sql.Timestamp import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types.{UTF8String, DataType, Decimal, AtomicType} object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[T <: DataType, JvmType](columnType: ColumnType[T, JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case STRING => UTF8String(Random.nextString(Random.nextInt(32))) case BOOLEAN => Random.nextBoolean() case BINARY => randomBytes(Random.nextInt(32)) case DATE => Random.nextInt() case TIMESTAMP => val timestamp = new Timestamp(Random.nextLong()) timestamp.setNanos(Random.nextInt(999999999)) timestamp case _ => // Using a random one-element map instead of an arbitrary object Map(Random.nextInt() -> Random.nextString(Random.nextInt(32))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_ <: DataType, _], tail: ColumnType[_ <: DataType, _]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[T <: DataType, JvmType]( columnType: ColumnType[T, JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_ <: DataType, _], tail: ColumnType[_ <: DataType, _]*): Row = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_ <: DataType, _]]): Row = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
Example 94
Source File: RColumnTransformer.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables import java.util.UUID import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.OperationExecutionDispatcher.Result import ai.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param} import org.apache.spark.sql.types.DataType class RColumnTransformer() extends CustomCodeColumnTransformer { override val codeParameter = CodeSnippetParam( name = "column operation code", description = None, language = CodeSnippetLanguage(CodeSnippetLanguage.r) ) setDefault(codeParameter -> """transform.column <- function(column, column.name) { | return(column) |}""".stripMargin ) override def getSpecificParams: Array[Param[_]] = Array(codeParameter, targetType) override def getComposedCode( userCode: String, inputColumn: String, outputColumn: String, targetType: DataType): String = { val newFieldName = UUID.randomUUID().toString.replace("-", "") s""" |$userCode | |transform <- function(dataframe) { | new.column <- cast(transform.column(dataframe$$'$inputColumn', '$inputColumn'), | '${targetType.simpleString}') | return(withColumn(dataframe, '$newFieldName', new.column)) |} """.stripMargin } override def runCode(context: ExecutionContext, code: String): Result = context.customCodeExecutor.runR(code) override def isValid(context: ExecutionContext, code: String): Boolean = context.customCodeExecutor.isRValid(code) }
Example 95
Source File: CustomCodeColumnTransformer.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperations.exceptions.CustomOperationExecutionException import ai.deepsense.deeplang.OperationExecutionDispatcher.Result import ai.deepsense.deeplang.params.{CodeSnippetParam, Param} import ai.deepsense.deeplang.params.choice.ChoiceParam import org.apache.spark.sql.types.{DataType, StructField, StructType} abstract class CustomCodeColumnTransformer() extends MultiColumnTransformer { import CustomCodeColumnTransformer._ val targetType = ChoiceParam[TargetTypeChoice]( name = "target type", description = Some("Target type of the columns.")) def getTargetType: TargetTypeChoice = $(targetType) def setTargetType(value: TargetTypeChoice): this.type = set(targetType, value) val codeParameter: CodeSnippetParam def getCodeParameter: String = $(codeParameter) def setCodeParameter(value: String): this.type = set(codeParameter, value) def runCode(context: ExecutionContext, code: String): Result def isValid(context: ExecutionContext, code: String): Boolean def getComposedCode( userCode: String, inputColumn: String, outputColumn: String, targetType: DataType): String override def getSpecificParams: Array[Param[_]] private def executeCode( code: String, inputColumn: String, outputColumn: String, context: ExecutionContext, dataFrame: DataFrame): DataFrame = { runCode(context, code) match { case Left(error) => throw CustomOperationExecutionException(s"Execution exception:\n\n$error") case Right(_) => val sparkDataFrame = context.dataFrameStorage.getOutputDataFrame(OutputPortNumber).getOrElse { throw CustomOperationExecutionException( "Operation finished successfully, but did not produce a DataFrame.") } val newSparkDataFrame = context.sparkSQLSession.createDataFrame( sparkDataFrame.rdd, transformSingleColumnSchema(inputColumn, outputColumn, dataFrame.schema.get).get) DataFrame.fromSparkDataFrame(newSparkDataFrame) } } override def transformSingleColumn( inputColumn: String, outputColumn: String, context: ExecutionContext, dataFrame: DataFrame): DataFrame = { val code = getComposedCode( $(codeParameter), inputColumn, outputColumn, getTargetType.columnType) logger.debug(s"Code to be validated and executed:\n$code") if (!isValid(context, code)) { throw CustomOperationExecutionException("Code validation failed") } context.dataFrameStorage.withInputDataFrame(InputPortNumber, dataFrame.sparkDataFrame) { executeCode(code, inputColumn, outputColumn, context, dataFrame) } } override def transformSingleColumnSchema( inputColumn: String, outputColumn: String, schema: StructType): Option[StructType] = { MultiColumnTransformer.assertColumnExist(inputColumn, schema) MultiColumnTransformer.assertColumnDoesNotExist(outputColumn, schema) Some(schema.add(StructField(outputColumn, getTargetType.columnType, nullable = true))) } } object CustomCodeColumnTransformer { val InputPortNumber: Int = 0 val OutputPortNumber: Int = 0 }
Example 96
Source File: StringTokenizerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class StringTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[StringTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StringTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new StringTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "this is a test", "this values should be separated", "Bla bla bla!" ) val tokenized = strings.map { _.toLowerCase.split("\\s") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 97
Source File: PolynomialExpanderSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import ai.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class PolynomialExpanderSmokeTest extends AbstractTransformerWrapperSmokeTest[PolynomialExpander] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: PolynomialExpander = { val inPlace = NoInPlaceChoice() .setOutputColumn("polynomial") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new PolynomialExpander() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.degree -> 3 ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(1.0), Vectors.dense(1.0, 2.0) ) val inputAfterDCT = Seq( // x, x^2, x^3 Vectors.dense(1.0, 1.0, 1.0), // x, x^2, x^3, y, x * y, x^2 * y, x * y^2, y^2, y^3 Vectors.dense(1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 4.0, 8.0) ) input.zip(inputAfterDCT) } override def inputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT }
Example 98
Source File: BinarizerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{DataType, DoubleType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class BinarizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Binarizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Binarizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("binarizerOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val binarizer = new Binarizer() binarizer.set( binarizer.singleOrMultiChoiceParam -> single, binarizer.threshold -> 0.5) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.2, 0.5, 1.8) val outputNumbers = Seq(0.0, 0.0, 1.0) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = DoubleType }
Example 99
Source File: DiscreteCosineTransformerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import ai.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class DiscreteCosineTransformerSmokeTest extends AbstractTransformerWrapperSmokeTest[DiscreteCosineTransformer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: DiscreteCosineTransformer = { val inPlace = NoInPlaceChoice() .setOutputColumn("dct") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new DiscreteCosineTransformer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.inverse -> false ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(0.0), Vectors.dense(1.0), Vectors.dense(2.0) ) val inputAfterDCT = Seq( Vectors.dense(0.0), Vectors.dense(1.0), Vectors.dense(2.0) ) input.zip(inputAfterDCT) } override def inputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT }
Example 100
Source File: RegexTokenizerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class RegexTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[RegexTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: RegexTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new RegexTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.gaps -> false, transformer.minTokenLength -> 1, transformer.pattern -> "\\d+" ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "100 200 300", "400 500 600", "700 800 900" ) val tokenized = strings.map { _.toLowerCase.split(" ") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 101
Source File: OneHotEncoderSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import ai.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.{DataType, DoubleType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class OneHotEncoderSmokeTest extends AbstractTransformerWrapperSmokeTest[OneHotEncoder] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: OneHotEncoder = { val inPlace = NoInPlaceChoice() .setOutputColumn("oneHotEncoderOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val oneHotEncoder = new OneHotEncoder() oneHotEncoder.set( oneHotEncoder.singleOrMultiChoiceParam -> single, oneHotEncoder.dropLast -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.0, 1.0) val outputNumbers = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT }
Example 102
Source File: NGramTransformerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class NGramTransformerSmokeTest extends AbstractTransformerWrapperSmokeTest[NGramTransformer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: NGramTransformer = { val inPlace = NoInPlaceChoice() .setOutputColumn("ngrams") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val transformer = new NGramTransformer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.n -> 2 ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( Array("a", "b", "c"), Array("d", "e", "f") ) val ngrams = Seq( Array("a b", "b c"), Array("d e", "e f") ) strings.zip(ngrams) } override def inputType: DataType = new ArrayType(StringType, true) override def outputType: DataType = new ArrayType(StringType, false) }
Example 103
Source File: StopWordsRemoverSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class StopWordsRemoverSmokeTest extends AbstractTransformerWrapperSmokeTest[StopWordsRemover] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StopWordsRemover = { val inPlace = NoInPlaceChoice() .setOutputColumn("stopWordsRemoverOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val stopWordsRemover = new StopWordsRemover() stopWordsRemover.set( stopWordsRemover.singleOrMultiChoiceParam -> single, stopWordsRemover.caseSensitive -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(Array("a", "seahorse", "The", "Horseshoe", "Crab")) val outputNumbers = Seq(Array("seahorse", "Horseshoe", "Crab")) inputNumbers.zip(outputNumbers) } override def inputType: DataType = ArrayType(StringType) override def outputType: DataType = ArrayType(StringType) }
Example 104
Source File: NormalizerSmokeTest.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.spark.wrappers.transformers import ai.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import ai.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import ai.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection class NormalizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Normalizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Normalizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("normalize") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new Normalizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.p -> 1.0 ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(0.0, 100.0, 100.0), Vectors.dense(1.0, 1.0, 0.0), Vectors.dense(-3.0, 3.0, 0.0) ) val inputAfterNormalize = Seq( Vectors.dense(0.0, 0.5, 0.5), Vectors.dense(0.5, 0.5, 0.0), Vectors.dense(-0.5, 0.5, 0.0) ) input.zip(inputAfterNormalize) } override def inputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new ai.deepsense.sparkutils.Linalg.VectorUDT }
Example 105
Source File: StructFieldJsonProtocol.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.reportlib.model import org.apache.spark.sql.types.{DataType, StructField} import spray.json._ import ai.deepsense.commons.json.EnumerationSerializer import ai.deepsense.commons.types.{ColumnType, SparkConversions} trait StructFieldJsonProtocol extends DefaultJsonProtocol with MetadataJsonProtocol with DataTypeJsonProtocol { implicit val failureCodeFormat = EnumerationSerializer.jsonEnumFormat(ColumnType) // StructField format without metadata, with deeplangType appended implicit val structFieldFormat = new RootJsonFormat[StructField] { val c = (s: String, d: DataType, b: Boolean) => StructField(s, d, b) implicit val rawFormat = jsonFormat(c, "name", "dataType", "nullable") override def write(obj: StructField): JsValue = { val jsObject = obj.toJson(rawFormat).asJsObject val deeplangType = SparkConversions.sparkColumnTypeToColumnType(obj.dataType) JsObject(jsObject.fields + ("deeplangType" -> deeplangType.toJson)) } override def read(json: JsValue): StructField = { json.convertTo(rawFormat) } } }
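The protocol above builds on Spark's own JSON form for schemas; a quick sketch of that underlying round trip between StructType.json and DataType.fromJson.

import org.apache.spark.sql.types.{DataType, DecimalType, StringType, StructField, StructType}

object SchemaJsonRoundTrip {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("price", DecimalType(10, 2), nullable = false),
      StructField("label", StringType, nullable = true)))

    // StructType.json produces the JSON that DataType.fromJson parses back.
    val json = schema.json
    val parsed = DataType.fromJson(json).asInstanceOf[StructType]
    assert(parsed == schema)
    println(json)
  }
}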
Example 106
package frameless.functions import frameless.TypedEncoder import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.expressions.{Expression, Literal, NonSQLExpression} import org.apache.spark.sql.types.DataType case class FramelessLit[A](obj: A, encoder: TypedEncoder[A]) extends Expression with NonSQLExpression { override def nullable: Boolean = encoder.nullable override def toString: String = s"FramelessLit($obj)" def eval(input: InternalRow): Any = { val ctx = new CodegenContext() val eval = genCode(ctx) val codeBody = s""" public scala.Function1<InternalRow, Object> generate(Object[] references) { return new FramelessLitEvalImpl(references); } class FramelessLitEvalImpl extends scala.runtime.AbstractFunction1<InternalRow, Object> { private final Object[] references; ${ctx.declareMutableStates()} ${ctx.declareAddedFunctions()} public FramelessLitEvalImpl(Object[] references) { this.references = references; ${ctx.initMutableStates()} } public java.lang.Object apply(java.lang.Object z) { InternalRow ${ctx.INPUT_ROW} = (InternalRow) z; ${eval.code} return ${eval.isNull} ? ((Object)null) : ((Object)${eval.value}); } } """ val code = CodeFormatter.stripOverlappingComments( new CodeAndComment(codeBody, ctx.getPlaceHolderToComments())) val (clazz, _) = CodeGenerator.compile(code) val codegen = clazz.generate(ctx.references.toArray).asInstanceOf[InternalRow => AnyRef] codegen(input) } def dataType: DataType = encoder.catalystRepr def children: Seq[Expression] = Nil override def genCode(ctx: CodegenContext): ExprCode = { encoder.toCatalyst(new Literal(obj, encoder.jvmRepr)).genCode(ctx) } protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = ??? }
Example 107
Source File: OpPipelineStageReaderWriterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages import com.salesforce.op.features._ import com.salesforce.op.features.types._ import com.salesforce.op.stages.OpPipelineStageReaderWriter._ import com.salesforce.op.test.PassengerSparkFixtureTest import com.salesforce.op.utils.reflection.ReflectionUtils import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.{Model, Transformer} import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder} import org.json4s.JsonAST.JValue import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render} import org.json4s.{JArray, JObject} import org.scalatest.FlatSpec import org.slf4j.LoggerFactory // TODO: consider adding a read/write test for a spark wrapped stage as well private[stages] abstract class OpPipelineStageReaderWriterTest extends FlatSpec with PassengerSparkFixtureTest { val meta = new MetadataBuilder().putString("foo", "bar").build() val expectedFeaturesLength = 1 def stage: OpPipelineStageBase with Transformer val expected: Array[Real] val hasOutputName = true private val log = LoggerFactory.getLogger(this.getClass) private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis() private lazy val writer = new OpPipelineStageWriter(stage) private lazy val stageJsonString: String = writer.writeToJsonString(savePath) private lazy val stageJson: JValue = parse(stageJsonString) private lazy val isModel = stage.isInstanceOf[Model[_]] private val FN = FieldNames Spec(this.getClass) should "write stage uid" in { log.info(pretty(stageJson)) (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid } it should "write class name" in { (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName } it should "write params map" in { val params = extractParams(stageJson).extract[Map[String, Any]] if (hasOutputName) { params should have size 4 params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName") } else { params should have size 3 params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema") } } it should "write outputMetadata" in { val params = extractParams(stageJson) val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata")) val metadata = Metadata.fromJson(metadataStr) metadata shouldBe stage.getMetadata() } it should "write inputSchema" in { val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema")) val schema = DataType.fromJson(schemaStr) schema shouldBe stage.getInputSchema() } it should "write input features" in { val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray] jArray.values should have length expectedFeaturesLength val obj = jArray(0).extract[JObject] obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures") } it should "write model ctor args" in { if (stage.isInstanceOf[Model[_]]) { val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject] val (_, args) = ReflectionUtils.bestCtorWithArgs(stage) ctorArgs.values.keys shouldBe args.map(_._1).toSet } } it should "load stage correctly" in { val reader = new OpPipelineStageReader(stage) val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath) stageLoaded shouldBe a[OpPipelineStageBase] stageLoaded shouldBe a[Transformer] stageLoaded.getOutput() shouldBe a[FeatureLike[_]] val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet) val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet) 
transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected stageLoaded.uid shouldBe stage.uid stageLoaded.operationName shouldBe stage.operationName stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures() stageLoaded.getInputSchema() shouldBe stage.getInputSchema() } private def extractParams(stageJson: JValue): JValue = { val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName val paramsMap = stageJson \ FN.ParamMap.entryName defaultParamsMap.merge(paramsMap) } }
Example 108
Source File: TypeQualifiers.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.schema import scala.collection.JavaConverters._ import org.apache.hive.service.cli.thrift.{TCLIServiceConstants, TTypeQualifiers, TTypeQualifierValue} import org.apache.spark.sql.types.{DataType, DecimalType} class TypeQualifiers private() { private var precision: Option[Int] = None private var scale: Option[Int] = None private def setPrecision(precision: Int): Unit = { this.precision = Some(precision) } private def setScale(scale: Int): Unit = { this.scale = Some(scale) } def toTTypeQualifiers: TTypeQualifiers = new TTypeQualifiers( (precision.map(TTypeQualifierValue.i32Value).map(TCLIServiceConstants.PRECISION -> _) ++ scale.map(TTypeQualifierValue.i32Value).map(TCLIServiceConstants.SCALE -> _)).toMap.asJava) } object TypeQualifiers { def fromTypeInfo(typ: DataType): TypeQualifiers = { val result = new TypeQualifiers typ match { case decimalType: DecimalType => result.setScale(decimalType.scale) result.setPrecision(decimalType.precision) case _ => } result } }
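As a hedged usage sketch relying only on the factory shown above: DecimalType columns carry precision and scale into the Thrift qualifiers, while other types yield an empty qualifier map.

import org.apache.spark.sql.types.{DecimalType, IntegerType}
import yaooqinn.kyuubi.schema.TypeQualifiers

// Hypothetical usage: decimal(10, 2) produces PRECISION and SCALE entries.
val decimalQualifiers = TypeQualifiers.fromTypeInfo(DecimalType(10, 2)).toTTypeQualifiers
// Non-decimal types produce no qualifier entries.
val emptyQualifiers = TypeQualifiers.fromTypeInfo(IntegerType).toTTypeQualifiers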
Example 109
Source File: TypeDescriptor.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi.schema import org.apache.hive.service.cli.thrift.{TPrimitiveTypeEntry, TTypeDesc, TTypeEntry} import org.apache.spark.sql.types.{DataType, DecimalType} case class TypeDescriptor(typ: DataType) { private val typeQualifiers: Option[TypeQualifiers] = typ match { case d: DecimalType => Some(TypeQualifiers.fromTypeInfo(d)) case _ => None } def toTTypeDesc: TTypeDesc = { val primitiveEntry = new TPrimitiveTypeEntry(SchemaMapper.toTTypeId(typ)) typeQualifiers.map(_.toTTypeQualifiers).foreach(primitiveEntry.setTypeQualifiers) val entry = TTypeEntry.primitiveEntry(primitiveEntry) val desc = new TTypeDesc desc.addToTypes(entry) desc } }
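A short sketch of wrapping a Spark DataType in this descriptor, assuming the project's SchemaMapper (not shown in this snippet) can map the type to a Thrift type id:

import org.apache.spark.sql.types.DecimalType
import yaooqinn.kyuubi.schema.TypeDescriptor

// Hypothetical: the resulting TTypeDesc carries a primitive entry plus precision/scale qualifiers.
val desc = TypeDescriptor(DecimalType(38, 18)).toTTypeDesc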
Example 110
Source File: SparkSQLUtils.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.hive.HiveUtils import org.apache.spark.sql.types.DataType object SparkSQLUtils { def toHiveString(a: (Any, DataType)): String = { HiveUtils.toHiveString(a) } def getUserJarClassLoader(sparkSession: SparkSession): ClassLoader = { sparkSession.sharedState.jarClassLoader } def parsePlan(sparkSession: SparkSession, statement: String): LogicalPlan = { sparkSession.sessionState.sqlParser.parsePlan(statement) } def toDataFrame(sparkSession: SparkSession, plan: LogicalPlan): DataFrame = { Dataset.ofRows(sparkSession, plan) } def initializeMetaStoreClient(sparkSession: SparkSession): Seq[String] = { sparkSession.sessionState.catalog.listDatabases("default") } }
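A usage sketch of the helpers above, assuming a local SparkSession; it parses a statement into a logical plan and turns the plan back into a DataFrame:

import org.apache.spark.sql.{SparkSession, SparkSQLUtils}

val spark = SparkSession.builder().appName("sketch").master("local[*]").getOrCreate()
val plan = SparkSQLUtils.parsePlan(spark, "SELECT 1 AS one")
val df = SparkSQLUtils.toDataFrame(spark, plan)
df.show()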
Example 111
Source File: Tokenizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getPattern: String = $(pattern) setDefault(minTokenLength -> 1, gaps -> true, pattern -> "\\s+") override protected def createTransformFunc: String => Seq[String] = { str => val re = $(pattern).r val tokens = if ($(gaps)) re.split(str).toSeq else re.findAllIn(str).toSeq val minLength = $(minTokenLength) tokens.filter(_.length >= minLength) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, true) override def copy(extra: ParamMap): RegexTokenizer = defaultCopy(extra) }
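The snippet above is the core of Spark ML's RegexTokenizer. A minimal usage sketch with the current DataFrame API (the sentence and column names are illustrative):

import org.apache.spark.ml.feature.RegexTokenizer
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.createDataFrame(Seq((0, "Hi I heard about Spark"))).toDF("id", "sentence")

val tokenizer = new RegexTokenizer()
  .setInputCol("sentence")
  .setOutputCol("words")
  .setPattern("\\s+")   // with gaps = true (the default), the pattern is used to split
tokenizer.transform(df).select("words").show(false)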
Example 112
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Experimental import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util.Identifiable import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.types.DataType def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT }
Example 113
Source File: MonotonicallyIncreasingID.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.primitive} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
Example 114
Source File: randomExpressions.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian(); """ } }
Example 115
Source File: ShuffledRowRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.DataType private class ShuffledRowRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = idx } class ShuffledRowRDD( @transient var prev: RDD[Product2[Int, InternalRow]], serializer: Serializer, numPartitions: Int) extends RDD[InternalRow](prev.context, Nil) { private val part: Partitioner = new PartitionIdPassthrough(numPartitions) override def getDependencies: Seq[Dependency[_]] = { List(new ShuffleDependency[Int, InternalRow, InternalRow](prev, part, Some(serializer))) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRowRDDPartition(i)) } override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[Int, InternalRow, InternalRow]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[Product2[Int, InternalRow]]] .map(_._2) } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 116
Source File: NullableColumnAccessorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types.{StringType, ArrayType, DataType} // Test accessor for nullable columns class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor // Test accessor for nullable columns object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { // Skips the column type ID buffer.getInt() new TestNullableColumnAccessor(buffer, columnType) } } // Nullable column accessor test suite class NullableColumnAccessorSuite extends SparkFunSuite { import ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, DATE, LONG, TIMESTAMP, FLOAT, DOUBLE, STRING, BINARY, FIXED_DECIMAL(15, 10), GENERIC(ArrayType(StringType))) .foreach { testNullableColumnAccessor(_) } // Test accessor for nullable columns def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { // stripSuffix removes the given suffix from the end of the string val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) // null row test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } // access null values test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => builder.appendFrom(randomRow, 0) builder.appendFrom(nullRow, 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericMutableRow(1) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.get(0, columnType.dataType) === randomRow.get(0, columnType.dataType)) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 117
Source File: ColumnarTestUtils.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.types.{DataType, Decimal, AtomicType} import org.apache.spark.unsafe.types.UTF8String // Columnar test utilities object ColumnarTestUtils { def makeNullRow(length: Int): GenericMutableRow = { val row = new GenericMutableRow(length) (0 until length).foreach(row.setNullAt) row } // Generate a random value def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case DATE => Random.nextInt() case LONG => Random.nextLong() case TIMESTAMP => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case FIXED_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case _ => // Using a random one-element map instead of an arbitrary object Map(Random.nextInt() -> Random.nextString(Random.nextInt(32))) }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } // Make unique random values def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericMutableRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } // Make unique values and single-value rows def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericMutableRow(1) row(0) = value row } (values, rows) } }
Example 118
Source File: UnaryTransformerExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.DoubleParam import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.{DataType, DataTypes} import org.apache.spark.util.Utils // $example off$ object MyTransformer extends DefaultParamsReadable[MyTransformer] // $example off$ def main(args: Array[String]) { val spark = SparkSession .builder() .appName("UnaryTransformerExample") .getOrCreate() // $example on$ val myTransformer = new MyTransformer() .setShift(0.5) .setInputCol("input") .setOutputCol("output") // Create data, transform, and display it. val data = spark.range(0, 5).toDF("input") .select(col("input").cast("double").as("input")) val result = myTransformer.transform(data) println("Transformed by adding constant value") result.show() // Save and load the Transformer. val tmpDir = Utils.createTempDir() val dirName = tmpDir.getCanonicalPath myTransformer.write.overwrite().save(dirName) val sameTransformer = MyTransformer.load(dirName) // Transform the data to show the results are identical. println("Same transform applied from loaded model") val sameResult = sameTransformer.transform(data) sameResult.show() Utils.deleteRecursively(tmpDir) // $example off$ spark.stop() } } // scalastyle:on println
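The scraped snippet above omits the MyTransformer class that main() instantiates. A minimal sketch of such a UnaryTransformer, assuming a Double "shift" parameter added to each input value (names and details are illustrative, not necessarily the original's):

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.DoubleParam
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{DataType, DataTypes}

// Hypothetical reconstruction: adds a constant shift to a Double input column.
class MyTransformer(override val uid: String)
  extends UnaryTransformer[Double, Double, MyTransformer] with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("myT"))

  val shift: DoubleParam = new DoubleParam(this, "shift", "value added to the input")
  def getShift: Double = $(shift)
  def setShift(value: Double): this.type = set(shift, value)

  override protected def createTransformFunc: Double => Double = (input: Double) => input + $(shift)

  override protected def validateInputType(inputType: DataType): Unit =
    require(inputType == DataTypes.DoubleType, s"Bad input type: $inputType. Requires Double.")

  override protected def outputDataType: DataType = DataTypes.DoubleType
}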
Example 119
Source File: ElementwiseProduct.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
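A usage sketch of ElementwiseProduct; the scaling vector, data, and column names are illustrative:

import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.createDataFrame(Seq(
  ("a", Vectors.dense(1.0, 2.0, 3.0)),
  ("b", Vectors.dense(4.0, 5.0, 6.0))
)).toDF("id", "vector")

// Each column of the input vector is multiplied by the corresponding weight.
val transformer = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))
  .setInputCol("vector")
  .setOutputCol("transformedVector")
transformer.transform(df).show(false)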
Example 120
Source File: Normalizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
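A usage sketch of Normalizer with the L^1 norm, matching the p value exercised by the smoke test earlier on this page (data values are illustrative):

import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.createDataFrame(Seq(
  (0, Vectors.dense(1.0, 0.5, -1.0)),
  (1, Vectors.dense(2.0, 1.0, 1.0))
)).toDF("id", "features")

// p = 1.0: each vector is divided by the sum of its absolute values.
val normalizer = new Normalizer().setInputCol("features").setOutputCol("normFeatures").setP(1.0)
normalizer.transform(df).show(false)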
Example 121
Source File: DCT.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
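A usage sketch of the DCT transformer (input vectors are illustrative):

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.createDataFrame(Seq(
  Vectors.dense(0.0, 1.0, -2.0, 3.0),
  Vectors.dense(-1.0, 2.0, 4.0, -7.0)
).map(Tuple1.apply)).toDF("features")

val dct = new DCT().setInputCol("features").setOutputCol("featuresDCT").setInverse(false)
dct.transform(df).select("featuresDCT").show(false)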
Example 122
Source File: NGram.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} @Since("1.5.0") def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
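A usage sketch of NGram producing bigrams from an already tokenized column (input rows are illustrative):

import org.apache.spark.ml.feature.NGram
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.createDataFrame(Seq(
  (0, Seq("Hi", "I", "heard", "about", "Spark")),
  (1, Seq("Logistic", "regression", "models", "are", "neat"))
)).toDF("id", "words")

val ngram = new NGram().setN(2).setInputCol("words").setOutputCol("ngrams")
ngram.transform(df).select("ngrams").show(false)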
Example 123
Source File: MonotonicallyIncreasingID.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, LongType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initializeInternal(partitionIndex: Int): Unit = { count = 0L partitionMask = partitionIndex.toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val countTerm = ctx.addMutableState(ctx.JAVA_LONG, "count") val partitionMaskTerm = "partitionMask" ctx.addImmutableStateIfNotExists(ctx.JAVA_LONG, partitionMaskTerm) ctx.addPartitionInitializationStatement(s"$countTerm = 0L;") ctx.addPartitionInitializationStatement(s"$partitionMaskTerm = ((long) partitionIndex) << 33;") ev.copy(code = s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++;""", isNull = "false") } override def prettyName: String = "monotonically_increasing_id" override def sql: String = s"$prettyName()" }
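The user-facing entry point for this expression is org.apache.spark.sql.functions.monotonically_increasing_id; a short DataFrame sketch, assuming a local SparkSession:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.monotonically_increasing_id

val spark = SparkSession.builder().master("local[*]").getOrCreate()
// Ids are monotonically increasing within each partition but not consecutive across partitions.
spark.range(0, 5).withColumn("rowId", monotonically_increasing_id()).show()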
Example 124
Source File: ExpressionEvalHelperSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{DataType, IntegerType} case class BadCodegenExpression() extends LeafExpression { override def nullable: Boolean = false override def eval(input: InternalRow): Any = 10 override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { ev.copy(code = s""" |int some_variable = 11; |int ${ev.value} = 10; """.stripMargin) } override def dataType: DataType = IntegerType }
Example 125
Source File: GenerateUnsafeProjectionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.BoundReference import org.apache.spark.sql.catalyst.util.{ArrayData, MapData} import org.apache.spark.sql.types.{DataType, Decimal, StringType, StructType} import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} class GenerateUnsafeProjectionSuite extends SparkFunSuite { test("Test unsafe projection string access pattern") { val dataType = (new StructType).add("a", StringType) val exprs = BoundReference(0, dataType, nullable = true) :: Nil val projection = GenerateUnsafeProjection.generate(exprs) val result = projection.apply(InternalRow(AlwaysNull)) assert(!result.isNullAt(0)) assert(result.getStruct(0, 1).isNullAt(0)) } } object AlwaysNull extends InternalRow { override def numFields: Int = 1 override def setNullAt(i: Int): Unit = {} override def copy(): InternalRow = this override def anyNull: Boolean = true override def isNullAt(ordinal: Int): Boolean = true override def update(i: Int, value: Any): Unit = notSupported override def getBoolean(ordinal: Int): Boolean = notSupported override def getByte(ordinal: Int): Byte = notSupported override def getShort(ordinal: Int): Short = notSupported override def getInt(ordinal: Int): Int = notSupported override def getLong(ordinal: Int): Long = notSupported override def getFloat(ordinal: Int): Float = notSupported override def getDouble(ordinal: Int): Double = notSupported override def getDecimal(ordinal: Int, precision: Int, scale: Int): Decimal = notSupported override def getUTF8String(ordinal: Int): UTF8String = notSupported override def getBinary(ordinal: Int): Array[Byte] = notSupported override def getInterval(ordinal: Int): CalendarInterval = notSupported override def getStruct(ordinal: Int, numFields: Int): InternalRow = notSupported override def getArray(ordinal: Int): ArrayData = notSupported override def getMap(ordinal: Int): MapData = notSupported override def get(ordinal: Int, dataType: DataType): AnyRef = notSupported private def notSupported: Nothing = throw new UnsupportedOperationException }
Example 126
Source File: ComplexDataSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import scala.collection._ import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow, SpecificInternalRow, UnsafeMapData, UnsafeProjection} import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} import org.apache.spark.unsafe.types.UTF8String class ComplexDataSuite extends SparkFunSuite { def utf8(str: String): UTF8String = UTF8String.fromString(str) test("inequality tests for MapData") { // test data val testMap1 = Map(utf8("key1") -> 1) val testMap2 = Map(utf8("key1") -> 1, utf8("key2") -> 2) val testMap3 = Map(utf8("key1") -> 1) val testMap4 = Map(utf8("key1") -> 1, utf8("key2") -> 2) // ArrayBasedMapData val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) assert(testArrayMap1 !== testArrayMap3) assert(testArrayMap2 !== testArrayMap4) // UnsafeMapData val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) val row = new GenericInternalRow(1) def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { row.update(0, map) val unsafeRow = unsafeConverter.apply(row) unsafeRow.getMap(0).copy } assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) } test("GenericInternalRow.copy return a new instance that is independent from the old one") { val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) val unsafeRow = project.apply(InternalRow(utf8("a"))) val genericRow = new GenericInternalRow(Array[Any](unsafeRow.getUTF8String(0))) val copiedGenericRow = genericRow.copy() assert(copiedGenericRow.getString(0) == "a") project.apply(InternalRow(UTF8String.fromString("b"))) // The copied internal row should not be changed externally. assert(copiedGenericRow.getString(0) == "a") } test("SpecificMutableRow.copy return a new instance that is independent from the old one") { val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) val unsafeRow = project.apply(InternalRow(utf8("a"))) val mutableRow = new SpecificInternalRow(Seq(StringType)) mutableRow(0) = unsafeRow.getUTF8String(0) val copiedMutableRow = mutableRow.copy() assert(copiedMutableRow.getString(0) == "a") project.apply(InternalRow(UTF8String.fromString("b"))) // The copied internal row should not be changed externally. assert(copiedMutableRow.getString(0) == "a") } test("GenericArrayData.copy return a new instance that is independent from the old one") { val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) val unsafeRow = project.apply(InternalRow(utf8("a"))) val genericArray = new GenericArrayData(Array[Any](unsafeRow.getUTF8String(0))) val copiedGenericArray = genericArray.copy() assert(copiedGenericArray.getUTF8String(0).toString == "a") project.apply(InternalRow(UTF8String.fromString("b"))) // The copied array data should not be changed externally. 
assert(copiedGenericArray.getUTF8String(0).toString == "a") } test("copy on nested complex type") { val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true))) val unsafeRow = project.apply(InternalRow(utf8("a"))) val arrayOfRow = new GenericArrayData(Array[Any](InternalRow(unsafeRow.getUTF8String(0)))) val copied = arrayOfRow.copy() assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a") project.apply(InternalRow(UTF8String.fromString("b"))) // The copied data should not be changed externally. assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a") } }
Example 127
Source File: PythonSQLUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.api.python import org.apache.spark.api.java.JavaRDD import org.apache.spark.sql.{DataFrame, SQLContext} import org.apache.spark.sql.catalyst.analysis.FunctionRegistry import org.apache.spark.sql.catalyst.expressions.ExpressionInfo import org.apache.spark.sql.catalyst.parser.CatalystSqlParser import org.apache.spark.sql.execution.arrow.ArrowConverters import org.apache.spark.sql.types.DataType private[sql] object PythonSQLUtils { def parseDataType(typeText: String): DataType = CatalystSqlParser.parseDataType(typeText) // This is needed when generating SQL documentation for built-in functions. def listBuiltinFunctionInfos(): Array[ExpressionInfo] = { FunctionRegistry.functionSet.flatMap(f => FunctionRegistry.builtin.lookupFunction(f)).toArray } def arrowPayloadToDataFrame( payloadRDD: JavaRDD[Array[Byte]], schemaString: String, sqlContext: SQLContext): DataFrame = { ArrowConverters.toDataFrame(payloadRDD, schemaString, sqlContext) } }
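PythonSQLUtils itself is private[sql]; the Catalyst parser it delegates to can be exercised directly. A small sketch of parsing a DDL-style type string into a DataType:

import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
import org.apache.spark.sql.types.DataType

// Same behaviour as PythonSQLUtils.parseDataType for this input.
val dt: DataType = CatalystSqlParser.parseDataType("array<struct<a:int,b:string>>")
println(dt.catalogString)   // array<struct<a:int,b:string>>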
Example 128
Source File: AggregatedDialect.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import org.apache.spark.sql.types.{DataType, MetadataBuilder} private class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect { require(dialects.nonEmpty) override def canHandle(url : String): Boolean = dialects.map(_.canHandle(url)).reduce(_ && _) override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { dialects.flatMap(_.getCatalystType(sqlType, typeName, size, md)).headOption } override def getJDBCType(dt: DataType): Option[JdbcType] = { dialects.flatMap(_.getJDBCType(dt)).headOption } override def quoteIdentifier(colName: String): String = { dialects.head.quoteIdentifier(colName) } override def getTableExistsQuery(table: String): String = { dialects.head.getTableExistsQuery(table) } override def getSchemaQuery(table: String): String = { dialects.head.getSchemaQuery(table) } override def isCascadingTruncateTable(): Option[Boolean] = { // If any dialect claims cascading truncate, this dialect is also cascading truncate. // Otherwise, if any dialect has unknown cascading truncate, this dialect is also unknown. dialects.flatMap(_.isCascadingTruncateTable()).reduceOption(_ || _) match { case Some(true) => Some(true) case _ if dialects.exists(_.isCascadingTruncateTable().isEmpty) => None case _ => Some(false) } } override def getTruncateQuery(table: String): String = { dialects.head.getTruncateQuery(table) } }
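AggregatedDialect is assembled internally when more than one registered dialect claims the same JDBC URL. A hedged sketch of registering a custom dialect through the public JdbcDialects API (the URL prefix and type mappings are illustrative):

import java.sql.Types
import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects, JdbcType}
import org.apache.spark.sql.types.{BooleanType, DataType, MetadataBuilder}

object MyDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] =
    if (sqlType == Types.BIT) Some(BooleanType) else None

  override def getJDBCType(dt: DataType): Option[JdbcType] = dt match {
    case BooleanType => Some(JdbcType("BIT(1)", Types.BIT))
    case _ => None
  }
}

// Registered dialects are consulted for every JDBC read/write whose URL they can handle.
JdbcDialects.registerDialect(MyDialect)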
Example 129
Source File: MySQLDialect.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } override def isCascadingTruncateTable(): Option[Boolean] = Some(false) }
Example 130
Source File: UserDefinedFunction.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.annotation.InterfaceStability import org.apache.spark.sql.Column import org.apache.spark.sql.catalyst.expressions.ScalaUDF import org.apache.spark.sql.types.DataType def asNondeterministic(): UserDefinedFunction = { if (!_deterministic) { this } else { val udf = copyAll() udf._deterministic = false udf } } }
Example 131
Source File: PythonUDF.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.python import org.apache.spark.api.python.PythonFunction import org.apache.spark.sql.catalyst.expressions.{Expression, NonSQLExpression, Unevaluable, UserDefinedExpression} import org.apache.spark.sql.types.DataType case class PythonUDF( name: String, func: PythonFunction, dataType: DataType, children: Seq[Expression], evalType: Int, udfDeterministic: Boolean) extends Expression with Unevaluable with NonSQLExpression with UserDefinedExpression { override lazy val deterministic: Boolean = udfDeterministic && children.forall(_.deterministic) override def toString: String = s"$name(${children.mkString(", ")})" override def nullable: Boolean = true }
Example 132
Source File: TestCompressibleColumnBuilder.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar.compression import org.apache.spark.sql.execution.columnar._ import org.apache.spark.sql.types.{AtomicType, DataType} class TestCompressibleColumnBuilder[T <: AtomicType]( override val columnStats: ColumnStats, override val columnType: NativeColumnType[T], override val schemes: Seq[CompressionScheme]) extends NativeColumnBuilder(columnStats, columnType) with NullableColumnBuilder with CompressibleColumnBuilder[T] { override protected def isWorthCompressing(encoder: Encoder[T]) = true } object TestCompressibleColumnBuilder { def apply[T <: AtomicType]( columnStats: ColumnStats, columnType: NativeColumnType[T], scheme: CompressionScheme): TestCompressibleColumnBuilder[T] = { val builder = new TestCompressibleColumnBuilder(columnStats, columnType, Seq(scheme)) builder.initialize(0, "", useCompression = true) builder } } object ColumnBuilderHelper { def apply( dataType: DataType, batchSize: Int, name: String, useCompression: Boolean): ColumnBuilder = { ColumnBuilder(dataType, batchSize, name, useCompression) } }
Example 133
Source File: MimirSparkRuntimeUtils.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.expressions.Window import org.apache.spark.sql.functions.{ spark_partition_id, monotonically_increasing_id, count, sum, first, lit, col } object MimirSparkRuntimeUtils { def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType:DataType = LongType): DataFrame = { val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id()) val partitionOffsets = dfWithPartitionId .groupBy("partition_id") .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id") .orderBy("partition_id") .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt" ) .collect() .map(row => (row.getInt(0), row.getLong(1))) .toMap val theUdf = org.apache.spark.sql.functions.udf( (partitionId: Int) => partitionOffsets(partitionId), LongType ) dfWithPartitionId .withColumn("partition_offset", theUdf(col("partition_id"))) .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType)) .drop("partition_id", "partition_offset", "inc_id") } def writeDataSink(dataframe:DataFrame, format:String, options:Map[String, String], save:Option[String]) = { val dsFormat = dataframe.write.format(format) val dsOptions = options.toSeq.foldLeft(dsFormat)( (ds, opt) => opt._1 match { case "mode" => ds.mode(opt._2) case _ => ds.option(opt._1, opt._2) }) save match { case None => dsOptions.save case Some(outputFile) => { if(format.equals("com.github.potix2.spark.google.spreadsheets")){ val gsldfparts = outputFile.split("\\/") val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}" dsOptions.save(gsldf) } else{ dsOptions.save(outputFile) } } } } }
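A usage sketch for zipWithIndex, assuming the mimir package is on the classpath and a local SparkSession; it appends a stable 64-bit row index without sorting the whole dataset:

import org.apache.spark.sql.SparkSession
import mimir.exec.spark.MimirSparkRuntimeUtils

val spark = SparkSession.builder().master("local[*]").getOrCreate()
val df = spark.range(0, 10).toDF("value")

// Adds a ROWIDX column starting at 1, computed from per-partition counts and offsets.
val indexed = MimirSparkRuntimeUtils.zipWithIndex(df)
indexed.show()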
Example 134
Source File: GroupOr.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, BooleanType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, Or } case class GroupOr(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = BooleanType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_or") private lazy val group_or = AttributeReference("group_or", BooleanType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_or :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(false, BooleanType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( Or(group_or, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( Or(group_or.left, group_or.right) ) } override lazy val evaluateExpression: AttributeReference = group_or }
Example 135
Source File: MimirUDF.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import java.sql.{ Timestamp, Date } import org.apache.spark.sql.types.{ DataType, StructType, StructField } import mimir.algebra._ import mimir.exec.spark._ import mimir.util.SparkUtils class MimirUDF { def getPrimitive(t:Type, value:Any) = value match { case null => NullPrimitive() case _ => t match { //case TInt() => IntPrimitive(value.asInstanceOf[Long]) case TInt() => IntPrimitive(value.asInstanceOf[Long]) case TFloat() => FloatPrimitive(value.asInstanceOf[Double]) case TDate() => SparkUtils.convertDate(value.asInstanceOf[Date]) case TTimestamp() => SparkUtils.convertTimestamp(value.asInstanceOf[Timestamp]) case TString() => StringPrimitive(value.asInstanceOf[String]) case TBool() => BoolPrimitive(value.asInstanceOf[Boolean]) case TRowId() => RowIdPrimitive(value.asInstanceOf[String]) case TType() => TypePrimitive(Type.fromString(value.asInstanceOf[String])) //case TAny() => NullPrimitive() //case TUser(name) => name.toLowerCase //case TInterval() => Primitive(value.asInstanceOf[Long]) case _ => StringPrimitive(value.asInstanceOf[String]) } } def getNative(primitive : PrimitiveValue) : AnyRef = primitive match { case NullPrimitive() => null case RowIdPrimitive(s) => s case StringPrimitive(s) => s case IntPrimitive(i) => new java.lang.Long(i) case FloatPrimitive(f) => new java.lang.Double(f) case BoolPrimitive(b) => new java.lang.Boolean(b) case ts@TimestampPrimitive(y,m,d,h,mm,s,ms) => SparkUtils.convertTimestamp(ts) case dt@DatePrimitive(y,m,d) => SparkUtils.convertDate(dt) case x => x.asString } def getStructType(datatypes:Seq[DataType]): StructType = { StructType(datatypes.map(dti => StructField("", RAToSpark.getInternalSparkType(dti), true))) } }
Example 136
Source File: GroupAnd.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, BooleanType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, And } case class GroupAnd(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = BooleanType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_and") private lazy val group_and = AttributeReference("group_and", BooleanType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_and :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(true, BooleanType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( And(group_and, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( And(group_and.left, group_and.right) ) } override lazy val evaluateExpression: AttributeReference = group_and }
Example 137
Source File: GroupBitwiseAnd.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseAnd } case class GroupBitwiseAnd(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = LongType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_and") private lazy val group_bitwise_and = AttributeReference("group_bitwise_and", LongType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_and :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(0xffffffffffffffffl, LongType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( BitwiseAnd(group_bitwise_and, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( BitwiseAnd(group_bitwise_and.left, group_bitwise_and.right) ) } override lazy val evaluateExpression: AttributeReference = group_bitwise_and }
Example 138
Source File: GroupBitwiseOr.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, LongType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, Literal, BitwiseOr } case class GroupBitwiseOr(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = LongType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function group_bitwise_or") private lazy val group_bitwise_or = AttributeReference("group_bitwise_or", LongType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = group_bitwise_or :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create(0, LongType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( BitwiseOr(group_bitwise_or, child) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( BitwiseOr(group_bitwise_or.left, group_bitwise_or.right) ) } override lazy val evaluateExpression: AttributeReference = group_bitwise_or }
Example 139
Source File: JsonGroupArray.scala From mimir with Apache License 2.0 | 5 votes |
package mimir.exec.spark.udf import org.apache.spark.sql.catalyst.expressions.aggregate.DeclarativeAggregate import org.apache.spark.sql.catalyst.analysis.TypeCheckResult import org.apache.spark.sql.catalyst.util.TypeUtils import org.apache.spark.sql.types.{ DataType, StringType } import org.apache.spark.sql.catalyst.expressions.{ AttributeReference, If, StartsWith, Literal, IsNull, Concat, Substring } case class JsonGroupArray(child: org.apache.spark.sql.catalyst.expressions.Expression) extends DeclarativeAggregate { override def children: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = child :: Nil override def nullable: Boolean = false // Return data type. override def dataType: DataType = StringType override def checkInputDataTypes(): TypeCheckResult = TypeUtils.checkForOrderingExpr(child.dataType, "function json_group_array") private lazy val json_group_array = AttributeReference("json_group_array", StringType)() override lazy val aggBufferAttributes: Seq[AttributeReference] = json_group_array :: Nil override lazy val initialValues: Seq[Literal] = Seq( Literal.create("", StringType) ) override lazy val updateExpressions: Seq[ org.apache.spark.sql.catalyst.expressions.Expression] = Seq( If(IsNull(child), Concat(Seq(json_group_array, Literal(","), Literal("null"))), Concat(Seq(json_group_array, Literal(","), org.apache.spark.sql.catalyst.expressions.Cast(child,StringType,None)))) ) override lazy val mergeExpressions: Seq[org.apache.spark.sql.catalyst.expressions.Expression] = { Seq( Concat(Seq(json_group_array.left, json_group_array.right)) ) } override lazy val evaluateExpression = Concat(Seq(Literal("["), If(StartsWith(json_group_array,Literal(",")),Substring(json_group_array,Literal(2),Literal(Integer.MAX_VALUE)),json_group_array), Literal("]"))) }
Example 140
Source File: PolynomialExpanderSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import io.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class PolynomialExpanderSmokeTest extends AbstractTransformerWrapperSmokeTest[PolynomialExpander] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: PolynomialExpander = { val inPlace = NoInPlaceChoice() .setOutputColumn("polynomial") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new PolynomialExpander() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.degree -> 3 ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(1.0), Vectors.dense(1.0, 2.0) ) val inputAfterDCT = Seq( // x, x^2, x^3 Vectors.dense(1.0, 1.0, 1.0), // x, x^2, x^3, y, x * y, x^2 * y, x * y^2, y^2, y^3 Vectors.dense(1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 4.0, 4.0, 8.0) ) input.zip(inputAfterDCT) } override def inputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT }
Example 141
Source File: ElementwiseProduct.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import org.apache.spark.linalg.{DenseVector, IntSparseVector, LongSparseVector, VectorUDT, Vectors} import com.tencent.angel.sona.ml.UnaryTransformer import com.tencent.angel.sona.ml.param.Param import com.tencent.angel.sona.ml.util.{DefaultParamsWritable, DefaultParamsReadable, Identifiable} import org.apache.spark.sql.types.DataType import org.apache.spark.linalg /** * Outputs the Hadamard product (i.e., the element-wise product) of each input vector with a * provided "weight" vector. In other words, it scales each column of the dataset by a scalar * multiplier. */ class ElementwiseProduct(override val uid: String) extends UnaryTransformer[linalg.Vector, linalg.Vector, ElementwiseProduct] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("elemProd")) /** * the vector to multiply with input vectors * * @group param */ val scalingVec: Param[linalg.Vector] = new Param(this, "scalingVec", "vector for hadamard product") def setScalingVec(value: linalg.Vector): this.type = set(scalingVec, value) def getScalingVec: linalg.Vector = getOrDefault(scalingVec) override protected def createTransformFunc: linalg.Vector => linalg.Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") vector => { require(vector.size == $(scalingVec).size, s"vector sizes do not match: Expected ${$(scalingVec).size} but found ${vector.size}") vector match { case dv: DenseVector => val values: Array[Double] = dv.values.clone() val dim = $(scalingVec).size var i = 0 while (i < dim) { values(i) *= $(scalingVec)(i) i += 1 } Vectors.dense(values) case IntSparseVector(size, indices, vs) => val values = vs.clone() val dim = values.length var i = 0 while (i < dim) { values(i) *= $(scalingVec)(indices(i)) i += 1 } Vectors.sparse(size, indices, values) case LongSparseVector(size, indices, vs) => val values = vs.clone() val dim = values.length var i = 0 while (i < dim) { values(i) *= $(scalingVec)(indices(i)) i += 1 } Vectors.sparse(size, indices, values) case v => throw new IllegalArgumentException("Does not support vector type " + v.getClass) } } } override protected def outputDataType: DataType = new VectorUDT() } object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { override def load(path: String): ElementwiseProduct = super.load(path) }
Example 142
Source File: Normalizer.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import org.apache.spark.linalg.{DenseVector, IntSparseVector, LongSparseVector, VectorUDT, Vectors} import com.tencent.angel.sona.ml.param.{DoubleParam, ParamValidators} import com.tencent.angel.sona.ml.util.{DefaultParamsWritable, Identifiable} import com.tencent.angel.sona.ml.UnaryTransformer import org.apache.spark.sql.types.DataType import com.tencent.angel.sona.ml.util.DefaultParamsReadable import org.apache.spark.linalg /** * Normalize a vector to have unit norm using the given p-norm. */ class Normalizer(override val uid: String) extends UnaryTransformer[linalg.Vector, linalg.Vector, Normalizer] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("normalizer")) /** * Normalization in L^p^ space. Must be greater than equal to 1. * (default: p = 2) * * @group param */ val p = new DoubleParam(this, "p", "the p norm value", ParamValidators.gtEq(1)) setDefault(p -> 2.0) def getP: Double = $(p) def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: linalg.Vector => linalg.Vector = { vector => { val norm = Vectors.norm(vector, $(p)) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case IntSparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case LongSparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } } override protected def outputDataType: DataType = new VectorUDT() } object Normalizer extends DefaultParamsReadable[Normalizer] { override def load(path: String): Normalizer = super.load(path) }
Example 143
package com.tencent.angel.sona.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.linalg.{VectorUDT, Vectors} import com.tencent.angel.sona.ml.UnaryTransformer import com.tencent.angel.sona.ml.param.BooleanParam import com.tencent.angel.sona.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types.DataType import com.tencent.angel.sona.ml.util.DefaultParamsReadable import org.apache.spark.linalg /** * A feature transformer that takes the 1D discrete cosine transform of a real vector. No zero * padding is performed on the input vector. * It returns a real vector of the same length representing the DCT. The return vector is scaled * such that the transform matrix is unitary (aka scaled DCT-II). * * More information on <a href="https://en.wikipedia.org/wiki/Discrete_cosine_transform#DCT-II"> * DCT-II in Discrete cosine transform (Wikipedia)</a>. */ class DCT(override val uid: String) extends UnaryTransformer[linalg.Vector, linalg.Vector, DCT] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("dct")) /** * Indicates whether to perform the inverse DCT (true) or forward DCT (false). * Default: false * * @group param */ def inverse: BooleanParam = new BooleanParam( this, "inverse", "Set transformer to perform inverse DCT") def setInverse(value: Boolean): this.type = set(inverse, value) def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: linalg.Vector => linalg.Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be ${(new VectorUDT).catalogString} but got ${inputType.catalogString}.") } override protected def outputDataType: DataType = new VectorUDT } object DCT extends DefaultParamsReadable[DCT] { override def load(path: String): DCT = super.load(path) }
Example 144
Source File: NGram.scala From sona with Apache License 2.0 | 5 votes |
package com.tencent.angel.sona.ml.feature import com.tencent.angel.sona.ml.UnaryTransformer import com.tencent.angel.sona.ml.param.{IntParam, ParamValidators} import com.tencent.angel.sona.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import com.tencent.angel.sona.ml.util.DefaultParamsReadable import org.apache.spark.util.DataTypeUtil /** * A feature transformer that converts the input array of strings into an array of n-grams. Null * values in the input array are ignored. * It returns an array of n-grams where each n-gram is represented by a space-separated string of * words. * * When the input is empty, an empty array is returned. * When the input array length is less than n (number of elements per n-gram), no n-grams are * returned. */ class NGram(override val uid: String) extends UnaryTransformer[Seq[String], Seq[String], NGram] with DefaultParamsWritable { def this() = this(Identifiable.randomUID("ngram")) /** * Minimum n-gram length, greater than or equal to 1. * Default: 2, bigram features * * @group param */ val n: IntParam = new IntParam(this, "n", "number elements per n-gram (>=1)", ParamValidators.gtEq(1)) def setN(value: Int): this.type = set(n, value) def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(DataTypeUtil.sameType(inputType, ArrayType(StringType)), s"Input type must be ${ArrayType(StringType).catalogString} but got " + inputType.catalogString) } override protected def outputDataType: DataType = new ArrayType(StringType, false) } object NGram extends DefaultParamsReadable[NGram] { override def load(path: String): NGram = super.load(path) }
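The transform function is ordinary Scala collection code, so the same sliding-window expression can be tried outside Spark (the sample tokens are invented):

object NGramSketch {
  def ngrams(tokens: Seq[String], n: Int): List[String] =
    tokens.iterator.sliding(n).withPartial(false).map(_.mkString(" ")).toList

  def main(args: Array[String]): Unit = {
    println(ngrams(Seq("the", "quick", "brown", "fox"), 2)) // List(the quick, quick brown, brown fox)
    println(ngrams(Seq("too", "short"), 3))                 // List(): fewer tokens than n yields no n-grams
  }
}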
Example 145
Source File: AnnotationUtils.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.vcf import org.apache.spark.sql.types.{ArrayType, DataType, IntegerType, StringType, StructField, StructType} // Unified VCF annotation representation, used by SnpEff and VEP object AnnotationUtils { // Delimiter between annotation fields val annotationDelimiter = "|" val annotationDelimiterRegex = "\\|" // Fractional delimiter for struct subfields val structDelimiter = "/" val structDelimiterRegex = "\\/" // Delimiter for array subfields val arrayDelimiter = "&" // Struct subfield schemas private val rankTotalStruct = StructType( Seq(StructField("rank", IntegerType), StructField("total", IntegerType))) private val posLengthStruct = StructType( Seq(StructField("pos", IntegerType), StructField("length", IntegerType))) private val referenceVariantStruct = StructType( Seq(StructField("reference", StringType), StructField("variant", StringType))) // Special schemas for SnpEff subfields private val snpEffFieldsToSchema: Map[String, DataType] = Map( "Annotation" -> ArrayType(StringType), "Rank" -> rankTotalStruct, "cDNA_pos/cDNA_length" -> posLengthStruct, "CDS_pos/CDS_length" -> posLengthStruct, "AA_pos/AA_length" -> posLengthStruct, "Distance" -> IntegerType ) // Special schemas for VEP subfields private val vepFieldsToSchema: Map[String, DataType] = Map( "Consequence" -> ArrayType(StringType), "EXON" -> rankTotalStruct, "INTRON" -> rankTotalStruct, "cDNA_position" -> IntegerType, "CDS_position" -> IntegerType, "Protein_position" -> IntegerType, "Amino_acids" -> referenceVariantStruct, "Codons" -> referenceVariantStruct, "Existing_variation" -> ArrayType(StringType), "DISTANCE" -> IntegerType, "STRAND" -> IntegerType, "FLAGS" -> ArrayType(StringType) ) // Special schemas for LOFTEE (as VEP plugin) subfields private val lofteeFieldsToSchema: Map[String, DataType] = Map( "LoF_filter" -> ArrayType(StringType), "LoF_flags" -> ArrayType(StringType), "LoF_info" -> ArrayType(StringType) ) // Default string schema for annotation subfield val allFieldsToSchema: Map[String, DataType] = (snpEffFieldsToSchema ++ vepFieldsToSchema ++ lofteeFieldsToSchema).withDefaultValue(StringType) }
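The delimiters above are meant for splitting raw SnpEff/VEP annotation strings. A rough sketch of how such a string could be decomposed with those regexes (the annotation text is invented, not taken from glow's test data):

object AnnotationSplitSketch {
  val annotationDelimiterRegex = "\\|"
  val structDelimiterRegex = "\\/"
  val arrayDelimiter = "&"

  def main(args: Array[String]): Unit = {
    val ann = "T|missense_variant&splice_region_variant|MODERATE|1/10"
    val fields = ann.split(annotationDelimiterRegex, -1)            // Array(T, missense_variant&splice_region_variant, MODERATE, 1/10)
    val effects = fields(1).split(arrayDelimiter)                   // array subfield: Array(missense_variant, splice_region_variant)
    val Array(rank, total) = fields(3).split(structDelimiterRegex)  // rank/total struct subfield: "1" and "10"
    println(s"allele=${fields(0)} effects=${effects.mkString(",")} rank=$rank total=$total")
  }
}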
Example 146
Source File: package.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types.DataType package object dsl { trait ImplicitOperators { def expr: Expression private def makeLambdaFunction(f: Expression => Expression): LambdaFunction = { val x = UnresolvedNamedLambdaVariable(Seq("x")) LambdaFunction(f(x), Seq(x)) } private def makeLambdaFunction(f: (Expression, Expression) => Expression): LambdaFunction = { val x = UnresolvedNamedLambdaVariable(Seq("x")) val y = UnresolvedNamedLambdaVariable(Seq("y")) LambdaFunction(f(x, y), Seq(x, y)) } def arrayTransform(fn: Expression => Expression): Expression = { ArrayTransform(expr, makeLambdaFunction(fn)) } def arrayTransform(fn: (Expression, Expression) => Expression): Expression = { ArrayTransform(expr, makeLambdaFunction(fn)) } def filter(f: Expression => Expression): Expression = { ArrayFilter(expr, makeLambdaFunction(f)) } def filter(f: (Expression, Expression) => Expression): Expression = { ArrayFilter(expr, makeLambdaFunction(f)) } def aggregate( initialValue: Expression, merge: (Expression, Expression) => Expression, finish: Expression => Expression = identity): Expression = { ArrayAggregate( expr, initialValue, makeLambdaFunction(merge), makeLambdaFunction(finish) ) } } implicit class GlowExpression(val expr: Expression) extends ImplicitOperators }
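With the implicit class in scope, any catalyst Expression picks up these helpers. A usage sketch, assuming the dsl package object above is importable; the column name xs and the arithmetic are purely illustrative:

import io.projectglow.sql.dsl._
import org.apache.spark.sql.catalyst.analysis.UnresolvedAttribute
import org.apache.spark.sql.catalyst.expressions.{Add, Literal, Multiply}

object DslSketch {
  // Builds the expression trees transform(xs, x -> x * 2) and aggregate(xs, 0, (acc, x) -> acc + x)
  val doubled = UnresolvedAttribute("xs").arrayTransform(x => Multiply(x, Literal(2)))
  val summed = UnresolvedAttribute("xs").aggregate(Literal(0), (acc, x) => Add(acc, x))
}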
Example 147
Source File: SqlExtensionProviderSuite.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.sql import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, Expression, Literal, UnaryExpression} import org.apache.spark.sql.types.{DataType, IntegerType} import io.projectglow.GlowSuite class SqlExtensionProviderSuite extends GlowSuite { override def beforeAll(): Unit = { super.beforeAll() SqlExtensionProvider.registerFunctions( spark.sessionState.conf, spark.sessionState.functionRegistry, "test-functions.yml") } private lazy val sess = spark test("one arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("one_arg_test(id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("one_arg_test()").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("one_arg_test(id, id)").collect() } } test("two arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("two_arg_test(id, id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("two_arg_test(id)").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("two_arg_test(id, id, id)").collect() } } test("var args function") { import sess.implicits._ assert(spark.range(1).selectExpr("var_args_test(id, id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("var_args_test(id, id, id, id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("var_args_test(id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("var_args_test()").collect() } } test("can call optional arg function") { import sess.implicits._ assert(spark.range(1).selectExpr("optional_arg_test(id)").as[Int].head() == 1) assert(spark.range(1).selectExpr("optional_arg_test(id, id)").as[Int].head() == 1) intercept[AnalysisException] { spark.range(1).selectExpr("optional_arg_test()").collect() } intercept[AnalysisException] { spark.range(1).selectExpr("optional_arg_test(id, id, id)").collect() } } } trait TestExpr extends Expression with CodegenFallback { override def dataType: DataType = IntegerType override def nullable: Boolean = true override def eval(input: InternalRow): Any = 1 } case class OneArgExpr(child: Expression) extends UnaryExpression with TestExpr case class TwoArgExpr(left: Expression, right: Expression) extends BinaryExpression with TestExpr case class VarArgsExpr(arg: Expression, varArgs: Seq[Expression]) extends TestExpr { override def children: Seq[Expression] = arg +: varArgs } case class OptionalArgExpr(required: Expression, optional: Expression) extends TestExpr { def this(required: Expression) = this(required, Literal(1)) override def children: Seq[Expression] = Seq(required, optional) }
Example 148
Source File: RColumnTransformer.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables import java.util.UUID import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.OperationExecutionDispatcher.Result import io.deepsense.deeplang.params.{CodeSnippetLanguage, CodeSnippetParam, Param} import org.apache.spark.sql.types.DataType class RColumnTransformer() extends CustomCodeColumnTransformer { override val codeParameter = CodeSnippetParam( name = "column operation code", description = None, language = CodeSnippetLanguage(CodeSnippetLanguage.r) ) setDefault(codeParameter -> """transform.column <- function(column, column.name) { | return(column) |}""".stripMargin ) override def getSpecificParams: Array[Param[_]] = Array(codeParameter, targetType) override def getComposedCode( userCode: String, inputColumn: String, outputColumn: String, targetType: DataType): String = { val newFieldName = UUID.randomUUID().toString.replace("-", "") s""" |$userCode | |transform <- function(dataframe) { | new.column <- cast(transform.column(dataframe$$'$inputColumn', '$inputColumn'), | '${targetType.simpleString}') | return(withColumn(dataframe, '$newFieldName', new.column)) |} """.stripMargin } override def runCode(context: ExecutionContext, code: String): Result = context.customCodeExecutor.runR(code) override def isValid(context: ExecutionContext, code: String): Boolean = context.customCodeExecutor.isRValid(code) }
Example 149
Source File: CustomCodeColumnTransformer.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperations.exceptions.CustomOperationExecutionException import io.deepsense.deeplang.OperationExecutionDispatcher.Result import io.deepsense.deeplang.params.{CodeSnippetParam, Param} import io.deepsense.deeplang.params.choice.ChoiceParam import org.apache.spark.sql.types.{DataType, StructField, StructType} abstract class CustomCodeColumnTransformer() extends MultiColumnTransformer { import CustomCodeColumnTransformer._ val targetType = ChoiceParam[TargetTypeChoice]( name = "target type", description = Some("Target type of the columns.")) def getTargetType: TargetTypeChoice = $(targetType) def setTargetType(value: TargetTypeChoice): this.type = set(targetType, value) val codeParameter: CodeSnippetParam def getCodeParameter: String = $(codeParameter) def setCodeParameter(value: String): this.type = set(codeParameter, value) def runCode(context: ExecutionContext, code: String): Result def isValid(context: ExecutionContext, code: String): Boolean def getComposedCode( userCode: String, inputColumn: String, outputColumn: String, targetType: DataType): String override def getSpecificParams: Array[Param[_]] private def executeCode( code: String, inputColumn: String, outputColumn: String, context: ExecutionContext, dataFrame: DataFrame): DataFrame = { runCode(context, code) match { case Left(error) => throw CustomOperationExecutionException(s"Execution exception:\n\n$error") case Right(_) => val sparkDataFrame = context.dataFrameStorage.getOutputDataFrame(OutputPortNumber).getOrElse { throw CustomOperationExecutionException( "Operation finished successfully, but did not produce a DataFrame.") } val newSparkDataFrame = context.sparkSQLSession.createDataFrame( sparkDataFrame.rdd, transformSingleColumnSchema(inputColumn, outputColumn, dataFrame.schema.get).get) DataFrame.fromSparkDataFrame(newSparkDataFrame) } } override def transformSingleColumn( inputColumn: String, outputColumn: String, context: ExecutionContext, dataFrame: DataFrame): DataFrame = { val code = getComposedCode( $(codeParameter), inputColumn, outputColumn, getTargetType.columnType) logger.debug(s"Code to be validated and executed:\n$code") if (!isValid(context, code)) { throw CustomOperationExecutionException("Code validation failed") } context.dataFrameStorage.withInputDataFrame(InputPortNumber, dataFrame.sparkDataFrame) { executeCode(code, inputColumn, outputColumn, context, dataFrame) } } override def transformSingleColumnSchema( inputColumn: String, outputColumn: String, schema: StructType): Option[StructType] = { MultiColumnTransformer.assertColumnExist(inputColumn, schema) MultiColumnTransformer.assertColumnDoesNotExist(outputColumn, schema) Some(schema.add(StructField(outputColumn, getTargetType.columnType, nullable = true))) } } object CustomCodeColumnTransformer { val InputPortNumber: Int = 0 val OutputPortNumber: Int = 0 }
Example 150
Source File: StringTokenizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class StringTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[StringTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StringTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new StringTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "this is a test", "this values should be separated", "Bla bla bla!" ) val tokenized = strings.map { _.toLowerCase.split("\\s") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 151
Source File: TypeConversionConstraint.scala From drunken-data-quality with Apache License 2.0 | 5 votes |
package de.frosner.ddq.constraints import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Column, DataFrame} import scala.util.Try case class TypeConversionConstraint(columnName: String, convertedType: DataType) extends Constraint { val fun = (df: DataFrame) => { val originalColumn = new Column(columnName) val castedColumnName = columnName + "_casted" val maybeCasted = Try(df.select(originalColumn, originalColumn.cast(convertedType).as(castedColumnName))) val maybeFailedCastsAndOriginalType = maybeCasted.map(casted => { val failedCastsCount = casted.filter(new Column(castedColumnName).isNull && originalColumn.isNotNull).count val originalType = df.schema.find(_.name == columnName).get.dataType (failedCastsCount, originalType) }) TypeConversionConstraintResult( constraint = this, data = maybeFailedCastsAndOriginalType.toOption.map{ case (failedCastsCount, originalType) => TypeConversionConstraintResultData( originalType = originalType, failedRows = failedCastsCount ) }, status = ConstraintUtil.tryToStatus[Long](maybeFailedCastsAndOriginalType.map{ case (failedCastsCount, originalType) => failedCastsCount }, _ == 0) ) } } case class TypeConversionConstraintResult(constraint: TypeConversionConstraint, data: Option[TypeConversionConstraintResultData], status: ConstraintStatus) extends ConstraintResult[TypeConversionConstraint] { val message: String = { val convertedType = constraint.convertedType val columnName = constraint.columnName val maybePluralSVerb = data.map(data => if (data.failedRows == 1) ("", "is") else ("s", "are")) (status, data, maybePluralSVerb) match { case (ConstraintSuccess, Some(TypeConversionConstraintResultData(originalType, 0)), _) => s"Column $columnName can be converted from $originalType to $convertedType." case (ConstraintFailure, Some(TypeConversionConstraintResultData(originalType, failedRows)), Some((pluralS, verb))) => s"Column $columnName cannot be converted from $originalType to $convertedType. " + s"$failedRows row$pluralS could not be converted." case (ConstraintError(throwable), None, None) => s"Checking whether column $columnName can be converted to $convertedType failed: $throwable" case default => throw IllegalConstraintResultException(this) } } } case class TypeConversionConstraintResultData(originalType: DataType, failedRows: Long)
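The core of the check, cast the column and count rows where the cast comes back null while the original was not, can be reproduced with stock Spark SQL. A sketch assuming a local SparkSession and a toy DataFrame:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.IntegerType

object CastCheckSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("cast-check").getOrCreate()
    import spark.implicits._

    val df = Seq("1", "2", "not-a-number").toDF("amount")
    val casted = df.select(col("amount"), col("amount").cast(IntegerType).as("amount_casted"))
    // Rows that existed but could not be converted, the quantity the constraint reports
    val failedCasts = casted.filter(col("amount_casted").isNull && col("amount").isNotNull).count()
    println(s"failed casts: $failedCasts") // 1
    spark.stop()
  }
}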
Example 152
Source File: BinarizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{DataType, DoubleType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class BinarizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Binarizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Binarizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("binarizerOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val binarizer = new Binarizer() binarizer.set( binarizer.singleOrMultiChoiceParam -> single, binarizer.threshold -> 0.5) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.2, 0.5, 1.8) val outputNumbers = Seq(0.0, 0.0, 1.0) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = DoubleType }
Example 153
Source File: DiscreteCosineTransformerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import io.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class DiscreteCosineTransformerSmokeTest extends AbstractTransformerWrapperSmokeTest[DiscreteCosineTransformer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: DiscreteCosineTransformer = { val inPlace = NoInPlaceChoice() .setOutputColumn("dct") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new DiscreteCosineTransformer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.inverse -> false ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(0.0), Vectors.dense(1.0), Vectors.dense(2.0) ) val inputAfterDCT = Seq( Vectors.dense(0.0), Vectors.dense(1.0), Vectors.dense(2.0) ) input.zip(inputAfterDCT) } override def inputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT }
Example 154
Source File: RegexTokenizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class RegexTokenizerSmokeTest extends AbstractTransformerWrapperSmokeTest[RegexTokenizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: RegexTokenizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("tokenized") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("s")) .setInPlace(inPlace) val transformer = new RegexTokenizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.gaps -> false, transformer.minTokenLength -> 1, transformer.pattern -> "\\d+" ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( "100 200 300", "400 500 600", "700 800 900" ) val tokenized = strings.map { _.toLowerCase.split(" ") } strings.zip(tokenized) } override def inputType: DataType = StringType override def outputType: DataType = new ArrayType(StringType, true) }
Example 155
Source File: OneHotEncoderSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import io.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.{DataType, DoubleType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class OneHotEncoderSmokeTest extends AbstractTransformerWrapperSmokeTest[OneHotEncoder] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: OneHotEncoder = { val inPlace = NoInPlaceChoice() .setOutputColumn("oneHotEncoderOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("d")) .setInPlace(inPlace) val oneHotEncoder = new OneHotEncoder() oneHotEncoder.set( oneHotEncoder.singleOrMultiChoiceParam -> single, oneHotEncoder.dropLast -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(0.0, 1.0) val outputNumbers = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)) inputNumbers.zip(outputNumbers) } override def inputType: DataType = DoubleType override def outputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT }
Example 156
Source File: NGramTransformerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class NGramTransformerSmokeTest extends AbstractTransformerWrapperSmokeTest[NGramTransformer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: NGramTransformer = { val inPlace = NoInPlaceChoice() .setOutputColumn("ngrams") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val transformer = new NGramTransformer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.n -> 2 ): _*) } override def testValues: Seq[(Any, Any)] = { val strings = Seq( Array("a", "b", "c"), Array("d", "e", "f") ) val ngrams = Seq( Array("a b", "b c"), Array("d e", "e f") ) strings.zip(ngrams) } override def inputType: DataType = new ArrayType(StringType, true) override def outputType: DataType = new ArrayType(StringType, false) }
Example 157
Source File: StopWordsRemoverSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class StopWordsRemoverSmokeTest extends AbstractTransformerWrapperSmokeTest[StopWordsRemover] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: StopWordsRemover = { val inPlace = NoInPlaceChoice() .setOutputColumn("stopWordsRemoverOutput") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("as")) .setInPlace(inPlace) val stopWordsRemover = new StopWordsRemover() stopWordsRemover.set( stopWordsRemover.singleOrMultiChoiceParam -> single, stopWordsRemover.caseSensitive -> false) } override def testValues: Seq[(Any, Any)] = { val inputNumbers = Seq(Array("a", "seahorse", "The", "Horseshoe", "Crab")) val outputNumbers = Seq(Array("seahorse", "Horseshoe", "Crab")) inputNumbers.zip(outputNumbers) } override def inputType: DataType = ArrayType(StringType) override def outputType: DataType = ArrayType(StringType) }
Example 158
Source File: NormalizerSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.spark.wrappers.transformers import io.deepsense.sparkutils.Linalg.Vectors import org.apache.spark.sql.types.DataType import io.deepsense.deeplang.doperables.multicolumn.MultiColumnParams.SingleOrMultiColumnChoices.SingleColumnChoice import io.deepsense.deeplang.doperables.multicolumn.SingleColumnParams.SingleTransformInPlaceChoices.NoInPlaceChoice import io.deepsense.deeplang.params.selections.NameSingleColumnSelection class NormalizerSmokeTest extends AbstractTransformerWrapperSmokeTest[Normalizer] with MultiColumnTransformerWrapperTestSupport { override def transformerWithParams: Normalizer = { val inPlace = NoInPlaceChoice() .setOutputColumn("normalize") val single = SingleColumnChoice() .setInputColumn(NameSingleColumnSelection("v")) .setInPlace(inPlace) val transformer = new Normalizer() transformer.set(Seq( transformer.singleOrMultiChoiceParam -> single, transformer.p -> 1.0 ): _*) } override def testValues: Seq[(Any, Any)] = { val input = Seq( Vectors.dense(0.0, 100.0, 100.0), Vectors.dense(1.0, 1.0, 0.0), Vectors.dense(-3.0, 3.0, 0.0) ) val inputAfterNormalize = Seq( Vectors.dense(0.0, 0.5, 0.5), Vectors.dense(0.5, 0.5, 0.0), Vectors.dense(-0.5, 0.5, 0.0) ) input.zip(inputAfterNormalize) } override def inputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT override def outputType: DataType = new io.deepsense.sparkutils.Linalg.VectorUDT }
Example 159
Source File: StructFieldJsonProtocol.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.reportlib.model import org.apache.spark.sql.types.{DataType, StructField} import spray.json._ import io.deepsense.commons.json.EnumerationSerializer import io.deepsense.commons.types.{ColumnType, SparkConversions} trait StructFieldJsonProtocol extends DefaultJsonProtocol with MetadataJsonProtocol with DataTypeJsonProtocol { implicit val failureCodeFormat = EnumerationSerializer.jsonEnumFormat(ColumnType) // StructField format without metadata, with deeplangType appended implicit val structFieldFormat = new RootJsonFormat[StructField] { val c = (s: String, d: DataType, b: Boolean) => StructField(s, d, b) implicit val rawFormat = jsonFormat(c, "name", "dataType", "nullable") override def write(obj: StructField): JsValue = { val jsObject = obj.toJson(rawFormat).asJsObject val deeplangType = SparkConversions.sparkColumnTypeToColumnType(obj.dataType) JsObject(jsObject.fields + ("deeplangType" -> deeplangType.toJson)) } override def read(json: JsValue): StructField = { json.convertTo(rawFormat) } } }
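For comparison, Spark can already round-trip schemas through JSON on its own. A small sketch using only the built-in serializers, independent of the spray-json protocol above:

import org.apache.spark.sql.types.{DataType, IntegerType, StringType, StructField, StructType}

object SchemaJsonSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("name", StringType, nullable = true),
      StructField("age", IntegerType, nullable = false)))
    val json = schema.json                 // built-in JSON form of the schema
    val restored = DataType.fromJson(json) // parses back to an equal StructType
    println(json)
    println(restored == schema)            // true
  }
}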
Example 160
Source File: similarityFunctions.scala From spark-stringmetric with MIT License | 5 votes |
package com.github.mrpowers.spark.stringmetric.expressions

import com.github.mrpowers.spark.stringmetric.unsafe.UTF8StringFunctions
import org.apache.commons.text.similarity.CosineDistance
import org.apache.spark.unsafe.types.UTF8String
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.codegen.{ CodegenContext, ExprCode }
import org.apache.spark.sql.types.{ DataType, IntegerType, StringType }

trait UTF8StringFunctionsHelper {
  val stringFuncs: String = "com.github.mrpowers.spark.stringmetric.unsafe.UTF8StringFunctions"
}

trait StringString2IntegerExpression
  extends ImplicitCastInputTypes
  with NullIntolerant
  with UTF8StringFunctionsHelper { self: BinaryExpression =>

  override def dataType: DataType = IntegerType
  override def inputTypes: Seq[DataType] = Seq(StringType, StringType)
  protected override def nullSafeEval(left: Any, right: Any): Any = -1
}

case class HammingDistance(left: Expression, right: Expression)
  extends BinaryExpression with StringString2IntegerExpression {

  override def prettyName: String = "hamming"

  // nullSafeEval receives the already-evaluated child values, so those values
  // (not the child expressions themselves) are cast to UTF8String.
  override def nullSafeEval(leftVal: Any, rightVal: Any): Any = {
    val leftStr = leftVal.asInstanceOf[UTF8String]
    val rightStr = rightVal.asInstanceOf[UTF8String]
    UTF8StringFunctions.hammingDistance(leftStr, rightStr)
  }

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    defineCodeGen(ctx, ev, (s1, s2) => s"$stringFuncs.hammingDistance($s1, $s2)")
  }
}
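One way such a catalyst expression can be attached to a DataFrame query is by wrapping it in a Column. A sketch assuming a DataFrame with string columns s1 and s2; the library may also expose a friendlier function wrapper, which is not shown here:

import org.apache.spark.sql.{Column, SparkSession}
import org.apache.spark.sql.functions.col
import com.github.mrpowers.spark.stringmetric.expressions.HammingDistance

object HammingSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("hamming").getOrCreate()
    import spark.implicits._

    val df = Seq(("toned", "roses"), ("karolin", "kathrin")).toDF("s1", "s2")
    // Wrap the catalyst expression so it can be used like any other column
    val hamming = new Column(HammingDistance(col("s1").expr, col("s2").expr))
    df.withColumn("hamming", hamming).show()
    spark.stop()
  }
}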
Example 161
Source File: Normalizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector, VectorUDT} import org.apache.spark.sql.types.DataType def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) normalizer.transform } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
Example 162
Source File: DCT.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.mllib.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.sql.types.DataType def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 163
Source File: NGram.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.sql.types.{ArrayType, DataType, StringType} def getN: Int = $(n) setDefault(n -> 2) override protected def createTransformFunc: Seq[String] => Seq[String] = { _.iterator.sliding($(n)).withPartial(false).map(_.mkString(" ")).toSeq } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") } override protected def outputDataType: DataType = new ArrayType(StringType, false) } @Since("1.6.0") object NGram extends DefaultParamsReadable[NGram] { @Since("1.6.0") override def load(path: String): NGram = super.load(path) }
Example 164
Source File: MonotonicallyIncreasingID.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{GeneratedExpressionCode, CodeGenContext} import org.apache.spark.sql.types.{LongType, DataType} @transient private[this] var count: Long = _ @transient private[this] var partitionMask: Long = _ override protected def initInternal(): Unit = { count = 0L partitionMask = TaskContext.getPartitionId().toLong << 33 } override def nullable: Boolean = false override def dataType: DataType = LongType override protected def evalInternal(input: InternalRow): Long = { val currentCount = count count += 1 partitionMask + currentCount } override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val countTerm = ctx.freshName("count") val partitionMaskTerm = ctx.freshName("partitionMask") ctx.addMutableState(ctx.JAVA_LONG, countTerm, s"$countTerm = 0L;") ctx.addMutableState(ctx.JAVA_LONG, partitionMaskTerm, s"$partitionMaskTerm = ((long) org.apache.spark.TaskContext.getPartitionId()) << 33;") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $partitionMaskTerm + $countTerm; $countTerm++; """ } }
Example 165
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.TaskContext import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode} import org.apache.spark.sql.types.{DataType, DoubleType} import org.apache.spark.util.Utils import org.apache.spark.util.random.XORShiftRandom case class Randn(seed: Long) extends RDG { override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian() def this() = this(Utils.random.nextLong()) def this(seed: Expression) = this(seed match { case IntegerLiteral(s) => s case _ => throw new AnalysisException("Input argument to rand must be an integer literal.") }) override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = { val rngTerm = ctx.freshName("rng") val className = classOf[XORShiftRandom].getName ctx.addMutableState(className, rngTerm, s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());") ev.isNull = "false" s""" final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian(); """ } }
Example 166
Source File: MapData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util import org.apache.spark.sql.types.DataType abstract class MapData extends Serializable { def numElements(): Int def keyArray(): ArrayData def valueArray(): ArrayData def copy(): MapData def foreach(keyType: DataType, valueType: DataType, f: (Any, Any) => Unit): Unit = { val length = numElements() val keys = keyArray() val values = valueArray() var i = 0 while (i < length) { f(keys.get(i, keyType), values.get(i, valueType)) i += 1 } } }
Example 167
Source File: MySQLDialect.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc import java.sql.Types import org.apache.spark.sql.types.{BooleanType, LongType, DataType, MetadataBuilder} private case object MySQLDialect extends JdbcDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql") override def getCatalystType( sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = { if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) { // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as // byte arrays instead of longs. md.putLong("binarylong", 1) Option(LongType) } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) { Option(BooleanType) } else None } override def quoteIdentifier(colName: String): String = { s"`$colName`" } override def getTableExistsQuery(table: String): String = { s"SELECT 1 FROM $table LIMIT 1" } }
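Dialects like this one are looked up by JDBC URL through JdbcDialects. A sketch of the lookup and the identifier-quoting behavior (the URL is a placeholder):

import org.apache.spark.sql.jdbc.JdbcDialects

object DialectSketch {
  def main(args: Array[String]): Unit = {
    val dialect = JdbcDialects.get("jdbc:mysql://localhost:3306/test")
    println(dialect.quoteIdentifier("order"))         // `order`
    println(dialect.getTableExistsQuery("customers")) // SELECT 1 FROM customers LIMIT 1
  }
}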
Example 168
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow} import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics} import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation} import org.apache.spark.sql.types.DataType import org.apache.spark.sql.{Row, SQLContext} object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericMutableRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } //private[sql] case class PhysicalRDD( output: Seq[Attribute], rdd: RDD[InternalRow], override val nodeName: String, override val metadata: Map[String, String] = Map.empty, override val outputsUnsafeRows: Boolean = false) extends LeafNode { protected override def doExecute(): RDD[InternalRow] = rdd override def simpleString: String = { val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value" s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}" } } private[sql] object PhysicalRDD { // Metadata keys val INPUT_PATHS = "InputPaths" val PUSHED_FILTERS = "PushedFilters" def createFromDataSource( output: Seq[Attribute], rdd: RDD[InternalRow], relation: BaseRelation, metadata: Map[String, String] = Map.empty): PhysicalRDD = { // All HadoopFsRelations output UnsafeRows val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation] PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows) } }
Example 169
Source File: CatalystDataToAvro.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.avro import com.hortonworks.registries.schemaregistry.{SchemaCompatibility, SchemaMetadata} import com.hortonworks.registries.schemaregistry.avro.AvroSchemaProvider import com.hortonworks.registries.schemaregistry.client.SchemaRegistryClient import com.hortonworks.registries.schemaregistry.serdes.avro.AvroSnapshotSerializer import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.types.{BinaryType, DataType} import scala.collection.JavaConverters._ case class CatalystDataToAvro( child: Expression, schemaName: String, recordName: String, nameSpace: String, config: Map[String, Object] ) extends UnaryExpression { override def dataType: DataType = BinaryType private val topLevelRecordName = if (recordName == "") schemaName else recordName @transient private lazy val avroType = SchemaConverters.toAvroType(child.dataType, child.nullable, topLevelRecordName, nameSpace) @transient private lazy val avroSer = new AvroSerializer(child.dataType, avroType, child.nullable) @transient private lazy val srSer: AvroSnapshotSerializer = { val obj = new AvroSnapshotSerializer() obj.init(config.asJava) obj } @transient private lazy val srClient = new SchemaRegistryClient(config.asJava) @transient private lazy val schemaMetadata = { var schemaMetadataInfo = srClient.getSchemaMetadataInfo(schemaName) if (schemaMetadataInfo == null) { val generatedSchemaMetadata = new SchemaMetadata.Builder(schemaName). `type`(AvroSchemaProvider.TYPE) .schemaGroup("Autogenerated group") .description("Autogenerated schema") .compatibility(SchemaCompatibility.BACKWARD).build srClient.addSchemaMetadata(generatedSchemaMetadata) generatedSchemaMetadata } else { schemaMetadataInfo.getSchemaMetadata } } override def nullSafeEval(input: Any): Any = { val avroData = avroSer.serialize(input) srSer.serialize(avroData.asInstanceOf[Object], schemaMetadata) } override def simpleString: String = { s"to_sr(${child.sql}, ${child.dataType.simpleString})" } override def sql: String = { s"to_sr(${child.sql}, ${child.dataType.catalogString})" } override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { val expr = ctx.addReferenceObj("this", this) defineCodeGen(ctx, ev, input => s"(byte[]) $expr.nullSafeEval($input)") } }
Example 170
Source File: DataTypeUtil.scala From sona with Apache License 2.0 | 5 votes |
package org.apache.spark.util import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ArrayType, DataType, MapType, StructType} object DataTypeUtil { def sameType(left: DataType, right: DataType): Boolean = if (SQLConf.get.caseSensitiveAnalysis) { equalsIgnoreNullability(left, right) } else { equalsIgnoreCaseAndNullability(left, right) } private def equalsIgnoreNullability(left: DataType, right: DataType): Boolean = { (left, right) match { case (ArrayType(leftElementType, _), ArrayType(rightElementType, _)) => equalsIgnoreNullability(leftElementType, rightElementType) case (MapType(leftKeyType, leftValueType, _), MapType(rightKeyType, rightValueType, _)) => equalsIgnoreNullability(leftKeyType, rightKeyType) && equalsIgnoreNullability(leftValueType, rightValueType) case (StructType(leftFields), StructType(rightFields)) => leftFields.length == rightFields.length && leftFields.zip(rightFields).forall { case (l, r) => l.name == r.name && equalsIgnoreNullability(l.dataType, r.dataType) } case (l, r) => l == r } } private def equalsIgnoreCaseAndNullability(from: DataType, to: DataType): Boolean = { (from, to) match { case (ArrayType(fromElement, _), ArrayType(toElement, _)) => equalsIgnoreCaseAndNullability(fromElement, toElement) case (MapType(fromKey, fromValue, _), MapType(toKey, toValue, _)) => equalsIgnoreCaseAndNullability(fromKey, toKey) && equalsIgnoreCaseAndNullability(fromValue, toValue) case (StructType(fromFields), StructType(toFields)) => fromFields.length == toFields.length && fromFields.zip(toFields).forall { case (l, r) => l.name.equalsIgnoreCase(r.name) && equalsIgnoreCaseAndNullability(l.dataType, r.dataType) } case (fromDataType, toDataType) => fromDataType == toDataType } } }
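A quick sketch of what sameType tolerates: nullability differences always, field-name casing only when the analysis is case-insensitive. It assumes the sona utility above is on the classpath:

import org.apache.spark.sql.types.{ArrayType, IntegerType, StringType, StructField, StructType}
import org.apache.spark.util.DataTypeUtil

object SameTypeSketch {
  def main(args: Array[String]): Unit = {
    // Nullability differences are always ignored
    println(DataTypeUtil.sameType(
      ArrayType(StringType, containsNull = true),
      ArrayType(StringType, containsNull = false))) // true

    // Field-name casing matters only under case-sensitive analysis
    val a = StructType(Seq(StructField("ID", IntegerType)))
    val b = StructType(Seq(StructField("id", IntegerType)))
    println(DataTypeUtil.sameType(a, b)) // true unless spark.sql.caseSensitive is enabled
  }
}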
Example 171
Source File: S3AParquetRelationSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc.cloud import com.cloudera.spark.cloud.s3.S3ATestSetup import org.apache.spark.sql.sources.CloudRelationBasicSuite import org.apache.spark.sql.types.{CalendarIntervalType, DataType, NullType} class S3AParquetRelationSuite extends CloudRelationBasicSuite with S3ATestSetup { init() def init(): Unit = { // propagate S3 credentials if (enabled) { initFS() } } override val dataSourceName: String = "parquet" // Parquet does not play well with NullType. override protected def supportsDataType( dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false case _ => true } }
Example 172
Source File: S3AParquetRelationScaleSuite.scala From cloud-integration with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.orc.cloud import com.cloudera.spark.cloud.s3.S3ATestSetup import org.apache.spark.sql.sources.CloudRelationScaleTest import org.apache.spark.sql.types.{CalendarIntervalType, DataType, NullType} class S3AParquetRelationScaleSuite extends CloudRelationScaleTest with S3ATestSetup { init() def init(): Unit = { // propagate S3 credentials if (enabled) { initFS() } } override def enabled: Boolean = super.enabled && isScaleTestEnabled override val dataSourceName: String = "parquet" // Parquet does not play well with NullType. override protected def supportsDataType( dataType: DataType): Boolean = dataType match { case _: NullType => false case _: CalendarIntervalType => false case _ => true } }
Example 173
Source File: FieldPoly.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.nio.ByteBuffer import com.esri.core.geometry.MultiPath import org.apache.spark.sql.types.{DataType, Metadata} @deprecated("not used", "0.4") abstract class FieldPoly(name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) extends FieldBytes(name, dataType, nullValueAllowed, metadata) { protected var dx = 0L protected var dy = 0L def addPath(byteBuffer: ByteBuffer, numCoordinates: Int, path: MultiPath) = { 0 until numCoordinates foreach (n => { dx += byteBuffer.getVarInt dy += byteBuffer.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig n match { case 0 => path.startPath(x, y) case _ => path.lineTo(x, y) } }) path } }
Example 174
Source File: FieldBytes.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.nio.ByteBuffer import org.apache.spark.sql.types.{DataType, Metadata} abstract class FieldBytes(name: String, dataType: DataType, nullValueAllowed: Boolean, metadata: Metadata = Metadata.empty ) extends Field(name, dataType, nullValueAllowed, metadata) { protected var m_bytes = new Array[Byte](1024) def getByteBuffer(byteBuffer: ByteBuffer) = { val numBytes = fillVarBytes(byteBuffer) ByteBuffer.wrap(m_bytes, 0, numBytes) } def fillVarBytes(byteBuffer: ByteBuffer) = { val numBytes = byteBuffer.getVarUInt.toInt if (numBytes > m_bytes.length) { m_bytes = new Array[Byte](numBytes) } 0 until numBytes foreach { m_bytes(_) = byteBuffer.get } numBytes } }
Example 175
Source File: FieldPolygon.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.nio.ByteBuffer import com.esri.core.geometry.Polygon import com.esri.udt.PolygonUDT import org.apache.spark.sql.types.{DataType, Metadata} @deprecated("not used", "0.4") object FieldPolygon { def apply(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) = { new FieldPolygonEsri(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata) } } @deprecated("not used", "0.4") abstract class FieldPolygon(name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata ) extends FieldPoly(name, dataType, nullValueAllowed, xOrig, yOrig, xyScale, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val polygon = new Polygon() val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin dx = 0L dy = 0L if (numParts > 1) { var sum = 0 val numCoordSeq = 1 to numParts map (part => { val numCoord = if (part == numParts) { numPoints - sum } else { blob.getVarUInt.toInt } sum += numCoord numCoord }) // TODO - fix shells and holes based on https://github.com/rouault/dump_gdbtable/wiki/FGDB-Spec numCoordSeq.foreach(numCoord => addPath(blob, numCoord, polygon)) } else { addPath(blob, numPoints, polygon) } polygon } } @deprecated("not used", "0.4") class FieldPolygonEsri(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) extends FieldPolygon(name, new PolygonUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata)
Example 176
Source File: FieldPoly2Type.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.nio.ByteBuffer import org.apache.spark.sql.types.{DataType, Metadata} abstract class FieldPoly2Type[T](name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) extends FieldBytes(name, dataType, nullValueAllowed, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt if (numPoints == 0) createPolyType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double]) else { val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin var dx = 0L var dy = 0L val xyNum = new Array[Int](numParts) val xyArr = new Array[Double](numPoints * 2) if (numParts > 1) { var i = 0 var sum = 0 1 to numParts foreach (partIndex => { if (partIndex == numParts) { xyNum(i) = numPoints - sum } else { val numXY = blob.getVarUInt.toInt xyNum(i) = numXY sum += numXY i += 1 } }) i = 0 xyNum.foreach(numXY => { 0 until numXY foreach (n => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 1 }) }) } else { xyNum(0) = numPoints var i = 0 0 until numPoints foreach (n => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 1 }) } createPolyType(xmin, ymin, xmax, ymax, xyNum, xyArr) } } def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T }
Example 177
Source File: FieldPoly3Type.scala From spark-gdb with Apache License 2.0 | 5 votes |
package com.esri.gdb import java.nio.ByteBuffer import org.apache.spark.sql.types.{DataType, Metadata} abstract class FieldPoly3Type[T](name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, nOrig: Double, xyScale: Double, nScale: Double, metadata: Metadata) extends FieldBytes(name, dataType, nullValueAllowed, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt // TODO - Handle zero num points in other geom type. if (numPoints == 0) { createPolyMType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double]) } else { val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin var dx = 0L var dy = 0L val xyNum = new Array[Int](numParts) val xyArr = new Array[Double](numPoints * 3) var i = 0 if (numParts > 1) { var sum = 0 1 to numParts foreach (partIndex => { if (partIndex == numParts) { xyNum(i) = numPoints - sum } else { val numXY = blob.getVarUInt.toInt xyNum(i) = numXY sum += numXY i += 1 } }) i = 0 xyNum.foreach(numXY => { 0 until numXY foreach (_ => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 2 }) }) } else { xyNum(0) = numPoints 0 until numPoints foreach (_ => { dx += blob.getVarInt dy += blob.getVarInt xyArr(i) = dx / xyScale + xOrig i += 1 xyArr(i) = dy / xyScale + yOrig i += 2 }) } i = 2 var dn = 0L 0 until numPoints foreach (_ => { dn += blob.getVarInt xyArr(i) = dn / nScale + nOrig i += 3 }) createPolyMType(xmin, ymin, xmax, ymax, xyNum, xyArr) } } def createPolyMType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T }
Example 178
Source File: SnowballStemmer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import org.tartarus.snowball.ext.EnglishStemmer class SnowballStemmer(override val uid: String) extends UnaryTransformer[Seq[String], Seq[String], SnowballStemmer] with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("snowballStemmer")) } override def createTransformFunc: Seq[String] => Seq[String] = { strings => val stemmer = new EnglishStemmer() strings.map((str: String) => { try { stemmer.setCurrent(str) stemmer.stem() stemmer.getCurrent() } catch { case _: Exception => str } }) } override def validateInputType(inputType: DataType): Unit = { require(inputType == ArrayType(StringType), s"Input type must be string type but got $inputType.") } override def outputDataType: DataType = { ArrayType(StringType) } override def copy(extra: ParamMap): SnowballStemmer = { defaultCopy(extra) } } object SnowballStemmer extends DefaultParamsReadable[SnowballStemmer]
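The transformer wraps the Snowball EnglishStemmer, so the same calls can be exercised directly outside Spark (the sample words are arbitrary):

import org.tartarus.snowball.ext.EnglishStemmer

object StemmerSketch {
  def main(args: Array[String]): Unit = {
    val stemmer = new EnglishStemmer()
    val words = Seq("running", "flies", "databases")
    val stems = words.map { w =>
      stemmer.setCurrent(w)
      stemmer.stem()
      stemmer.getCurrent()
    }
    println(stems) // e.g. "running" -> "run"
  }
}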
Example 179
Source File: monotonicaggregates.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.aggregates

import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, AttributeSet, Expression, Greatest, Least, Literal, Unevaluable}
import org.apache.spark.sql.catalyst.expressions.aggregate._
import org.apache.spark.sql.catalyst.util.TypeUtils
import org.apache.spark.sql.types.{AbstractDataType, AnyDataType, DataType}

abstract class MonotonicAggregateFunction extends DeclarativeAggregate with Serializable {}

case class MMax(child: Expression) extends MonotonicAggregateFunction {
  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmax")

  private lazy val mmax = AttributeReference("mmax", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmax :: Nil

  // Start from null and keep the running maximum with Greatest.
  override lazy val initialValues: Seq[Literal] = Seq(Literal.create(null, child.dataType))

  override lazy val updateExpressions: Seq[Expression] = Seq(Greatest(Seq(mmax, child)))

  override lazy val mergeExpressions: Seq[Expression] = Seq(Greatest(Seq(mmax.left, mmax.right)))

  override lazy val evaluateExpression: AttributeReference = mmax
}

case class MMin(child: Expression) extends MonotonicAggregateFunction {
  override def children: Seq[Expression] = child :: Nil

  override def nullable: Boolean = true

  // Return data type.
  override def dataType: DataType = child.dataType

  // Expected input data type.
  override def inputTypes: Seq[AbstractDataType] = Seq(AnyDataType)

  override def checkInputDataTypes(): TypeCheckResult =
    TypeUtils.checkForOrderingExpr(child.dataType, "function mmin")

  private lazy val mmin = AttributeReference("mmin", child.dataType)()

  override lazy val aggBufferAttributes: Seq[AttributeReference] = mmin :: Nil

  // Start from null and keep the running minimum with Least.
  override lazy val initialValues: Seq[Literal] = Seq(Literal.create(null, child.dataType))

  override lazy val updateExpressions: Seq[Expression] = Seq(Least(Seq(mmin, child)))

  override lazy val mergeExpressions: Seq[Expression] = Seq(Least(Seq(mmin.left, mmin.right)))

  override lazy val evaluateExpression: AttributeReference = mmin
}

case class MonotonicAggregateExpression(aggregateFunction: MonotonicAggregateFunction,
                                        mode: AggregateMode,
                                        isDistinct: Boolean)
  extends Expression with Unevaluable {

  override def children: Seq[Expression] = aggregateFunction :: Nil

  override def dataType: DataType = aggregateFunction.dataType

  override def foldable: Boolean = false

  override def nullable: Boolean = aggregateFunction.nullable

  override def references: AttributeSet = {
    val childReferences = mode match {
      case Partial | Complete => aggregateFunction.references.toSeq
      case PartialMerge | Final => aggregateFunction.aggBufferAttributes
    }
    AttributeSet(childReferences)
  }

  override def prettyString: String = aggregateFunction.prettyString

  override def toString: String = s"(${aggregateFunction},mode=$mode,isDistinct=$isDistinct)"
}