org.apache.spark.sql.types.Metadata Scala Examples
The following examples show how to use org.apache.spark.sql.types.Metadata.
You can go to the original project or source file by following the link above each example.
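Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column and key names are made up) showing the typical lifecycle of a Metadata value: build it with MetadataBuilder, attach it to a column with Column.as, and read it back from the resulting schema.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

object MetadataQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("metadata-quickstart").getOrCreate()
    import spark.implicits._

    // Build an immutable Metadata value.
    val ageMeta: Metadata = new MetadataBuilder()
      .putString("description", "customer age in years")
      .putLong("version", 1L)
      .build()

    // Attach it to a column; the metadata travels with the schema.
    val df = Seq((1, 34), (2, 58)).toDF("id", "age")
      .withColumn("age", col("age").as("age", ageMeta))

    // Read it back from the schema.
    println(df.schema("age").metadata.getString("description"))

    spark.stop()
  }
}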
Example 1
Source File: FieldPoly2Type.scala From spark-gdb with Apache License 2.0
package com.esri.gdb import java.nio.ByteBuffer import org.apache.spark.sql.types.{DataType, Metadata} abstract class FieldPoly2Type[T](name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) extends FieldBytes(name, dataType, nullValueAllowed, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt if (numPoints == 0) createPolyType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double]) else { val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin var dx = 0L var dy = 0L val xyNum = new Array[Int](numParts) val xyArr = new Array[Double](numPoints * 2) if (numParts > 1) { var i = 0 var sum = 0 1 to numParts foreach (partIndex => { if (partIndex == numParts) { xyNum(i) = numPoints - sum } else { val numXY = blob.getVarUInt.toInt xyNum(i) = numXY sum += numXY i += 1 } }) i = 0 xyNum.foreach(numXY => { 0 until numXY foreach (n => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 1 }) }) } else { xyNum(0) = numPoints var i = 0 0 until numPoints foreach (n => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 1 }) } createPolyType(xmin, ymin, xmax, ymax, xyNum, xyArr) } } def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T }
Example 2
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}

class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*): this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))
      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if (x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f =>
      if ($(inputCols).contains(f.name)) {
        StructField(f.name, StringType, f.nullable, f.metadata)
      } else {
        f
      }))
}
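A hypothetical usage sketch for the transformer above; the DataFrame and column name are assumptions (for example, a column produced by StringIndexer that carries ML attribute metadata):

// Hypothetical usage: "indexedDf" and "labelIndexed" are made-up names; the column
// is assumed to carry ML attribute metadata so indices can be mapped back to names.
val assigner = new NameAssigner().setInputCols("labelIndexed")
val withNames = assigner.transform(indexedDf)
withNames.select("labelIndexed").show()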
Example 3
Source File: MetadataSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
}
Example 4
Source File: MetadataSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{MetadataBuilder, Metadata} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 5
Source File: FieldDateTime.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer
import java.sql.Timestamp

import org.apache.spark.sql.types.{Metadata, TimestampType}

class FieldDateTime(name: String, nullValueAllowed: Boolean, metadata: Metadata)
  extends Field(name, TimestampType, nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val numDays = byteBuffer.getDouble
    // convert days since 12/30/1899 to 1/1/1970
    val unixDays = numDays - 25569
    val millis = (unixDays * 1000 * 60 * 60 * 24).ceil.toLong
    new Timestamp(millis)
  }
}
Example 6
Source File: FieldPointMType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointMType, PointMUDT}
import org.apache.spark.sql.types.Metadata

object FieldPointMType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            mOrig: Double,
            xyScale: Double,
            mScale: Double,
            metadata: Metadata) = {
    new FieldPointMType(name, nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata)
  }
}

class FieldPointMType(name: String,
                      nullValueAllowed: Boolean,
                      xOrig: Double,
                      yOrig: Double,
                      mOrig: Double,
                      xyScale: Double,
                      mScale: Double,
                      metadata: Metadata)
  extends FieldBytes(name, new PointMUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt()

    val vx = blob.getVarUInt
    val vy = blob.getVarUInt
    val vm = blob.getVarUInt

    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig
    val m = (vm - 1.0) / mScale + mOrig

    new PointMType(x, y, m)
  }
}
Example 7
Source File: FieldPoly.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.core.geometry.MultiPath
import org.apache.spark.sql.types.{DataType, Metadata}

@deprecated("not used", "0.4")
abstract class FieldPoly(name: String,
                         dataType: DataType,
                         nullValueAllowed: Boolean,
                         xOrig: Double,
                         yOrig: Double,
                         xyScale: Double,
                         metadata: Metadata)
  extends FieldBytes(name, dataType, nullValueAllowed, metadata) {

  protected var dx = 0L
  protected var dy = 0L

  def addPath(byteBuffer: ByteBuffer, numCoordinates: Int, path: MultiPath) = {
    0 until numCoordinates foreach (n => {
      dx += byteBuffer.getVarInt
      dy += byteBuffer.getVarInt
      val x = dx / xyScale + xOrig
      val y = dy / xyScale + yOrig
      n match {
        case 0 => path.startPath(x, y)
        case _ => path.lineTo(x, y)
      }
    })
    path
  }
}
Example 8
Source File: FieldBytes.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import org.apache.spark.sql.types.{DataType, Metadata}

abstract class FieldBytes(name: String,
                          dataType: DataType,
                          nullValueAllowed: Boolean,
                          metadata: Metadata = Metadata.empty)
  extends Field(name, dataType, nullValueAllowed, metadata) {

  protected var m_bytes = new Array[Byte](1024)

  def getByteBuffer(byteBuffer: ByteBuffer) = {
    val numBytes = fillVarBytes(byteBuffer)
    ByteBuffer.wrap(m_bytes, 0, numBytes)
  }

  def fillVarBytes(byteBuffer: ByteBuffer) = {
    val numBytes = byteBuffer.getVarUInt.toInt
    if (numBytes > m_bytes.length) {
      m_bytes = new Array[Byte](numBytes)
    }
    0 until numBytes foreach {
      m_bytes(_) = byteBuffer.get
    }
    numBytes
  }
}
Example 9
Source File: FieldPolylineMType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolylineMType, PolylineMUDT}
import org.apache.spark.sql.types.Metadata

object FieldPolylineMType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            mOrig: Double,
            xyScale: Double,
            mScale: Double,
            metadata: Metadata) = {
    new FieldPolylineMType(name, nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata)
  }
}

class FieldPolylineMType(name: String,
                         nullValueAllowed: Boolean,
                         xOrig: Double,
                         yOrig: Double,
                         mOrig: Double,
                         xyScale: Double,
                         mScale: Double,
                         metadata: Metadata)
  extends FieldPoly3Type[PolylineMType](
    name, new PolylineMUDT(), nullValueAllowed, xOrig, yOrig, mOrig, xyScale, mScale, metadata) {

  override def createPolyMType(xmin: Double, ymin: Double, xmax: Double, ymax: Double,
                               xyNum: Array[Int], xyArr: Array[Double]) = {
    PolylineMType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
}
Example 10
Source File: FieldPolygon.scala From spark-gdb with Apache License 2.0
package com.esri.gdb import java.nio.ByteBuffer import com.esri.core.geometry.Polygon import com.esri.udt.PolygonUDT import org.apache.spark.sql.types.{DataType, Metadata} @deprecated("not used", "0.4") object FieldPolygon { def apply(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) = { new FieldPolygonEsri(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata) } } @deprecated("not used", "0.4") abstract class FieldPolygon(name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata ) extends FieldPoly(name, dataType, nullValueAllowed, xOrig, yOrig, xyScale, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val polygon = new Polygon() val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin dx = 0L dy = 0L if (numParts > 1) { var sum = 0 val numCoordSeq = 1 to numParts map (part => { val numCoord = if (part == numParts) { numPoints - sum } else { blob.getVarUInt.toInt } sum += numCoord numCoord }) // TODO - fix shells and holes based on https://github.com/rouault/dump_gdbtable/wiki/FGDB-Spec numCoordSeq.foreach(numCoord => addPath(blob, numCoord, polygon)) } else { addPath(blob, numPoints, polygon) } polygon } } @deprecated("not used", "0.4") class FieldPolygonEsri(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) extends FieldPolygon(name, new PolygonUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata)
Example 11
Source File: MetadataSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{MetadataBuilder, Metadata} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() //元数据构建器和getter test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } //元数据的JSON转换 test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 12
Source File: FieldPolyline.scala From spark-gdb with Apache License 2.0
package com.esri.gdb import java.nio.ByteBuffer import com.esri.core.geometry.Polyline import com.esri.udt.PolylineUDT import org.apache.spark.sql.types.Metadata @deprecated("not used", "0.4") object FieldPolyline extends Serializable { def apply(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata) = { new FieldPolyline(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata) } } @deprecated("not used", "0.4") class FieldPolyline(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, xyScale: Double, metadata: Metadata ) extends FieldPoly(name, new PolylineUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val polyline = new Polyline() val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin dx = 0L dy = 0L if (numParts > 1) { var sum = 0 val numCoordSeq = 1 to numParts map (part => { val numCoord = if (part == numParts) { numPoints - sum } else { blob.getVarUInt.toInt } sum += numCoord numCoord }) numCoordSeq.foreach(numCoord => addPath(blob, numCoord, polyline)) } else { addPath(blob, numPoints, polyline) } polyline } }
Example 13
Source File: FieldPointZMType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb import java.nio.ByteBuffer import com.esri.udt.{PointZMType, PointZMUDT} import org.apache.spark.sql.types.Metadata object FieldPointZMType extends Serializable { def apply(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, zOrig: Double, mOrig: Double, xyScale: Double, zScale: Double, mScale: Double, metadata: Metadata ) = { new FieldPointZMType(name, nullValueAllowed, xOrig, yOrig, zOrig, mOrig, xyScale, zScale, mScale, metadata) } } class FieldPointZMType(name: String, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, zOrig: Double, mOrig: Double, xyScale: Double, zScale: Double, mScale: Double, metadata: Metadata) extends FieldBytes(name, new PointZMUDT(), nullValueAllowed, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val vx = blob.getVarUInt() val vy = blob.getVarUInt() val x = (vx - 1.0) / xyScale + xOrig val y = (vy - 1.0) / xyScale + yOrig geomType match { // Point case 1 => new PointZMType(x, y) // PointZ case 9 => val vz = blob.getVarUInt val z = (vz - 1.0) / zScale + zOrig new PointZMType(x, y, z) // PointM case 21 => val vm = blob.getVarUInt val m = (vm - 1.0) / mScale + mOrig new PointZMType(x, y, 0.0, m) // PointZM case _ => val vz = blob.getVarUInt val vm = blob.getVarUInt val z = (vz - 1.0) / zScale + zOrig val m = (vm - 1.0) / mScale + mOrig new PointZMType(x, y, z, m) } } }
Example 14
Source File: FieldPolylineType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolylineType, PolylineUDT}
import org.apache.spark.sql.types.Metadata

object FieldPolylineType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolylineType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPolylineType(name: String,
                        nullValueAllowed: Boolean,
                        xOrig: Double,
                        yOrig: Double,
                        xyScale: Double,
                        metadata: Metadata)
  extends FieldPoly2Type[PolylineType](name, new PolylineUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double,
                              xyNum: Array[Int], xyArr: Array[Double]): PolylineType = {
    PolylineType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
}
Example 15
Source File: FieldPointZType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointZType, PointZUDT}
import org.apache.spark.sql.types.Metadata

object FieldPointZType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            zOrig: Double,
            xyScale: Double,
            zScale: Double,
            metadata: Metadata) = {
    new FieldPointZType(name, nullValueAllowed, xOrig, yOrig, zOrig, xyScale, zScale, metadata)
  }
}

class FieldPointZType(name: String,
                      nullValueAllowed: Boolean,
                      xOrig: Double,
                      yOrig: Double,
                      zOrig: Double,
                      xyScale: Double,
                      zScale: Double,
                      metadata: Metadata)
  extends FieldBytes(name, new PointZUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    val geomType = blob.getVarUInt

    val vx = blob.getVarUInt
    val vy = blob.getVarUInt
    val vz = blob.getVarUInt

    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig
    val z = (vz - 1.0) / zScale + zOrig

    new PointZType(x, y, z)
  }
}
Example 16
Source File: FieldPoly3Type.scala From spark-gdb with Apache License 2.0
package com.esri.gdb import java.nio.ByteBuffer import org.apache.spark.sql.types.{DataType, Metadata} abstract class FieldPoly3Type[T](name: String, dataType: DataType, nullValueAllowed: Boolean, xOrig: Double, yOrig: Double, nOrig: Double, xyScale: Double, nScale: Double, metadata: Metadata) extends FieldBytes(name, dataType, nullValueAllowed, metadata) { override def readValue(byteBuffer: ByteBuffer, oid: Int) = { val blob = getByteBuffer(byteBuffer) val geomType = blob.getVarUInt val numPoints = blob.getVarUInt.toInt // TODO - Handle zero num points in other geom type. if (numPoints == 0) { createPolyMType(0, 0, 0, 0, Array.empty[Int], Array.empty[Double]) } else { val numParts = blob.getVarUInt.toInt val xmin = blob.getVarUInt / xyScale + xOrig val ymin = blob.getVarUInt / xyScale + yOrig val xmax = blob.getVarUInt / xyScale + xmin val ymax = blob.getVarUInt / xyScale + ymin var dx = 0L var dy = 0L val xyNum = new Array[Int](numParts) val xyArr = new Array[Double](numPoints * 3) var i = 0 if (numParts > 1) { var sum = 0 1 to numParts foreach (partIndex => { if (partIndex == numParts) { xyNum(i) = numPoints - sum } else { val numXY = blob.getVarUInt.toInt xyNum(i) = numXY sum += numXY i += 1 } }) i = 0 xyNum.foreach(numXY => { 0 until numXY foreach (_ => { dx += blob.getVarInt dy += blob.getVarInt val x = dx / xyScale + xOrig val y = dy / xyScale + yOrig xyArr(i) = x i += 1 xyArr(i) = y i += 2 }) }) } else { xyNum(0) = numPoints 0 until numPoints foreach (_ => { dx += blob.getVarInt dy += blob.getVarInt xyArr(i) = dx / xyScale + xOrig i += 1 xyArr(i) = dy / xyScale + yOrig i += 2 }) } i = 2 var dn = 0L 0 until numPoints foreach (_ => { dn += blob.getVarInt xyArr(i) = dn / nScale + nOrig i += 3 }) createPolyMType(xmin, ymin, xmax, ymax, xyNum, xyArr) } } def createPolyMType(xmin: Double, ymin: Double, xmax: Double, ymax: Double, xyNum: Array[Int], xyArr: Array[Double]): T }
Example 17
Source File: FieldPolygonType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import com.esri.udt.{PolygonType, PolygonUDT}
import org.apache.spark.sql.types.Metadata

object FieldPolygonType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPolygonType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPolygonType(name: String,
                       nullValueAllowed: Boolean,
                       xOrig: Double,
                       yOrig: Double,
                       xyScale: Double,
                       metadata: Metadata)
  extends FieldPoly2Type[PolygonType](name, new PolygonUDT(), nullValueAllowed, xOrig, yOrig, xyScale, metadata) {

  override def createPolyType(xmin: Double, ymin: Double, xmax: Double, ymax: Double,
                              xyNum: Array[Int], xyArr: Array[Double]): PolygonType = {
    PolygonType(xmin, ymin, xmax, ymax, xyNum, xyArr)
  }
}
Example 18
Source File: FieldPointType.scala From spark-gdb with Apache License 2.0
package com.esri.gdb

import java.nio.ByteBuffer

import com.esri.udt.{PointType, PointUDT}
import org.apache.spark.sql.types.Metadata

object FieldPointType extends Serializable {
  def apply(name: String,
            nullValueAllowed: Boolean,
            xOrig: Double,
            yOrig: Double,
            xyScale: Double,
            metadata: Metadata) = {
    new FieldPointType(name, nullValueAllowed, xOrig, yOrig, xyScale, metadata)
  }
}

class FieldPointType(name: String,
                     nullValueAllowed: Boolean,
                     xOrig: Double,
                     yOrig: Double,
                     xyScale: Double,
                     metadata: Metadata)
  extends FieldBytes(name, new PointUDT(), nullValueAllowed, metadata) {

  override def readValue(byteBuffer: ByteBuffer, oid: Int) = {
    val blob = getByteBuffer(byteBuffer)

    blob.getVarUInt() // geomType

    val vx = blob.getVarUInt()
    val vy = blob.getVarUInt()
    val x = (vx - 1.0) / xyScale + xOrig
    val y = (vy - 1.0) / xyScale + yOrig

    new PointType(x, y)
  }
}
Example 19
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}

object DatasetUtil {

  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map {
      case ((colName: String, col: Column), metadata: Metadata) =>
        colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String],
                                         colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
   * Cast a column in a Dataset to Vector type.
   *
   * The supported data types of the input column are
   * - Vector
   * - float/double type Array.
   *
   * Note: The returned column does not have Metadata.
   *
   * @param dataset input DataFrame
   * @param colName column name.
   * @return Vector column
   */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }
}
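A brief, hypothetical usage sketch of the withColumn helper above (df and the column names are made up); it attaches custom metadata while adding a derived column:

// Hypothetical usage of DatasetUtil.withColumn.
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.MetadataBuilder

val priceMeta = new MetadataBuilder().putString("source", "derived").build()
val withMeta = DatasetUtil.withColumn(df, "priceDoubled", col("price") * 2, priceMeta)
println(withMeta.schema("priceDoubled").metadata)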
Example 20
Source File: GBTClassifierSmokeTest.scala From seahorse-workflow-executor with Apache License 2.0
package io.deepsense.deeplang.doperables.spark.wrappers.estimators import org.apache.spark.sql.types.{DoubleType, Metadata, StructType} import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity import io.deepsense.deeplang.params.ParamPair import io.deepsense.deeplang.params.selections.NameSingleColumnSelection import io.deepsense.deeplang.utils.DataFrameUtils class GBTClassifierSmokeTest extends AbstractEstimatorModelWrapperSmokeTest { override def className: String = "GBTClassifier" override val estimator = new GBTClassifier() private val labelColumnName = "myRating" import estimator.vanillaGBTClassifier._ override val estimatorParams: Seq[ParamPair[_]] = Seq( featuresColumn -> NameSingleColumnSelection("myFeatures"), impurity -> ClassificationImpurity.Entropy(), labelColumn -> NameSingleColumnSelection(labelColumnName), lossType -> GBTClassifier.Logistic(), maxBins -> 2.0, maxDepth -> 6.0, maxIterations -> 10.0, minInfoGain -> 0.0, minInstancesPerNode -> 1, predictionColumn -> "prediction", seed -> 100.0, stepSize -> 0.11, subsamplingRate -> 0.999 ) override def assertTransformedDF(dataFrame: DataFrame): Unit = { val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName) val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction") actualValues.diff(possibleValues) shouldBe empty } override def assertTransformedSchema(schema: StructType): Unit = { val predictionColumn = schema.fields.last predictionColumn.name shouldBe "prediction" predictionColumn.dataType shouldBe DoubleType predictionColumn.metadata shouldBe Metadata.empty } }
Example 21
Source File: MetricUtils.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.core.metrics import com.microsoft.ml.spark.core.schema.{SchemaConstants, SparkSchema} import com.microsoft.ml.spark.core.schema.SchemaConstants.MMLTag import org.apache.spark.sql.types.injections.MetadataUtilities import org.apache.spark.sql.types.{Metadata, StructField, StructType} object MetricUtils { def isClassificationMetric(metric: String): Boolean = { if (MetricConstants.RegressionMetrics.contains(metric)) false else if (MetricConstants.ClassificationMetrics.contains(metric)) true else throw new Exception("Invalid metric specified") } def getSchemaInfo(schema: StructType, labelCol: Option[String], evaluationMetric: String): (String, String, String) = { val schemaInfo = tryGetSchemaInfo(schema) if (schemaInfo.isDefined) { schemaInfo.get } else { if (labelCol.isEmpty) { throw new Exception("Please score the model prior to evaluating") } else if (evaluationMetric == MetricConstants.AllSparkMetrics) { throw new Exception("Please specify whether you are using evaluation for " + MetricConstants.ClassificationMetricsName + " or " + MetricConstants.RegressionMetricsName + " instead of " + MetricConstants.AllSparkMetrics) } ("custom model", labelCol.get, if (isClassificationMetric(evaluationMetric)) SchemaConstants.ClassificationKind else SchemaConstants.RegressionKind) } } private def tryGetSchemaInfo(schema: StructType): Option[(String, String, String)] = { // TODO: evaluate all models; for now, get first model name found val firstModelName = schema.collectFirst { case StructField(_, _, _, m) if getFirstModelName(m) != null && getFirstModelName(m).isDefined => getFirstModelName(m).get } if (firstModelName.isEmpty) None else { val modelName = firstModelName.get val labelColumnName = SparkSchema.getLabelColumnName(schema, modelName) val scoreValueKind = SparkSchema.getScoreValueKind(schema, modelName, labelColumnName) Option((modelName, labelColumnName, scoreValueKind)) } } private def getFirstModelName(colMetadata: Metadata): Option[String] = { if (!colMetadata.contains(MMLTag)) null else { val mlTagMetadata = colMetadata.getMetadata(MMLTag) val metadataKeys = MetadataUtilities.getMetadataKeys(mlTagMetadata) metadataKeys.find(key => key.startsWith(SchemaConstants.ScoreModelPrefix)) } } }
Example 22
Source File: SparkWrapper.scala From tispark with Apache License 2.0
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.3"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
}
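A short, hypothetical usage sketch of the wrapper above, building an AttributeReference that carries (empty) column metadata:

// Hypothetical usage of SparkWrapper.newAttributeReference.
import org.apache.spark.sql.types.{IntegerType, Metadata}

val ageAttr = SparkWrapper.newAttributeReference("age", IntegerType, nullable = true, Metadata.empty)
println(ageAttr.metadata)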
Example 23
Source File: SparkWrapper.scala From tispark with Apache License 2.0
package com.pingcap.tispark

import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, Expression}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, SubqueryAlias}
import org.apache.spark.sql.types.{DataType, Metadata}

object SparkWrapper {
  def getVersion: String = {
    "SparkWrapper-2.4"
  }

  def newSubqueryAlias(identifier: String, child: LogicalPlan): SubqueryAlias = {
    SubqueryAlias(identifier, child)
  }

  def newAlias(child: Expression, name: String): Alias = {
    Alias(child, name)()
  }

  def newAttributeReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)()
  }

  def callSessionCatalogCreateTable(
      obj: SessionCatalog,
      tableDefinition: CatalogTable,
      ignoreIfExists: Boolean): Unit = {
    obj.createTable(tableDefinition, ignoreIfExists)
  }
}
Example 24
Source File: MetadataSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Metadata, MetadataBuilder} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 25
Source File: TableColumnsParser.scala From HANAVora-Extensions with Apache License 2.0
package org.apache.spark.sql.parser

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.util.DataTypeParser
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

trait TableColumnsParser
  extends AbstractSparkSQLParser
  with DataTypeParser
  with AnnotationParser {

  protected def commentIndicator: Keyword

  protected lazy val columnName = acceptMatch("column name", {
    case lexical.Identifier(chars) => chars
    case lexical.Keyword(chars) if !sqlReservedWords.contains(chars.toUpperCase) => chars
  })

  protected lazy val tableColumns: Parser[Seq[StructField]] =
    "(" ~> repsep(annotatedCol, ",") <~ ")"

  protected lazy val annotatedCol: Parser[StructField] =
    columnName ~ metadata ~ dataType ^^ {
      case name ~ md ~ typ =>
        StructField(name, typ, nullable = true, metadata = toTableMetadata(md))
    } |
    columnName ~ dataType ~ (commentIndicator ~> stringLit).? ^^ {
      case name ~ typ ~ cm =>
        val meta = cm match {
          case Some(comment) =>
            new MetadataBuilder().putString(commentIndicator.str.toLowerCase, comment).build()
          case None => Metadata.empty
        }
        StructField(name, typ, nullable = true, meta)
    }
}
Example 26
Source File: FieldExtractor.scala From HANAVora-Extensions with Apache License 2.0
package org.apache.spark.sql.execution.tablefunctions

import org.apache.spark.sql.types.{DataType, Metadata}

// Enclosing companion object inferred from the factory method and trailing brace;
// the FieldExtractor class itself is not shown in this excerpt.
object FieldExtractor {
  def apply(index: Int,
            tableName: String,
            originalTableName: String,
            name: String,
            originalName: String,
            dataType: DataType,
            metadata: Metadata,
            isNullable: Boolean,
            checkStar: Boolean): FieldExtractor =
    new FieldExtractor(
      index,
      tableName,
      originalTableName,
      name,
      originalName,
      DataTypeExtractor(dataType),
      AnnotationsExtractor(metadata, checkStar),
      isNullable)
}
Example 27
Source File: AnnotationsExtractor.scala From HANAVora-Extensions with Apache License 2.0
package org.apache.spark.sql.execution.tablefunctions

import org.apache.spark.sql.types.{Metadata, MetadataAccessor}

// The case class header is not shown in this excerpt; it is inferred from the
// companion apply below, which passes the result of MetadataAccessor.metadataToMap.
case class AnnotationsExtractor(metadata: Map[String, Any], checkStar: Boolean) {

  lazy val annotations: Map[String, String] =
    metadata
      .filter {
        case (k, v) if checkStar => k != "*"
        case _ => true
      }
      .mapValues(_.toString)
}

object AnnotationsExtractor {
  def apply(metadata: Metadata, checkStar: Boolean): AnnotationsExtractor =
    AnnotationsExtractor(MetadataAccessor.metadataToMap(metadata), checkStar)
}
Example 28
Source File: MetadataSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Metadata, MetadataBuilder} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 29
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression}
import org.apache.spark.sql.types.{DataType, Metadata}

object ExpressionHelper {

  def createReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata,
      exprId: ExprId,
      qualifier: Option[String],
      attrRef: NamedExpression = null): AttributeReference = {
    AttributeReference(name, dataType, nullable, metadata)(exprId, qualifier)
  }

  def createAlias(
      child: Expression,
      name: String,
      exprId: ExprId = NamedExpression.newExprId,
      qualifier: Option[String] = None,
      explicitMetadata: Option[Metadata] = None,
      namedExpr: Option[NamedExpression] = None): Alias = {
    Alias(child, name)(exprId, qualifier, explicitMetadata)
  }

  def getTheLastQualifier(reference: AttributeReference): String = {
    reference.qualifier.head
  }
}
Example 30
Source File: ExpressionHelper.scala From carbondata with Apache License 2.0
package org.apache.carbondata.mv.plans.modular

import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeReference, ExprId, Expression, NamedExpression}
import org.apache.spark.sql.types.{DataType, Metadata}

object ExpressionHelper {

  def createReference(
      name: String,
      dataType: DataType,
      nullable: Boolean,
      metadata: Metadata,
      exprId: ExprId,
      qualifier: Option[String],
      attrRef: NamedExpression = null): AttributeReference = {
    val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty
    AttributeReference(name, dataType, nullable, metadata)(exprId, qf)
  }

  def createAlias(
      child: Expression,
      name: String,
      exprId: ExprId,
      qualifier: Option[String]): Alias = {
    val qf = if (qualifier.nonEmpty) Seq(qualifier.get) else Seq.empty
    Alias(child, name)(exprId, qf, None)
  }

  def getTheLastQualifier(reference: AttributeReference): String = {
    reference.qualifier.reverse.head
  }
}
Example 31
Source File: MetadataSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Metadata, MetadataBuilder} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 32
Source File: MetadataSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{Metadata, MetadataBuilder} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 33
Source File: MetadataSuite.scala From iolap with Apache License 2.0
package org.apache.spark.sql.catalyst.util import org.json4s.jackson.JsonMethods.parse import org.apache.spark.SparkFunSuite import org.apache.spark.sql.types.{MetadataBuilder, Metadata} class MetadataSuite extends SparkFunSuite { val baseMetadata = new MetadataBuilder() .putString("purpose", "ml") .putBoolean("isBase", true) .build() val summary = new MetadataBuilder() .putLong("numFeatures", 10L) .build() val age = new MetadataBuilder() .putString("name", "age") .putLong("index", 1L) .putBoolean("categorical", false) .putDouble("average", 45.0) .build() val gender = new MetadataBuilder() .putString("name", "gender") .putLong("index", 5) .putBoolean("categorical", true) .putStringArray("categories", Array("male", "female")) .build() val metadata = new MetadataBuilder() .withMetadata(baseMetadata) .putBoolean("isBase", false) // overwrite an existing key .putMetadata("summary", summary) .putLongArray("long[]", Array(0L, 1L)) .putDoubleArray("double[]", Array(3.0, 4.0)) .putBooleanArray("boolean[]", Array(true, false)) .putMetadataArray("features", Array(age, gender)) .build() test("metadata builder and getters") { assert(age.contains("summary") === false) assert(age.contains("index") === true) assert(age.getLong("index") === 1L) assert(age.contains("average") === true) assert(age.getDouble("average") === 45.0) assert(age.contains("categorical") === true) assert(age.getBoolean("categorical") === false) assert(age.contains("name") === true) assert(age.getString("name") === "age") assert(metadata.contains("purpose") === true) assert(metadata.getString("purpose") === "ml") assert(metadata.contains("isBase") === true) assert(metadata.getBoolean("isBase") === false) assert(metadata.contains("summary") === true) assert(metadata.getMetadata("summary") === summary) assert(metadata.contains("long[]") === true) assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L)) assert(metadata.contains("double[]") === true) assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0)) assert(metadata.contains("boolean[]") === true) assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false)) assert(gender.contains("categories") === true) assert(gender.getStringArray("categories").toSeq === Seq("male", "female")) assert(metadata.contains("features") === true) assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender)) } test("metadata json conversion") { val json = metadata.json withClue("toJson must produce a valid JSON string") { parse(json) } val parsed = Metadata.fromJson(json) assert(parsed === metadata) assert(parsed.## === metadata.##) } }
Example 34
Source File: GBTClassifierSmokeTest.scala From seahorse with Apache License 2.0
package ai.deepsense.deeplang.doperables.spark.wrappers.estimators import org.apache.spark.sql.types.{DoubleType, Metadata, StructType} import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.doperables.spark.wrappers.params.common.ClassificationImpurity import ai.deepsense.deeplang.params.ParamPair import ai.deepsense.deeplang.params.selections.NameSingleColumnSelection import ai.deepsense.deeplang.utils.DataFrameUtils class GBTClassifierSmokeTest extends AbstractEstimatorModelWrapperSmokeTest { override def className: String = "GBTClassifier" override val estimator = new GBTClassifier() private val labelColumnName = "myRating" import estimator.vanillaGBTClassifier._ override val estimatorParams: Seq[ParamPair[_]] = Seq( featuresColumn -> NameSingleColumnSelection("myFeatures"), impurity -> ClassificationImpurity.Entropy(), labelColumn -> NameSingleColumnSelection(labelColumnName), lossType -> GBTClassifier.Logistic(), maxBins -> 2.0, maxDepth -> 6.0, maxIterations -> 10.0, minInfoGain -> 0.0, minInstancesPerNode -> 1, predictionColumn -> "prediction", seed -> 100.0, stepSize -> 0.11, subsamplingRate -> 0.999 ) override def assertTransformedDF(dataFrame: DataFrame): Unit = { val possibleValues = DataFrameUtils.collectValues(dataFrame, labelColumnName) val actualValues = DataFrameUtils.collectValues(dataFrame, "prediction") actualValues.diff(possibleValues) shouldBe empty } override def assertTransformedSchema(schema: StructType): Unit = { val predictionColumn = schema.fields.last predictionColumn.name shouldBe "prediction" predictionColumn.dataType shouldBe DoubleType predictionColumn.metadata shouldBe Metadata.empty } }
Example 35
Source File: MinVarianceFilterMetadata.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.stages.impl.MetadataLike
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

import scala.util.{Failure, Success, Try}

// Enclosing companion object inferred from the trailing brace; the MinVarianceSummary
// case class and the statisticsFromMetadata helper are not shown in this excerpt.
object MinVarianceSummary {

  def fromMetadata(meta: Metadata): MinVarianceSummary = {
    val wrapped = meta.wrapped
    Try {
      MinVarianceSummary(
        dropped = wrapped.getArray[String](MinVarianceNames.Dropped).toSeq,
        featuresStatistics = statisticsFromMetadata(wrapped.get[Metadata](MinVarianceNames.FeaturesStatistics)),
        names = wrapped.getArray[String](MinVarianceNames.Names).toSeq
      )
    } match {
      case Success(summary) => summary
      case Failure(_) => throw new IllegalArgumentException(s"failed to parse MinVarianceSummary from $meta")
    }
  }
}
Example 36
Source File: VectorsCombinerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.TransientFeature
import com.salesforce.op.features.types.{Text, _}
import com.salesforce.op.stages.base.sequence.SequenceModel
import com.salesforce.op.test.{OpEstimatorSpec, PassengerSparkFixtureTest, TestFeatureBuilder}
import com.salesforce.op.utils.spark.OpVectorMetadata
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.attribute.MetadataHelper
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.types.Metadata
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class VectorsCombinerTest
  extends OpEstimatorSpec[OPVector, SequenceModel[OPVector, OPVector], VectorsCombiner]
    with PassengerSparkFixtureTest {

  override def specName: String = classOf[VectorsCombiner].getSimpleName

  val (inputData, f1, f2) = TestFeatureBuilder(Seq(
    Vectors.sparse(4, Array(0, 3), Array(1.0, 1.0)).toOPVector ->
      Vectors.sparse(4, Array(0, 3), Array(2.0, 3.0)).toOPVector,
    Vectors.dense(Array(2.0, 3.0, 4.0)).toOPVector ->
      Vectors.dense(Array(12.0, 13.0, 14.0)).toOPVector,
    // Purposely added some very large sparse vectors to verify the efficiency
    Vectors.sparse(100000000, Array(1), Array(777.0)).toOPVector ->
      Vectors.sparse(500000000, Array(0), Array(888.0)).toOPVector
  ))

  val estimator = new VectorsCombiner().setInput(f1, f2)

  val expectedResult = Seq(
    Vectors.sparse(8, Array(0, 3, 4, 7), Array(1.0, 1.0, 2.0, 3.0)).toOPVector,
    Vectors.dense(Array(2.0, 3.0, 4.0, 12.0, 13.0, 14.0)).toOPVector,
    Vectors.sparse(600000000, Array(1, 100000000), Array(777.0, 888.0)).toOPVector
  )

  it should "combine metadata correctly" in {
    val vector = Seq(height, description, stringMap).transmogrify()
    val inputs = vector.parents
    val outputData = new OpWorkflow().setReader(dataReader)
      .setResultFeatures(vector, inputs(0), inputs(1), inputs(2))
      .train().score()
    val inputMetadata = OpVectorMetadata.flatten(vector.name,
      inputs.map(i => OpVectorMetadata(outputData.schema(i.name))))
    OpVectorMetadata(outputData.schema(vector.name)).columns should contain theSameElementsAs
      inputMetadata.columns
  }

  it should "create metadata correctly" in {
    val descVect = description.map[Text] { t =>
      Text(t.value match {
        case Some(text) => "this is dumb " + text
        case None => "some STUFF to tokenize"
      })
    }.tokenize().tf(numTerms = 5)
    val vector = Seq(height, stringMap, descVect).transmogrify()
    val Seq(inputs1, inputs2, inputs3) = vector.parents
    val outputData = new OpWorkflow().setReader(dataReader)
      .setResultFeatures(vector, inputs1, inputs2, inputs3)
      .train().score()

    outputData.schema(inputs1.name).metadata.wrapped
      .get[Metadata](MetadataHelper.attributeKeys.ML_ATTR)
      .getLong(MetadataHelper.attributeKeys.NUM_ATTRIBUTES) shouldBe 5

    val inputMetadata = OpVectorMetadata.flatten(vector.name,
      Array(TransientFeature(inputs1).toVectorMetaData(5, Option(inputs1.name)),
        OpVectorMetadata(outputData.schema(inputs2.name)),
        OpVectorMetadata(outputData.schema(inputs3.name))))
    OpVectorMetadata(outputData.schema(vector.name)).columns should contain theSameElementsAs
      inputMetadata.columns
  }
}
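The ML_ATTR / NUM_ATTRIBUTES lookup in the second test reads the attribute count that Spark ML stores in a vector column's metadata. The same value can be recovered through Spark's public AttributeGroup API without touching the raw keys; a minimal sketch:

import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.sql.types.StructField

// Returns the number of attributes recorded in a vector column's "ml_attr" metadata, if any.
def numAttributes(field: StructField): Option[Int] =
  AttributeGroup.fromStructField(field).numAttributes

// For the column checked above, numAttributes(outputData.schema(inputs1.name)) would be Some(5).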
Example 37
Source File: MinVarianceFilterMetadataTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.stages.impl.preparators.MinVarianceSummary.statisticsFromMetadata
import com.salesforce.op.test.TestSparkContext
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.sql.types.Metadata
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class MinVarianceFilterMetadataTest extends FlatSpec with TestSparkContext {

  val summary = MinVarianceSummary(
    dropped = Seq("f1"),
    featuresStatistics = SummaryStatistics(3, 0.01, Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3),
      Seq(0.1, 0.2, 0.3), Seq(0.1, 0.2, 0.3)),
    names = Seq("f1", "f2", "f3")
  )

  Spec[MinVarianceSummary] should "convert to and from metadata correctly" in {
    val meta = summary.toMetadata()
    meta.isInstanceOf[Metadata] shouldBe true
    val retrieved = MinVarianceSummary.fromMetadata(meta)
    retrieved.isInstanceOf[MinVarianceSummary]
    retrieved.dropped should contain theSameElementsAs summary.dropped
    retrieved.featuresStatistics.count shouldBe summary.featuresStatistics.count
    retrieved.featuresStatistics.max should contain theSameElementsAs summary.featuresStatistics.max
    retrieved.featuresStatistics.min should contain theSameElementsAs summary.featuresStatistics.min
    retrieved.featuresStatistics.mean should contain theSameElementsAs summary.featuresStatistics.mean
    retrieved.featuresStatistics.variance should contain theSameElementsAs summary.featuresStatistics.variance
    retrieved.names should contain theSameElementsAs summary.names
  }

  it should "convert to and from JSON and give the same values" in {
    val meta = summary.toMetadata()
    val json = meta.wrapped.prettyJson
    val recovered = Metadata.fromJson(json).wrapped
    val dropped = recovered.getArray[String](MinVarianceNames.Dropped).toSeq
    val stats = statisticsFromMetadata(recovered.get[Metadata](MinVarianceNames.FeaturesStatistics))
    val names = recovered.getArray[String](MinVarianceNames.Names).toSeq
    dropped should contain theSameElementsAs summary.dropped
    stats.count shouldBe summary.featuresStatistics.count
    stats.max should contain theSameElementsAs summary.featuresStatistics.max
    stats.min should contain theSameElementsAs summary.featuresStatistics.min
    stats.mean should contain theSameElementsAs summary.featuresStatistics.mean
    stats.variance should contain theSameElementsAs summary.featuresStatistics.variance
    names should contain theSameElementsAs summary.names
  }
}
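The JSON round trip exercised above works because Metadata defines structural equality, so a value rebuilt from its own JSON compares equal to the original. A minimal sketch of that round trip with plain Spark types and illustrative keys:

import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

val original = new MetadataBuilder()
  .putStringArray("names", Array("f1", "f2", "f3"))
  .putDoubleArray("variance", Array(0.1, 0.2, 0.3))
  .build()

// json renders a compact JSON object; fromJson parses it back into an equal Metadata value.
val roundTripped = Metadata.fromJson(original.json)
assert(roundTripped == original)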
Example 38
Source File: OpPipelineStageReaderWriterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder}
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render}
import org.json4s.{JArray, JObject}
import org.scalatest.FlatSpec
import org.slf4j.LoggerFactory

// TODO: consider adding a read/write test for a spark wrapped stage as well
private[stages] abstract class OpPipelineStageReaderWriterTest
  extends FlatSpec with PassengerSparkFixtureTest {

  val meta = new MetadataBuilder().putString("foo", "bar").build()
  val expectedFeaturesLength = 1
  def stage: OpPipelineStageBase with Transformer
  val expected: Array[Real]
  val hasOutputName = true

  private val log = LoggerFactory.getLogger(this.getClass)
  private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis()
  private lazy val writer = new OpPipelineStageWriter(stage)
  private lazy val stageJsonString: String = writer.writeToJsonString(savePath)
  private lazy val stageJson: JValue = parse(stageJsonString)
  private lazy val isModel = stage.isInstanceOf[Model[_]]
  private val FN = FieldNames

  Spec(this.getClass) should "write stage uid" in {
    log.info(pretty(stageJson))
    (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid
  }

  it should "write class name" in {
    (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName
  }

  it should "write params map" in {
    val params = extractParams(stageJson).extract[Map[String, Any]]
    if (hasOutputName) {
      params should have size 4
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName")
    } else {
      params should have size 3
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema")
    }
  }

  it should "write outputMetadata" in {
    val params = extractParams(stageJson)
    val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata"))
    val metadata = Metadata.fromJson(metadataStr)
    metadata shouldBe stage.getMetadata()
  }

  it should "write inputSchema" in {
    val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema"))
    val schema = DataType.fromJson(schemaStr)
    schema shouldBe stage.getInputSchema()
  }

  it should "write input features" in {
    val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray]
    jArray.values should have length expectedFeaturesLength
    val obj = jArray(0).extract[JObject]
    obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures")
  }

  it should "write model ctor args" in {
    if (stage.isInstanceOf[Model[_]]) {
      val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject]
      val (_, args) = ReflectionUtils.bestCtorWithArgs(stage)
      ctorArgs.values.keys shouldBe args.map(_._1).toSet
    }
  }

  it should "load stage correctly" in {
    val reader = new OpPipelineStageReader(stage)
    val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath)
    stageLoaded shouldBe a[OpPipelineStageBase]
    stageLoaded shouldBe a[Transformer]
    stageLoaded.getOutput() shouldBe a[FeatureLike[_]]
    val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet)
    val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet)
    transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected
    stageLoaded.uid shouldBe stage.uid
    stageLoaded.operationName shouldBe stage.operationName
    stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures()
    stageLoaded.getInputSchema() shouldBe stage.getInputSchema()
  }

  private def extractParams(stageJson: JValue): JValue = {
    val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName
    val paramsMap = stageJson \ FN.ParamMap.entryName
    defaultParamsMap.merge(paramsMap)
  }
}
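The "write outputMetadata" test pulls a single field out of a larger json4s document and feeds it back to Metadata.fromJson. A minimal sketch of that embed-and-extract pattern (the payload shape and the "outputMetadata" field name here are illustrative):

import org.json4s._
import org.json4s.jackson.JsonMethods.{compact, parse, render}
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

val meta = new MetadataBuilder().putString("foo", "bar").build()

// Metadata.json is itself a JSON object, so it can be embedded in a larger document...
val payload = parse(s"""{"outputMetadata": ${meta.json}}""")

// ...and recovered later by rendering just that field and parsing it back.
val recovered = Metadata.fromJson(compact(render(payload \ "outputMetadata")))
assert(recovered == meta)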
Example 39
Source File: MetadataIteratorSpec.scala From jgit-spark-connector with Apache License 2.0 | 5 votes |
package tech.sourced.engine.iterator

import java.nio.file.Paths
import java.util.{Properties, UUID}

import org.apache.commons.io.FileUtils
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.types.{Metadata, StringType, StructType}
import org.scalatest.{BeforeAndAfterAll, FlatSpec, Matchers}
import tech.sourced.engine.{BaseSparkSpec, Schema}

class JDBCQueryIteratorSpec
  extends FlatSpec with Matchers with BeforeAndAfterAll with BaseSparkSpec {

  private val tmpPath = Paths.get(
    System.getProperty("java.io.tmpdir"),
    UUID.randomUUID.toString
  )
  private val dbPath = tmpPath.resolve("test.db")

  override def beforeAll(): Unit = {
    super.beforeAll()
    tmpPath.toFile.mkdir()
    val rdd = ss.sparkContext.parallelize(Seq(
      Row("id1"),
      Row("id2"),
      Row("id3")
    ))

    val properties = new Properties()
    properties.put("driver", "org.sqlite.JDBC")
    val df = ss.createDataFrame(rdd, StructType(Seq(Schema.repositories.head)))
    df.write.jdbc(s"jdbc:sqlite:${dbPath.toString}", "repositories", properties)
  }

  override def afterAll(): Unit = {
    super.afterAll()
    FileUtils.deleteQuietly(tmpPath.toFile)
  }

  "JDBCQueryIterator" should "return all rows for the query" in {
    val iter = new JDBCQueryIterator(
      Seq(attr("id")),
      dbPath.toString,
      "SELECT id FROM repositories ORDER BY id"
    )
    // calling hasNext more than one time does not cause rows to be lost
    iter.hasNext
    iter.hasNext
    val rows = (for (row <- iter) yield row).toArray
    rows.length should be(3)
    rows(0).length should be(1)
    rows(0)(0).toString should be("id1")
    rows(1)(0).toString should be("id2")
    rows(2)(0).toString should be("id3")
  }

  private def attr(name: String): Attribute = AttributeReference(
    name, StringType, nullable = false, Metadata.empty
  )()
}