org.apache.spark.sql.types.MetadataBuilder Scala Examples
The following examples show how to use org.apache.spark.sql.types.MetadataBuilder.
Each example lists the source file, the project it was taken from, and that project's license.
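Before the project examples, here is a minimal self-contained sketch of the core MetadataBuilder workflow: build an immutable Metadata value, attach it to a StructField, and read it back. The column name and keys ("email", "pii", and so on) are invented for illustration and do not come from the projects below.

import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StringType, StructField}

object MetadataBuilderSketch extends App {
  // Build an immutable Metadata value from typed key/value pairs.
  val meta: Metadata = new MetadataBuilder()
    .putString("comment", "customer email address")
    .putBoolean("pii", true)
    .putLongArray("lengths", Array(1L, 254L))
    .build()

  // Attach the metadata to a column definition.
  val field = StructField("email", StringType, nullable = true, metadata = meta)

  // Read values back; contains() guards against missing keys.
  assert(field.metadata.contains("pii") && field.metadata.getBoolean("pii"))
  assert(field.metadata.getString("comment") == "customer email address")

  // withMetadata() copies an existing Metadata into a new builder,
  // so keys can be added or overwritten without mutating the original.
  val extended = new MetadataBuilder()
    .withMetadata(meta)
    .putString("comment", "masked in exports") // overwrite
    .build()
  println(extended.json) // Metadata serializes to and from JSON
}

Because Metadata is immutable, withMetadata is the usual way to copy an existing value into a new builder before adding or overwriting keys, a pattern that recurs throughout the examples below.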
Example 1
Source File: AddSourceToAttributes.scala From jgit-spark-connector with Apache License 2.0

package tech.sourced.engine.rule

import org.apache.spark.sql.catalyst.catalog.CatalogTable
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.datasources.LogicalRelation
import org.apache.spark.sql.sources.BaseRelation
import org.apache.spark.sql.types.MetadataBuilder
import tech.sourced.engine.{GitRelation, MetadataRelation, Sources}
import tech.sourced.engine.compat

def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {
  case compat.LogicalRelation(rel @ GitRelation(_, _, _, schemaSource), out, catalogTable) =>
    withMetadata(rel, schemaSource, out, catalogTable)
  case compat.LogicalRelation(
      rel @ MetadataRelation(_, _, _, _, schemaSource), out, catalogTable) =>
    withMetadata(rel, schemaSource, out, catalogTable)
}

private def withMetadata(relation: BaseRelation,
                         schemaSource: Option[String],
                         out: Seq[AttributeReference],
                         catalogTable: Option[CatalogTable]): LogicalRelation = {
  val processedOut = schemaSource match {
    case Some(table) => out.map(
      _.withMetadata(new MetadataBuilder().putString(SOURCE, table).build()
      ).asInstanceOf[AttributeReference]
    )
    case None => out
  }

  compat.LogicalRelation(relation, processedOut, catalogTable)
}
}
Example 2
Source File: MySQLDialect.scala From multi-tenancy-spark with Apache License 2.0 (this file also appears in the Spark-2.3.1, BigDatalog, drizzle-spark, and XSQL projects)

package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url : String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
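A dialect like the one above is not called directly: Spark's JDBC data source selects a dialect by URL and passes each column's JDBC type information, together with a fresh MetadataBuilder, into getCatalystType, and whatever the dialect puts into the builder ends up in the resulting column's metadata. The sketch below shows how a custom dialect of this shape could be registered; ExampleDbDialect, the jdbc:exampledb URL, and the demo object are invented for illustration.

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, LongType, MetadataBuilder}

// Hypothetical dialect for a fictitious "exampledb" JDBC driver.
object ExampleDbDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:exampledb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equalsIgnoreCase("BIT")) {
      // Anything written into the builder here is attached to the column's
      // metadata in the DataFrame schema, mirroring the "binarylong" flag above.
      md.putLong("binarylong", 1)
      Some(LongType)
    } else {
      None // fall back to Spark's default JDBC type mapping
    }
  }
}

object ExampleDbDialectDemo {
  def main(args: Array[String]): Unit = {
    // Once registered, the dialect is consulted for every jdbc:exampledb URL,
    // e.g. by spark.read.jdbc(url, table, connectionProperties).
    JdbcDialects.registerDialect(ExampleDbDialect)
  }
}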
Example 3
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  override def user: String = child.user

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 4
Source File: MetadataSuite.scala From iolap with Apache License 2.0 (this test also ships, essentially unchanged, in the spark1.52, Spark-2.3.1, BigDatalog, sparkoscope, XSQL, and drizzle-spark projects)

package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{MetadataBuilder, Metadata}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
}
Example 5
Source File: UnaryEstimatorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.base.unary

import com.salesforce.op.UID
import com.salesforce.op.features.Feature
import com.salesforce.op.features.types._
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] {

  val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal)
}

class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator])
  extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) {

  def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = {
    val grouped = dataset.groupBy()
    val maxVal = grouped.max().first().getDouble(0)
    val minVal = grouped.min().first().getDouble(0)
    new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid)
  }
}

final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String)
  extends UnaryModel[Real, Real](operationName = operationName, uid = uid) {
  def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal
}
Example 6
Source File: FeatureHistoryTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op

import com.salesforce.op.FeatureHistory.{OriginFeatureKey, StagesKey}
import com.salesforce.op.test.TestCommon
import org.apache.spark.sql.types.MetadataBuilder
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class FeatureHistoryTest extends FlatSpec with TestCommon {

  val feature1 = "feature1"
  val feature2 = "feature2"
  val stage1 = "stage1"
  val stage2 = "stage2"

  Spec[FeatureHistory] should "convert a feature history to metadata" in {
    val featureHistory = FeatureHistory(originFeatures = Seq(feature1, feature2), stages = Seq(stage1, stage2))

    val featureHistoryMetadata = featureHistory.toMetadata

    featureHistoryMetadata.contains(OriginFeatureKey) shouldBe true
    featureHistoryMetadata.contains(StagesKey) shouldBe true

    featureHistoryMetadata.getStringArray(OriginFeatureKey) shouldBe Array(feature1, feature2)
    featureHistoryMetadata.getStringArray(StagesKey) shouldBe Array(stage1, stage2)
  }

  it should "merge two instances" in {
    val featureHistory1 = FeatureHistory(originFeatures = Seq(feature1), stages = Seq(stage1))
    val featureHistory2 = FeatureHistory(originFeatures = Seq(feature2), stages = Seq(stage2))

    val featureHistoryCombined = featureHistory1.merge(featureHistory2)
    featureHistoryCombined.originFeatures shouldBe Seq(feature1, feature2)
    featureHistoryCombined.stages shouldBe Seq(stage1, stage2)
  }

  it should "create a metadata for a map" in {
    val featureHistory1 = FeatureHistory(originFeatures = Seq(feature1), stages = Seq(stage1))
    val featureHistory2 = FeatureHistory(originFeatures = Seq(feature2), stages = Seq(stage2))

    val map = Map(("1" -> featureHistory1), ("2" -> featureHistory2))
    val featureHistoryMetadata = FeatureHistory.toMetadata(map)

    featureHistoryMetadata.contains("1") shouldBe true
    featureHistoryMetadata.contains("2") shouldBe true

    val f1 = featureHistoryMetadata.getMetadata("1")
    f1.contains(OriginFeatureKey) shouldBe true
    f1.contains(StagesKey) shouldBe true
    f1.getStringArray(OriginFeatureKey) shouldBe Array(feature1)
    f1.getStringArray(StagesKey) shouldBe Array(stage1)

    val f2 = featureHistoryMetadata.getMetadata("2")
    f2.contains(OriginFeatureKey) shouldBe true
    f2.contains(StagesKey) shouldBe true
    f2.getStringArray(OriginFeatureKey) shouldBe Array(feature2)
    f2.getStringArray(StagesKey) shouldBe Array(stage2)
  }

  it should "create a map from metadata" in {
    val featureHistory1 = FeatureHistory(originFeatures = Seq(feature1), stages = Seq(stage1))
    val featureHistory2 = FeatureHistory(originFeatures = Seq(feature2), stages = Seq(stage2))

    val featureHistoryMapMetadata = new MetadataBuilder()
      .putMetadata("1", featureHistory1.toMetadata)
      .putMetadata("2", featureHistory2.toMetadata)
      .build()

    val featureHistoryMap = FeatureHistory.fromMetadataMap(featureHistoryMapMetadata)

    featureHistoryMap.contains("1") shouldBe true
    featureHistoryMap("1") shouldBe featureHistory1

    featureHistoryMap.contains("2") shouldBe true
    featureHistoryMap("2") shouldBe featureHistory2
  }
}
Example 7
Source File: PercentileCalibrator.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.feature

import com.salesforce.op.UID
import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.unary.{UnaryEstimator, UnaryModel}
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.ml.feature.QuantileDiscretizer
import org.apache.spark.ml.param.IntParam
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.MetadataBuilder

import scala.collection.Searching._

class PercentileCalibrator(uid: String = UID[PercentileCalibrator])
  extends UnaryEstimator[RealNN, RealNN](operationName = "percentCalibrator", uid = uid) {

  final val expectedNumBuckets = new IntParam(
    this, "expectedNumBuckets", "number of buckets to divide input data into"
  )
  setDefault(expectedNumBuckets, 100)

  def setExpectedNumBuckets(buckets: Int): this.type = set(expectedNumBuckets, buckets)

  def fitFn(dataset: Dataset[Option[Double]]): UnaryModel[RealNN, RealNN] = {

    val estimator: QuantileDiscretizer = new QuantileDiscretizer()
      .setNumBuckets($(expectedNumBuckets))
      .setRelativeError(0)
      .setInputCol(dataset.columns(0))
      .setOutputCol(dataset.columns(0) + "-out")

    val bucketizerModel = estimator.fit(dataset)

    val model = new PercentileCalibratorModel(
      splits = bucketizerModel.getSplits,
      actualNumBuckets = bucketizerModel.getSplits.length,
      expectedNumBuckets = $(expectedNumBuckets),
      operationName = operationName,
      uid = uid
    )

    val scaledBuckets = bucketizerModel.getSplits.map(v => model.transformFn(v.toRealNN).v.get)

    val meta = new MetadataBuilder()
      .putStringArray(PercentileCalibrator.OrigSplitsKey, bucketizerModel.getSplits.map(_.toString))
      .putStringArray(PercentileCalibrator.ScaledSplitsKey, scaledBuckets.map(_.toString)).build()
    setMetadata(meta.toSummaryMetadata())

    model
  }
}

final class PercentileCalibratorModel private[op]
(
  val splits: Array[Double],
  val actualNumBuckets: Int,
  val expectedNumBuckets: Int,
  operationName: String,
  uid: String
) extends UnaryModel[RealNN, RealNN](operationName = operationName, uid = uid) {

  def transformFn: RealNN => RealNN = (inScalar: RealNN) => {
    val calibrated = splits.search(inScalar.v.get) match {
      case Found(idx) => idx
      case InsertionPoint(idx) => idx
    }
    scale(actualNumBuckets, expectedNumBuckets, calibrated).toRealNN
  }

  private def scale(actualNumBuckets: Int, expectedBuckets: Int, calibrated: Int): Long = {
    if (actualNumBuckets >= expectedBuckets) {
      calibrated - 1 // make it start at zero
    } else {
      val (oldMin, newMin) = (0, 0)
      val (oldMax, newMax) = (Math.max(actualNumBuckets - 2, 0), Math.max(expectedBuckets - 1, 0))
      val oldRange = oldMax - oldMin
      oldRange match {
        case 0 => newMin
        case _ =>
          val newRange = (newMax - newMin).toDouble
          val newValue = (((calibrated - oldMin) * newRange) / oldRange) + newMin
          Math.min(newValue.round, newMax)
      }
    }
  }
}

case object PercentileCalibrator {
  val OrigSplitsKey: String = "origSplits"
  val ScaledSplitsKey: String = "scaledSplits"
}
Example 8
Source File: MinVarianceFilterMetadata.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.preparators

import com.salesforce.op.stages.impl.MetadataLike
import com.salesforce.op.utils.spark.RichMetadata._
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

import scala.util.{Failure, Success, Try}

def fromMetadata(meta: Metadata): MinVarianceSummary = {
  val wrapped = meta.wrapped
  Try {
    MinVarianceSummary(
      dropped = wrapped.getArray[String](MinVarianceNames.Dropped).toSeq,
      featuresStatistics = statisticsFromMetadata(wrapped.get[Metadata](MinVarianceNames.FeaturesStatistics)),
      names = wrapped.getArray[String](MinVarianceNames.Names).toSeq
    )
  } match {
    case Success(summary) => summary
    case Failure(_) => throw new IllegalArgumentException(s"failed to parse MinVarianceSummary from $meta")
  }
}
}
Example 9
Source File: ScalerMetadataTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages.impl.feature

import com.salesforce.op.test.TestSparkContext
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import com.salesforce.op.utils.json.JsonUtils
import org.apache.spark.sql.types.MetadataBuilder

import scala.util.{Failure, Success}

@RunWith(classOf[JUnitRunner])
class ScalerMetadataTest extends FlatSpec with TestSparkContext {
  val linearArgs = LinearScalerArgs(slope = 2.0, intercept = 1.0)

  Spec[ScalerMetadata] should "properly construct ScalerMetadata for a LinearScaler" in {
    val metadata = ScalerMetadata(scalingType = ScalingType.Linear, scalingArgs = linearArgs).toMetadata()
    metadata.getString(ScalerMetadata.scalingTypeName) shouldBe ScalingType.Linear.entryName
    val args = JsonUtils.fromString[LinearScalerArgs](metadata.getString(ScalerMetadata.scalingArgsName))
    args match {
      case Failure(err) => fail(err)
      case Success(x) => x shouldBe linearArgs
    }
  }

  it should "properly construct ScalerMetaData for a LogScaler" in {
    val metadata = ScalerMetadata(scalingType = ScalingType.Logarithmic,
      scalingArgs = EmptyScalerArgs()).toMetadata()
    metadata.getString(ScalerMetadata.scalingTypeName) shouldBe ScalingType.Logarithmic.entryName
    metadata.getString(ScalerMetadata.scalingArgsName) shouldBe "{}"
  }

  it should "use apply to properly convert metadata to ScalerMetadata" in {
    val metadata = new MetadataBuilder().putString(ScalerMetadata.scalingTypeName, ScalingType.Linear.entryName)
      .putString(ScalerMetadata.scalingArgsName, linearArgs.toJson(pretty = false)).build()
    ScalerMetadata.apply(metadata) match {
      case Failure(err) => fail(err)
      case Success(x) => x shouldBe ScalerMetadata(ScalingType.Linear, linearArgs)
    }
  }

  it should "error when apply is given an invalid scaling type" in {
    val invalidMetaData = new MetadataBuilder().putString(ScalerMetadata.scalingTypeName, "unsupportedScaling")
      .putString(ScalerMetadata.scalingArgsName, linearArgs.toJson(pretty = false)).build()
    val err = intercept[NoSuchElementException](
      ScalerMetadata.apply(invalidMetaData).get
    )
    err.getMessage shouldBe "unsupportedScaling is not a member of Enum (Linear, Logarithmic)"
  }
}
Example 10
Source File: OpPipelineStageReaderWriterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License

package com.salesforce.op.stages

import com.salesforce.op.features._
import com.salesforce.op.features.types._
import com.salesforce.op.stages.OpPipelineStageReaderWriter._
import com.salesforce.op.test.PassengerSparkFixtureTest
import com.salesforce.op.utils.reflection.ReflectionUtils
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.{Model, Transformer}
import org.apache.spark.sql.types.{DataType, Metadata, MetadataBuilder}
import org.json4s.JsonAST.JValue
import org.json4s.jackson.JsonMethods.{compact, parse, pretty, render}
import org.json4s.{JArray, JObject}
import org.scalatest.FlatSpec
import org.slf4j.LoggerFactory

// TODO: consider adding a read/write test for a spark wrapped stage as well
private[stages] abstract class OpPipelineStageReaderWriterTest
  extends FlatSpec with PassengerSparkFixtureTest {

  val meta = new MetadataBuilder().putString("foo", "bar").build()
  val expectedFeaturesLength = 1
  def stage: OpPipelineStageBase with Transformer
  val expected: Array[Real]
  val hasOutputName = true

  private val log = LoggerFactory.getLogger(this.getClass)
  private lazy val savePath = tempDir + "/" + this.getClass.getSimpleName + "-" + System.currentTimeMillis()
  private lazy val writer = new OpPipelineStageWriter(stage)
  private lazy val stageJsonString: String = writer.writeToJsonString(savePath)
  private lazy val stageJson: JValue = parse(stageJsonString)
  private lazy val isModel = stage.isInstanceOf[Model[_]]
  private val FN = FieldNames

  Spec(this.getClass) should "write stage uid" in {
    log.info(pretty(stageJson))
    (stageJson \ FN.Uid.entryName).extract[String] shouldBe stage.uid
  }

  it should "write class name" in {
    (stageJson \ FN.Class.entryName).extract[String] shouldBe stage.getClass.getName
  }

  it should "write params map" in {
    val params = extractParams(stageJson).extract[Map[String, Any]]
    if (hasOutputName) {
      params should have size 4
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema", "outputFeatureName")
    } else {
      params should have size 3
      params.keys shouldBe Set("inputFeatures", "outputMetadata", "inputSchema")
    }
  }

  it should "write outputMetadata" in {
    val params = extractParams(stageJson)
    val metadataStr = compact(render(extractParams(stageJson) \ "outputMetadata"))
    val metadata = Metadata.fromJson(metadataStr)
    metadata shouldBe stage.getMetadata()
  }

  it should "write inputSchema" in {
    val schemaStr = compact(render(extractParams(stageJson) \ "inputSchema"))
    val schema = DataType.fromJson(schemaStr)
    schema shouldBe stage.getInputSchema()
  }

  it should "write input features" in {
    val jArray = (extractParams(stageJson) \ "inputFeatures").extract[JArray]
    jArray.values should have length expectedFeaturesLength
    val obj = jArray(0).extract[JObject]
    obj.values.keys shouldBe Set("name", "isResponse", "isRaw", "uid", "typeName", "stages", "originFeatures")
  }

  it should "write model ctor args" in {
    if (stage.isInstanceOf[Model[_]]) {
      val ctorArgs = (stageJson \ FN.CtorArgs.entryName).extract[JObject]
      val (_, args) = ReflectionUtils.bestCtorWithArgs(stage)
      ctorArgs.values.keys shouldBe args.map(_._1).toSet
    }
  }

  it should "load stage correctly" in {
    val reader = new OpPipelineStageReader(stage)
    val stageLoaded = reader.loadFromJsonString(stageJsonString, path = savePath)
    stageLoaded shouldBe a[OpPipelineStageBase]
    stageLoaded shouldBe a[Transformer]
    stageLoaded.getOutput() shouldBe a[FeatureLike[_]]
    val _ = stage.asInstanceOf[Transformer].transform(passengersDataSet)
    val transformed = stageLoaded.asInstanceOf[Transformer].transform(passengersDataSet)
    transformed.collect(stageLoaded.getOutput().asInstanceOf[FeatureLike[Real]]) shouldBe expected
    stageLoaded.uid shouldBe stage.uid
    stageLoaded.operationName shouldBe stage.operationName
    stageLoaded.getInputFeatures() shouldBe stage.getInputFeatures()
    stageLoaded.getInputSchema() shouldBe stage.getInputSchema()
  }

  private def extractParams(stageJson: JValue): JValue = {
    val defaultParamsMap = stageJson \ FN.DefaultParamMap.entryName
    val paramsMap = stageJson \ FN.ParamMap.entryName
    defaultParamsMap.merge(paramsMap)
  }
}
Example 11
Source File: RemoveAliasOnlyProjectSuite.scala From multi-tenancy-spark with Apache License 2.0 (this file also appears in the drizzle-spark and sparkoscope projects)

package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.MetadataBuilder

class RemoveAliasOnlyProjectSuite extends PlanTest with PredicateHelper {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("RemoveAliasOnlyProject", FixedPoint(50), RemoveAliasOnlyProject) :: Nil
  }

  test("all expressions in project list are aliased child output") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b as 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, relation)
  }

  test("all expressions in project list are aliased child output but with different order") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('b as 'b, 'a as 'a).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are aliased child output") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, relation)
  }

  test("some expressions in project list are aliased child output but with different order") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('b as 'b, 'a).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are not Alias or Attribute") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b + 1).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are aliased child output but with metadata") {
    val relation = LocalRelation('a.int, 'b.int)
    val metadata = new MetadataBuilder().putString("x", "y").build()
    val aliasWithMeta = Alias('a, "a")(explicitMetadata = Some(metadata))
    val query = relation.select(aliasWithMeta, 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }
}
Example 12
Source File: HBaseAdvancedSQLQuerySuite.scala From Heracles with Apache License 2.0

package org.apache.spark.sql.hbase

import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{MetadataBuilder, StructType}
import org.apache.spark.sql.{DataFrame, Row}

class HBaseAdvancedSQLQuerySuite extends TestBaseWithSplitData {
  import org.apache.spark.sql.hbase.TestHbase._
  import org.apache.spark.sql.hbase.TestHbase.implicits._

  test("aggregation with codegen") {
    val originalValue = TestHbase.sessionState.conf.wholeStageEnabled
    TestHbase.sessionState.conf.setConfString(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
    val result = sql("SELECT col1 FROM ta GROUP BY col1").collect()
    assert(result.length == 14, s"aggregation with codegen test failed on size")
    TestHbase.sessionState.conf.setConfString(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, originalValue.toString)
  }

  test("dsl simple select 0") {
    val tableA = sql("SELECT * FROM ta")
    checkAnswer(
      tableA.where('col7 === 1).orderBy('col2.asc).select('col4),
      Row(1) :: Nil)
    checkAnswer(
      tableA.where('col2 === 6).orderBy('col2.asc).select('col7),
      Row(-31) :: Nil)
  }

  test("metadata is propagated correctly") {
    val tableA = sql("SELECT col7, col1, col3 FROM ta")
    val schema = tableA.schema
    val docKey = "doc"
    val docValue = "first name"
    val metadata = new MetadataBuilder()
      .putString(docKey, docValue)
      .build()
    val schemaWithMeta = new StructType(Array(
      schema("col7"), schema("col1").copy(metadata = metadata), schema("col3")))
    val personWithMeta = createDataFrame(tableA.rdd, schemaWithMeta)
    def validateMetadata(rdd: DataFrame): Unit = {
      assert(rdd.schema("col1").metadata.getString(docKey) == docValue)
    }
    personWithMeta.createOrReplaceTempView("personWithMeta")
    validateMetadata(personWithMeta.select($"col1"))
    validateMetadata(personWithMeta.select($"col1"))
    validateMetadata(personWithMeta.select($"col7", $"col1"))
    validateMetadata(sql("SELECT * FROM personWithMeta"))
    validateMetadata(sql("SELECT col7, col1 FROM personWithMeta"))
    validateMetadata(sql("SELECT * FROM personWithMeta JOIN salary ON col7 = personId"))
    validateMetadata(sql("SELECT col1, salary FROM personWithMeta JOIN salary ON col7 = personId"))
  }
}
Example 13
Source File: EventTimeWatermark.scala From Spark-2.3.1 with Apache License 2.0 (this file also appears in the XSQL project)

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends UnaryNode {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val delayMs = EventTimeWatermark.getDelayMs(delay)
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
Example 14
Source File: AggregatedDialect.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.sql.jdbc

import org.apache.spark.sql.types.{DataType, MetadataBuilder}

private class AggregatedDialect(dialects: List[JdbcDialect]) extends JdbcDialect {

  require(dialects.nonEmpty)

  override def canHandle(url : String): Boolean =
    dialects.map(_.canHandle(url)).reduce(_ && _)

  override def getCatalystType(
      sqlType: Int,
      typeName: String,
      size: Int,
      md: MetadataBuilder): Option[DataType] = {
    dialects.flatMap(_.getCatalystType(sqlType, typeName, size, md)).headOption
  }

  override def getJDBCType(dt: DataType): Option[JdbcType] = {
    dialects.flatMap(_.getJDBCType(dt)).headOption
  }

  override def quoteIdentifier(colName: String): String = {
    dialects.head.quoteIdentifier(colName)
  }

  override def getTableExistsQuery(table: String): String = {
    dialects.head.getTableExistsQuery(table)
  }

  override def getSchemaQuery(table: String): String = {
    dialects.head.getSchemaQuery(table)
  }

  override def isCascadingTruncateTable(): Option[Boolean] = {
    // If any dialect claims cascading truncate, this dialect is also cascading truncate.
    // Otherwise, if any dialect has unknown cascading truncate, this dialect is also unknown.
    dialects.flatMap(_.isCascadingTruncateTable()).reduceOption(_ || _) match {
      case Some(true) => Some(true)
      case _ if dialects.exists(_.isCascadingTruncateTable().isEmpty) => None
      case _ => Some(false)
    }
  }

  override def getTruncateQuery(table: String): String = {
    dialects.head.getTruncateQuery(table)
  }
}
Example 15
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 (this file also appears in the XSQL project)

package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
Example 16
Source File: MetadataTransformUtils.scala From automl with Apache License 2.0

package org.apache.spark.ml.feature.operator

import org.apache.spark.sql.types.{MetadataBuilder, StructField}

import scala.collection.mutable.ArrayBuffer

def vectorCartesianTransform(fields: Array[StructField], numFeatures: Int): MetadataBuilder = {
  if (fields.length < 2) {
    throw new IllegalArgumentException("the number of cols in the input DataFrame should be no less than 2")
  }

  var res = Array[String]()
  if (fields.head.metadata.contains(DERIVATION)) {
    res = fields.head.metadata.getStringArray(DERIVATION)
  } else {
    res = createDerivation(numFeatures)
  }

  for (i <- 1 until fields.length) {
    if (fields(i).metadata.contains(DERIVATION)) {
      res = cartesianWithArray(res, fields(i).metadata.getStringArray(DERIVATION))
    } else {
      res = cartesianWithArray(res, createDerivation(numFeatures))
    }
  }

  val metadata = fields.last.metadata
  new MetadataBuilder().withMetadata(metadata).putStringArray(DERIVATION, res)
}
}
Example 17
Source File: TableColumnsParser.scala From HANAVora-Extensions with Apache License 2.0

package org.apache.spark.sql.parser

import org.apache.spark.sql.catalyst.AbstractSparkSQLParser
import org.apache.spark.sql.catalyst.util.DataTypeParser
import org.apache.spark.sql.types.{Metadata, MetadataBuilder, StructField}

trait TableColumnsParser
  extends AbstractSparkSQLParser
  with DataTypeParser
  with AnnotationParser {

  protected def commentIndicator: Keyword

  protected lazy val columnName = acceptMatch("column name", {
    case lexical.Identifier(chars) => chars
    case lexical.Keyword(chars) if !sqlReservedWords.contains(chars.toUpperCase) => chars
  })

  protected lazy val tableColumns: Parser[Seq[StructField]] =
    "(" ~> repsep(annotatedCol, ",") <~ ")"

  protected lazy val annotatedCol: Parser[StructField] =
    columnName ~ metadata ~ dataType ^^ {
      case name ~ md ~ typ =>
        StructField(name, typ, nullable = true, metadata = toTableMetadata(md))
    } |
    columnName ~ dataType ~ (commentIndicator ~> stringLit).? ^^ {
      case name ~ typ ~ cm =>
        val meta = cm match {
          case Some(comment) =>
            new MetadataBuilder().putString(commentIndicator.str.toLowerCase, comment).build()
          case None => Metadata.empty
        }
        StructField(name, typ, nullable = true, meta)
    }
}
Example 18
Source File: EventTimeWatermark.scala From sparkoscope with Apache License 2.0

package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
Example 31
Source File: MetadataSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
}
Example 32
Source File: RemoveAliasOnlyProjectSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules._
import org.apache.spark.sql.types.MetadataBuilder

class RemoveAliasOnlyProjectSuite extends PlanTest with PredicateHelper {

  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = Batch("RemoveAliasOnlyProject", FixedPoint(50), RemoveAliasOnlyProject) :: Nil
  }

  test("all expressions in project list are aliased child output") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b as 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, relation)
  }

  test("all expressions in project list are aliased child output but with different order") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('b as 'b, 'a as 'a).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are aliased child output") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, relation)
  }

  test("some expressions in project list are aliased child output but with different order") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('b as 'b, 'a).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are not Alias or Attribute") {
    val relation = LocalRelation('a.int, 'b.int)
    val query = relation.select('a as 'a, 'b + 1).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }

  test("some expressions in project list are aliased child output but with metadata") {
    val relation = LocalRelation('a.int, 'b.int)
    val metadata = new MetadataBuilder().putString("x", "y").build()
    val aliasWithMeta = Alias('a, "a")(explicitMetadata = Some(metadata))
    val query = relation.select(aliasWithMeta, 'b).analyze
    val optimized = Optimize.execute(query)
    comparePlans(optimized, query)
  }
}
Example 33
Source File: MySQLDialect.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.jdbc

import java.sql.Types

import org.apache.spark.sql.types.{BooleanType, DataType, LongType, MetadataBuilder}

private case object MySQLDialect extends JdbcDialect {

  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mysql")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.VARBINARY && typeName.equals("BIT") && size != 1) {
      // This could instead be a BinaryType if we'd rather return bit-vectors of up to 64 bits as
      // byte arrays instead of longs.
      md.putLong("binarylong", 1)
      Option(LongType)
    } else if (sqlType == Types.BIT && typeName.equals("TINYINT")) {
      Option(BooleanType)
    } else None
  }

  override def quoteIdentifier(colName: String): String = {
    s"`$colName`"
  }

  override def getTableExistsQuery(table: String): String = {
    s"SELECT 1 FROM $table LIMIT 1"
  }

  override def isCascadingTruncateTable(): Option[Boolean] = Some(false)
}
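The dialect above shows the pattern of stashing extra type information in the MetadataBuilder that Spark passes to getCatalystType. A hedged sketch of a custom dialect built the same way, registered through JdbcDialects; the "jdbc:mydb" URL prefix and the "charWidth" key are invented for illustration:

import java.sql.Types

import org.apache.spark.sql.jdbc.{JdbcDialect, JdbcDialects}
import org.apache.spark.sql.types.{DataType, MetadataBuilder, StringType}

// Hypothetical dialect: map CHAR columns to StringType and remember the declared width.
object MyCharDialect extends JdbcDialect {
  override def canHandle(url: String): Boolean = url.startsWith("jdbc:mydb")

  override def getCatalystType(
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
    if (sqlType == Types.CHAR) {
      md.putLong("charWidth", size.toLong) // ends up on the resulting StructField's metadata
      Some(StringType)
    } else None
  }
}

JdbcDialects.registerDialect(MyCharDialect)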
Example 34
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 35
Source File: NetezzaBaseSuite.scala From spark-netezza with Apache License 2.0 | 5 votes |
package com.ibm.spark.netezza

import org.apache.spark.sql.types.{MetadataBuilder, StructField, StructType}
import org.scalatest.FunSuite

  def buildSchema(cols: Array[Column]): StructType = {
    val fields = new Array[StructField](cols.length)
    var i = 0
    for (col <- cols) {
      val columnType = NetezzaSchema.getSparkSqlType(
        col.jdbcType, col.precision, col.scale, col.signed)
      val metadata = new MetadataBuilder().putString("name", col.name)
      fields(i) = StructField(col.name, columnType, true, metadata.build())
      i = i + 1
    }
    new StructType(fields)
  }
}
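buildSchema keeps the upstream column name in each field's metadata. The same pattern can be shown without the project-specific Column and NetezzaSchema types; in this sketch the field names, types, and source-column names are made up:

import org.apache.spark.sql.types.{IntegerType, MetadataBuilder, StringType, StructField, StructType}

// Each field records its original source-column name under the "name" metadata key,
// mirroring buildSchema above.
val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = true,
    new MetadataBuilder().putString("name", "CUSTOMER_ID").build()),
  StructField("city", StringType, nullable = true,
    new MetadataBuilder().putString("name", "CITY_NAME").build())))

schema.foreach(f => println(s"${f.name} <- ${f.metadata.getString("name")}"))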
Example 36
Source File: HasEmbeddingsProperties.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp.embeddings

import com.johnsnowlabs.nlp.AnnotatorType
import org.apache.spark.ml.param.{BooleanParam, IntParam, Params}
import org.apache.spark.sql.Column
import org.apache.spark.sql.types.MetadataBuilder

trait HasEmbeddingsProperties extends Params {

  val dimension = new IntParam(this, "dimension", "Number of embedding dimensions")

  def setDimension(value: Int): this.type = set(this.dimension, value)
  def getDimension: Int = $(dimension)

  protected def wrapEmbeddingsMetadata(
      col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.WORD_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }

  protected def wrapSentenceEmbeddingsMetadata(
      col: Column, embeddingsDim: Int, embeddingsRef: Option[String] = None): Column = {
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", AnnotatorType.SENTENCE_EMBEDDINGS)
    metadataBuilder.putLong("dimension", embeddingsDim.toLong)
    embeddingsRef.foreach(ref => metadataBuilder.putString("ref", ref))
    col.as(col.toString, metadataBuilder.build)
  }
}
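Once a column wrapped by wrapEmbeddingsMetadata has been materialized in a DataFrame, the recorded dimension can be read back from the schema. A minimal sketch, assuming a DataFrame `df` with a column named "embeddings" (both placeholders for illustration):

import org.apache.spark.sql.DataFrame

// Read back the "dimension" value recorded in the column's metadata, if present.
def embeddingsDimension(df: DataFrame, colName: String = "embeddings"): Option[Long] = {
  val md = df.schema(colName).metadata
  if (md.contains("dimension")) Some(md.getLong("dimension")) else None
}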
Example 37
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable

  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
}
Example 38
Source File: EventTimeWatermark.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval

object EventTimeWatermark {
  // Companion-object body (e.g. the delayKey constant referenced below) is truncated in this listing.
}

case class EventTimeWatermark(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: LogicalPlan) extends LogicalPlan {

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override val children: Seq[LogicalPlan] = child :: Nil
}
Example 39
Source File: MetadataSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import org.json4s.jackson.JsonMethods.parse

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.types.{Metadata, MetadataBuilder}

class MetadataSuite extends SparkFunSuite {

  val baseMetadata = new MetadataBuilder()
    .putString("purpose", "ml")
    .putBoolean("isBase", true)
    .build()

  val summary = new MetadataBuilder()
    .putLong("numFeatures", 10L)
    .build()

  val age = new MetadataBuilder()
    .putString("name", "age")
    .putLong("index", 1L)
    .putBoolean("categorical", false)
    .putDouble("average", 45.0)
    .build()

  val gender = new MetadataBuilder()
    .putString("name", "gender")
    .putLong("index", 5)
    .putBoolean("categorical", true)
    .putStringArray("categories", Array("male", "female"))
    .build()

  val metadata = new MetadataBuilder()
    .withMetadata(baseMetadata)
    .putBoolean("isBase", false) // overwrite an existing key
    .putMetadata("summary", summary)
    .putLongArray("long[]", Array(0L, 1L))
    .putDoubleArray("double[]", Array(3.0, 4.0))
    .putBooleanArray("boolean[]", Array(true, false))
    .putMetadataArray("features", Array(age, gender))
    .build()

  test("metadata builder and getters") {
    assert(age.contains("summary") === false)
    assert(age.contains("index") === true)
    assert(age.getLong("index") === 1L)
    assert(age.contains("average") === true)
    assert(age.getDouble("average") === 45.0)
    assert(age.contains("categorical") === true)
    assert(age.getBoolean("categorical") === false)
    assert(age.contains("name") === true)
    assert(age.getString("name") === "age")
    assert(metadata.contains("purpose") === true)
    assert(metadata.getString("purpose") === "ml")
    assert(metadata.contains("isBase") === true)
    assert(metadata.getBoolean("isBase") === false)
    assert(metadata.contains("summary") === true)
    assert(metadata.getMetadata("summary") === summary)
    assert(metadata.contains("long[]") === true)
    assert(metadata.getLongArray("long[]").toSeq === Seq(0L, 1L))
    assert(metadata.contains("double[]") === true)
    assert(metadata.getDoubleArray("double[]").toSeq === Seq(3.0, 4.0))
    assert(metadata.contains("boolean[]") === true)
    assert(metadata.getBooleanArray("boolean[]").toSeq === Seq(true, false))
    assert(gender.contains("categories") === true)
    assert(gender.getStringArray("categories").toSeq === Seq("male", "female"))
    assert(metadata.contains("features") === true)
    assert(metadata.getMetadataArray("features").toSeq === Seq(age, gender))
  }

  test("metadata json conversion") {
    val json = metadata.json
    withClue("toJson must produce a valid JSON string") {
      parse(json)
    }
    val parsed = Metadata.fromJson(json)
    assert(parsed === metadata)
    assert(parsed.## === metadata.##)
  }
}
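The suite above exercises the builders and getters. As a small companion sketch (not part of the original test), here is how an existing Metadata can be copied with withMetadata, have one key overridden, and have another dropped with remove, the same call the streaming exec node earlier uses to clear a stale watermark; the keys and values are made up:

import org.apache.spark.sql.types.MetadataBuilder

val original = new MetadataBuilder()
  .putString("purpose", "ml")
  .putLong("version", 1L)
  .build()

// Copy everything, bump one key, and drop another.
val revised = new MetadataBuilder()
  .withMetadata(original)
  .putLong("version", 2L)  // overwrites the copied value
  .remove("purpose")       // removes the copied key
  .build()

assert(revised.getLong("version") == 2L)
assert(!revised.contains("purpose"))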