org.apache.spark.sql.functions.udf Scala Examples
The following examples show how to use org.apache.spark.sql.functions.udf.
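Before the project examples, a minimal sketch of the basic pattern may help; the DataFrame, column names, and the upperUDF/upperName identifiers below are illustrative only and do not come from any of the projects listed here. A plain Scala function is wrapped with udf and then either applied as a Column expression or registered by name for use in SQL.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

object UdfQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udf-quickstart").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "alice"), (2, "bob")).toDF("id", "name")

    // Wrap a plain Scala function as a UDF and apply it as a Column expression.
    val upperUDF = udf { s: String => s.toUpperCase }
    df.withColumn("name_upper", upperUDF($"name")).show()

    // Or register it by name so it can be called from SQL.
    spark.udf.register("upperName", (s: String) => s.toUpperCase)
    df.createOrReplaceTempView("people")
    spark.sql("SELECT id, upperName(name) AS name_upper FROM people").show()

    spark.stop()
  }
}

Most of the examples that follow are variations on these two moves, applied inside ML transformers, analyzers, and test suites.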
Example 1
Source File: SparkPFASuiteBase.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }
}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] => a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector => a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }
}
Example 2
Source File: Entropy.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}

case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
}
Example 3
Source File: SparkXGBoostClassifierSuite.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.VectorIndexer
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.functions.udf
import org.scalatest.FunSuite
import rotationsymmetry.sxgboost.loss.LogisticLoss
import rotationsymmetry.sxgboost.utils.MLlibTestSparkContext

class SparkXGBoostClassifierSuite extends FunSuite with TestData with MLlibTestSparkContext {

  test("test with simple data") {
    val rawdata = Seq(
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(0, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(1, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(0.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(0, Vectors.dense(1.0, 1.0)),
      LabeledPoint(1, Vectors.dense(1.0, 1.0))
    )

    val data = sqlContext.createDataFrame(sc.parallelize(rawdata, 2))

    val truthUDF = udf { feature: Vector =>
      if (feature(0) == feature(1)) 0.0 else 1.0
    }

    val dataWithTruth = data.withColumn("truth", truthUDF(data("features")))

    val featureIndexer = new VectorIndexer()
      .setInputCol("features")
      .setOutputCol("indexedFeatures")
      .setMaxCategories(2)
      .fit(data)

    val sparkXGBoostClassifier = new SparkXGBoostClassifier(new LogisticLoss)
      .setFeaturesCol("indexedFeatures")
      .setMaxDepth(2)
      .setNumTrees(1)

    val sparkXGBoostPipeline = new Pipeline()
      .setStages(Array(featureIndexer, sparkXGBoostClassifier))

    val sXGBoostModel = sparkXGBoostPipeline.fit(data)

    val evaluator = new MulticlassClassificationEvaluator()
      .setLabelCol("truth")
      .setPredictionCol("prediction")
      .setMetricName("precision")

    val precision = evaluator.evaluate(sXGBoostModel.transform(dataWithTruth))

    assert(precision === 1.0)
  }
}
Example 4
Source File: ExtractTokensUDF.scala From jgit-spark-connector with Apache License 2.0
package tech.sourced.engine.udf

import gopkg.in.bblfsh.sdk.v1.uast.generated.Node
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf

case object ExtractTokensUDF extends CustomUDF {
  override val name = "extractTokens"

  override def apply(session: SparkSession): UserDefinedFunction =
    udf[Seq[String], Seq[Array[Byte]]](extractTokens)

  private def extractTokens(nodes: Seq[Array[Byte]]): Seq[String] = {
    timer.time({
      if (nodes == null) {
        Seq()
      } else {
        nodes.map(Node.parseFrom).map(_.token)
      }
    })
  }
}
Example 5
Source File: QueryXPathUDF.scala From jgit-spark-connector with Apache License 2.0
package tech.sourced.engine.udf

import gopkg.in.bblfsh.sdk.v1.uast.generated.Node
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.engine.util.Bblfsh

case object QueryXPathUDF extends CustomUDF {
  override val name = "queryXPath"

  override def apply(session: SparkSession): UserDefinedFunction = {
    val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session))
    udf[Seq[Array[Byte]], Seq[Array[Byte]], String]((nodes, query) =>
      queryXPath(nodes, query, configB.value))
  }

  private def queryXPath(nodes: Seq[Array[Byte]],
                         query: String,
                         config: Bblfsh.Config): Seq[Array[Byte]] = {
    timer.time({
      if (nodes == null) {
        return null
      }
      nodes.map(Node.parseFrom).flatMap(n => {
        val result = Bblfsh.filter(n, query, config)
        if (result == null) {
          None
        } else {
          result.toIterator
        }
      }).map(_.toByteArray)
    })
  }
}
Example 6
Source File: ExtractUASTsUDF.scala From jgit-spark-connector with Apache License 2.0
package tech.sourced.engine.udf

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.engine.util.Bblfsh

trait ExtractUASTsUDF {

  def extractUASTs(path: String,
                   content: Array[Byte],
                   lang: String = null,
                   config: Bblfsh.Config): Seq[Array[Byte]] = {
    if (content == null || content.isEmpty) {
      Seq()
    } else {
      Bblfsh.extractUAST(path, content, lang, config)
    }
  }
}

case object ExtractUASTsUDF extends CustomUDF with ExtractUASTsUDF {
  override val name = "extractUASTs"

  override def apply(session: SparkSession): UserDefinedFunction = {
    val configB = session.sparkContext.broadcast(Bblfsh.getConfig(session))
    udf[Seq[Array[Byte]], String, Array[Byte], String]((path, content, lang) =>
      extractUASTs(path, content, lang, configB.value))
  }
}
Example 7
Source File: ClassifyLanguagesUDF.scala From jgit-spark-connector with Apache License 2.0
package tech.sourced.engine.udf

import org.apache.spark.internal.Logging
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import tech.sourced.enry.Enry

  def getLanguage(isBinary: Boolean, path: String, content: Array[Byte]): Option[String] = {
    timer.time({
      if (isBinary) {
        None
      } else {
        val lang = try {
          Enry.getLanguage(path, content)
        } catch {
          case e@(_: RuntimeException | _: Exception) =>
            log.error(s"get language for file '$path' failed", e)
            null
        }
        if (null == lang || lang.isEmpty) None else Some(lang)
      }
    })
  }
}
Example 8
Source File: HashingTF.scala From spark1.52 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 9
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol),
      stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }
}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
}
Example 10
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }
}
Example 11
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_, lowerBound, upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}

object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
}
Example 12
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

  def setDim(value: Long): this.type = set(dim, value)

  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) {
        // If dimension is not set, look up the AttributeGroup in metadata as it comes from OdklCountVectorizer
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed)))
        .asInstanceOf[SparseMatrix])
    // the matrix of random vectors to construct the hash
    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f => if (f > 0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }
}
Example 13
Source File: URLElimminator.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("URLEliminator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), StringType)
    } else {
      schema
    }
  }
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
}
Example 14
Source File: MutualInformation.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import org.apache.spark.sql.functions.{col, sum, udf}
import org.apache.spark.sql.types.StructType
import Analyzers.COUNT_COL
import com.amazon.deequ.analyzers.runners.MetricCalculationException

  override def preconditions: Seq[StructType => Unit] = {
    Preconditions.exactlyNColumns(columns, 2) +: super.preconditions
  }

  override def toFailureMetric(exception: Exception): DoubleMetric = {
    metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
  }

  override def filterCondition: Option[String] = where
}

object MutualInformation {
  def apply(columnA: String, columnB: String): MutualInformation = {
    new MutualInformation(columnA :: columnB :: Nil)
  }
}
Example 15
Source File: HashingTF.scala From iolap with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}
Example 16
Source File: HashingTF.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 17
Source File: GroupedDatasetSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.api.python.PythonEvalType
import org.apache.spark.sql.catalyst.plans.logical.AnalysisBarrier
import org.apache.spark.sql.execution.python.PythonUDF
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types.{LongType, StructField, StructType}

class GroupedDatasetSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  private val scalaUDF = udf((x: Long) => { x + 1 })
  private lazy val datasetWithUDF = spark.range(1).toDF("s").select($"s", scalaUDF($"s"))

  private def assertContainsAnalysisBarrier(ds: Dataset[_], atLevel: Int = 1): Unit = {
    assert(atLevel >= 0)
    var children = Seq(ds.queryExecution.logical)
    (1 to atLevel).foreach { _ =>
      children = children.flatMap(_.children)
    }
    val barriers = children.collect {
      case ab: AnalysisBarrier => ab
    }
    assert(barriers.nonEmpty, s"Plan does not contain AnalysisBarrier at level $atLevel:\n" +
      ds.queryExecution.logical)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on RelationalGroupedDataset") {
    val groupByDataset = datasetWithUDF.groupBy()
    val rollupDataset = datasetWithUDF.rollup("s")
    val cubeDataset = datasetWithUDF.cube("s")
    val pivotDataset = datasetWithUDF.groupBy().pivot("s", Seq(1, 2))
    datasetWithUDF.cache()
    Seq(groupByDataset, rollupDataset, cubeDataset, pivotDataset).foreach { rgDS =>
      val df = rgDS.count()
      assertContainsAnalysisBarrier(df)
      assertCached(df)
    }

    val flatMapGroupsInRDF = datasetWithUDF.groupBy().flatMapGroupsInR(
      Array.emptyByteArray,
      Array.emptyByteArray,
      Array.empty,
      StructType(Seq(StructField("s", LongType))))
    val flatMapGroupsInPandasDF = datasetWithUDF.groupBy().flatMapGroupsInPandas(PythonUDF(
      "pyUDF",
      null,
      StructType(Seq(StructField("s", LongType))),
      Seq.empty,
      PythonEvalType.SQL_GROUPED_MAP_PANDAS_UDF,
      true))
    Seq(flatMapGroupsInRDF, flatMapGroupsInPandasDF).foreach { df =>
      assertContainsAnalysisBarrier(df, 2)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }

  test("SPARK-24373: avoid running Analyzer rules twice on KeyValueGroupedDataset") {
    val kvDasaset = datasetWithUDF.groupByKey(_.getLong(0))
    datasetWithUDF.cache()
    val mapValuesKVDataset = kvDasaset.mapValues(_.getLong(0)).reduceGroups(_ + _)
    val keysKVDataset = kvDasaset.keys
    val flatMapGroupsKVDataset = kvDasaset.flatMapGroups((k, _) => Seq(k))
    val aggKVDataset = kvDasaset.count()
    val otherKVDataset = spark.range(1).groupByKey(_ + 1)
    val cogroupKVDataset = kvDasaset.cogroup(otherKVDataset)((k, _, _) => Seq(k))
    Seq((mapValuesKVDataset, 1),
      (keysKVDataset, 2),
      (flatMapGroupsKVDataset, 2),
      (aggKVDataset, 1),
      (cogroupKVDataset, 2)).foreach { case (df, analysisBarrierDepth) =>
      assertContainsAnalysisBarrier(df, analysisBarrierDepth)
      assertCached(df)
    }
    datasetWithUDF.unpersist(true)
  }
}
Example 18
Source File: SampleRowsUniformly.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{ rand, udf }
import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter }
import play.api.libs.json._
import mimir.algebra._

case class SampleRowsUniformly(probability: Double) extends SamplingMode {
  override def toString = s"WITH PROBABILITY $probability"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val r = rand(seed)
    val f = udf { (x: Double) => x < probability }
    Filter(
      f(r).expr,
      plan
    )
  }

  def expressions: Seq[Expression] = Seq()
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = this

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleRowsUniformly.MODE),
    "probability" -> JsNumber(probability)
  ))
}

object SampleRowsUniformly {
  val MODE = "uniform_probability"

  def parseJson(json: Map[String, JsValue]): Option[SampleRowsUniformly] = {
    if (json("mode").as[String].equals(MODE)) {
      Some(SampleRowsUniformly(json("probability").as[Double]))
    } else {
      None
    }
  }
}
Example 19
Source File: SampleStratifiedOn.scala From mimir with Apache License 2.0
package mimir.algebra.sampling

import org.apache.spark.sql.functions.{rand, udf, col}
import org.apache.spark.sql.catalyst.plans.logical.{ LogicalPlan, Filter }
import play.api.libs.json._
import mimir.algebra._
import mimir.exec.spark.RAToSpark
import mimir.serialization.{ Json => MimirJson }

case class SampleStratifiedOn(column: ID, t: Type, strata: Map[PrimitiveValue, Double]) extends SamplingMode {
  val sparkStrata =
    strata.map { case (v, p) => RAToSpark.getNative(v, t) -> p }
      .toMap

  override def toString = s"ON $column WITH STRATA ${strata.map { case (v, p) => s"$v -> $p" }.mkString(" | ")}"

  def apply(plan: LogicalPlan, seed: Long): LogicalPlan = {
    // Adapted from Spark's df.stat.sampleBy method
    val c = col(column.id)
    val r = rand(seed)
    val f = udf { (stratum: Any, x: Double) => x < sparkStrata.getOrElse(stratum, 0.0) }
    Filter(
      f(c, r).expr,
      plan
    )
  }

  def expressions: Seq[Expression] = Seq(Var(column))
  def rebuildExpressions(x: Seq[Expression]): SamplingMode = {
    x(0) match {
      case Var(newColumn) => SampleStratifiedOn(newColumn, t, strata)
      case _ => throw new RAException("Internal Error: Rewriting stratification variable with arbitrary expression")
    }
  }

  def toJson: JsValue = JsObject(Map[String, JsValue](
    "mode" -> JsString(SampleStratifiedOn.MODE),
    "column" -> JsString(column.id),
    "type" -> MimirJson.ofType(t),
    "strata" -> JsArray(
      strata
        .toSeq
        .map { case (v, p) =>
          JsObject(Map[String, JsValue](
            "value" -> MimirJson.ofPrimitive(v),
            "probability" -> JsNumber(p)
          ))
        }
    )
  ))
}

object SampleStratifiedOn {
  val MODE = "stratified_on"

  def parseJson(json: Map[String, JsValue]): Option[SampleStratifiedOn] = {
    if (json("mode").as[String].equals(MODE)) {
      val t = MimirJson.toType(json("type"))
      Some(SampleStratifiedOn(
        ID(json("column").as[String]),
        t,
        json("strata")
          .as[Seq[Map[String, JsValue]]]
          .map { stratum =>
            MimirJson.toPrimitive(t, stratum("value")) -> stratum("probability").as[Double]
          }
          .toMap
      ))
    } else {
      None
    }
  }
}
Example 20
Source File: HashingTF.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 21
Source File: functions.scala From spark-fuzzy-matching with MIT License
package com.pb.fuzzy.matching

import org.apache.spark.sql.functions.udf

import com.rockymadden.stringmetric.phonetic.MetaphoneAlgorithm
import com.rockymadden.stringmetric.phonetic.MetaphoneMetric
import com.rockymadden.stringmetric.phonetic.NysiisAlgorithm
import com.rockymadden.stringmetric.phonetic.NysiisMetric
import com.rockymadden.stringmetric.phonetic.RefinedNysiisAlgorithm
import com.rockymadden.stringmetric.phonetic.RefinedNysiisMetric
import com.rockymadden.stringmetric.phonetic.RefinedSoundexAlgorithm
import com.rockymadden.stringmetric.phonetic.RefinedSoundexMetric
import com.rockymadden.stringmetric.phonetic.SoundexAlgorithm
import com.rockymadden.stringmetric.phonetic.SoundexMetric
import com.rockymadden.stringmetric.similarity.DiceSorensenMetric
import com.rockymadden.stringmetric.similarity.HammingMetric
import com.rockymadden.stringmetric.similarity.JaccardMetric
import com.rockymadden.stringmetric.similarity.JaroMetric
import com.rockymadden.stringmetric.similarity.JaroWinklerMetric
import com.rockymadden.stringmetric.similarity.LevenshteinMetric
import com.rockymadden.stringmetric.similarity.NGramMetric
import com.rockymadden.stringmetric.similarity.OverlapMetric
import com.rockymadden.stringmetric.similarity.RatcliffObershelpMetric
import com.rockymadden.stringmetric.similarity.WeightedLevenshteinMetric

  def metaphoneFn = udf { (document: String, document1: String) =>
    MetaphoneMetric.compare(document, document1)
  }

  def computeMetaphoneFn = udf { (document: String) =>
    MetaphoneAlgorithm.compute(document)
  }

  def nysiisFn = udf { (document: String, document1: String) =>
    NysiisMetric.compare(document, document1)
  }

  def computeNysiisFn = udf { (document: String) =>
    NysiisAlgorithm.compute(document)
  }

  def refinedNysiisFn = udf { (document: String, document1: String) =>
    RefinedNysiisMetric.compare(document, document1)
  }

  def computeRefinedNysiisFn = udf { (document: String) =>
    RefinedNysiisAlgorithm.compute(document)
  }

  def refinedSoundexFn = udf { (document: String, document1: String) =>
    RefinedSoundexMetric.compare(document, document1)
  }

  def computeRefinedSoundexFn = udf { (document: String) =>
    RefinedSoundexAlgorithm.compute(document)
  }

  def soundexFn = udf { (document: String, document1: String) =>
    SoundexMetric.compare(document, document1)
  }

  def computeSoundexFn = udf { (document: String) =>
    SoundexAlgorithm.compute(document)
  }
}
Example 22
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

  // use this if the window function misbehaves due to timezone e.g. BST
  // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
  // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

  // define a function to reformat the date field
  def convert(date: String): String = {
    val df1 = new SimpleDateFormat("dd/MM/yyyy")
    val dt = df1.parse(date)
    val df2 = new SimpleDateFormat("yyyy-MM-dd")
    df2.format(dt)
  }

  // create and save oil price changes
  def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

    val oilPriceDF = spark.
      read.
      option("header", "true").
      option("inferSchema", "true").
      csv(inputfile)

    val convertDateUDF = udf { (Date: String) => convert(Date) }

    val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

    // offset to start at beginning of week
    val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

    val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")

    // windowLastDF.show(20, false)

    val sortedWindow = Window.orderBy("window.start")

    val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
    val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)

    // lagLastColDF.show(20, false)

    val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
      var change = ((last - prevLast) compare 0).signum
      if (change == -1)
        change = 0
      change.toDouble
    }

    val findDateTwoDaysAgoUDF = udf { (date: String) =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal = Calendar.getInstance
      cal.setTime(dateFormat.parse(date))
      cal.add(Calendar.DATE, -3)
      dateFormat.format(cal.getTime)
    }

    val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
      lagLastColDF("last(PRICE)"),
      lagLastColDF("lastPrev(PRICE)")
    )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))

    // oilPriceChangeDF.show(20, false)

    oilPriceChangeDF.select("label", "commonFriday").
      write.
      format("com.databricks.spark.csv").
      option("header", "true").
      //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      save(outputfile)
  }
}
Example 23
Source File: functions.scala From Hands-On-Deep-Learning-with-Apache-Spark with MIT License
package com.databricks.spark.corenlp

import java.util.Properties

import scala.collection.JavaConverters._

import edu.stanford.nlp.ling.CoreAnnotations
import edu.stanford.nlp.neural.rnn.RNNCoreAnnotations
import edu.stanford.nlp.pipeline.{Annotation, CleanXmlAnnotator, StanfordCoreNLP, TokenizerAnnotator}
import edu.stanford.nlp.pipeline.CoreNLPProtos.Sentiment
import edu.stanford.nlp.sentiment.SentimentCoreAnnotations
import edu.stanford.nlp.simple.{Document, Sentence}
import edu.stanford.nlp.util.Quadruple

import org.apache.spark.sql.functions.udf

object functions {

  @transient private var sentimentPipeline: StanfordCoreNLP = _

  private def getOrCreateSentimentPipeline(): StanfordCoreNLP = {
    if (sentimentPipeline == null) {
      val props = new Properties()
      props.setProperty("annotators", "tokenize, ssplit, parse, sentiment")
      sentimentPipeline = new StanfordCoreNLP(props)
    }
    sentimentPipeline
  }

  private case class OpenIE(subject: String, relation: String, target: String, confidence: Double) {
    def this(quadruple: Quadruple[String, String, String, java.lang.Double]) =
      this(quadruple.first, quadruple.second, quadruple.third, quadruple.fourth)
  }

  private case class CorefMention(sentNum: Int, startIndex: Int, mention: String)

  private case class CorefChain(representative: String, mentions: Seq[CorefMention])

  private case class SemanticGraphEdge(
    source: String,
    sourceIndex: Int,
    relation: String,
    target: String,
    targetIndex: Int,
    weight: Double)

  def sentiment = udf { sentence: String =>
    val pipeline = getOrCreateSentimentPipeline()
    val annotation = pipeline.process(sentence)
    val tree = annotation.get(classOf[CoreAnnotations.SentencesAnnotation])
      .asScala
      .head
      .get(classOf[SentimentCoreAnnotations.SentimentAnnotatedTree])
    RNNCoreAnnotations.getPredictedClass(tree)
  }
}
Example 24
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}

object DatasetUtil {
  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String],
                                         colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
    * Cast a column in a Dataset to Vector type.
    *
    * The supported data types of the input column are
    * - Vector
    * - float/double type Array.
    *
    * Note: The returned column does not have Metadata.
    *
    * @param dataset input DataFrame
    * @param colName column name.
    * @return Vector column
    */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }
}
Example 25
Source File: UDFTest.scala From apache-spark-test with Apache License 2.0
package com.github.dnvriend.spark.udf

import com.github.dnvriend.TestSpec

class UDFTest extends TestSpec {
  it should "uppercase using a user defined function or UDF" in withSparkSession { spark =>
    import spark.implicits._

    // lets create a DataFrame
    val df = Seq((0, "hello"), (1, "world")).toDF("id", "text")

    df.as[(Int, String)].collect() shouldBe Seq(
      (0, "hello"),
      (1, "world")
    )

    // define a plain old Scala Function
    val upper: String => String = _.toUpperCase + "- foo"

    // create a User Defined Function
    import org.apache.spark.sql.functions.udf
    val upperUDF = udf(upper)

    // apply the user defined function
    df.withColumn("upper", upperUDF('text))
      .as[(Int, String, String)].collect shouldBe Seq(
        (0, "hello", "HELLO- foo"),
        (1, "world", "WORLD- foo")
      )

    // the UDF can be used in a query
    // first register a temp view so that
    // we can reference the DataFrame
    df.createOrReplaceTempView("df")
    // register the UDF by name 'upperUDF'
    spark.udf.register("upperUDF", upper)
    // use the UDF in a SQL-Query
    spark.sql("SELECT *, upperUDF(text) FROM df")
      .as[(Int, String, String)].collect shouldBe Seq(
        (0, "hello", "HELLO- foo"),
        (1, "world", "WORLD- foo")
      )
  }
}
Example 26
Source File: DataFrameTfrConverter.scala From ecosystem with Apache License 2.0
package org.tensorflow.spark.datasources.tfrecords.udf

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.udf
import org.tensorflow.spark.datasources.tfrecords.serde.DefaultTfRecordRowEncoder

object DataFrameTfrConverter {
  def getRowToTFRecordExampleUdf: UserDefinedFunction = udf(rowToTFRecordExampleUdf _)

  private def rowToTFRecordExampleUdf(row: Row): Array[Byte] = {
    DefaultTfRecordRowEncoder.encodeExample(row).toByteArray
  }

  def getRowToTFRecordSequenceExampleUdf: UserDefinedFunction = udf(rowToTFRecordSequenceExampleUdf _)

  private def rowToTFRecordSequenceExampleUdf(row: Row): Array[Byte] = {
    DefaultTfRecordRowEncoder.encodeSequenceExample(row).toByteArray
  }
}
Example 27
Source File: functions.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp

import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions.{array, col, explode, udf}
import org.apache.spark.sql.types.DataType

import scala.reflect.runtime.universe._

object functions {

  implicit class FilterAnnotations(dataset: DataFrame) {
    def filterByAnnotationsCol(column: String, function: Seq[Annotation] => Boolean): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.filter(func(col(column)).as(column, meta))
    }
  }

  def mapAnnotations[T](function: Seq[Annotation] => T, outputType: DataType): UserDefinedFunction = udf(
    { annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
    }, outputType)

  def mapAnnotationsStrict(function: Seq[Annotation] => Seq[Annotation]): UserDefinedFunction = udf {
    annotatorProperties: Seq[Row] =>
      function(annotatorProperties.map(Annotation(_)))
  }

  implicit class MapAnnotations(dataset: DataFrame) {
    def mapAnnotationsCol[T: TypeTag](column: String, outputCol: String, function: Seq[Annotation] => T): DataFrame = {
      val meta = dataset.schema(column).metadata
      val func = udf { annotatorProperties: Seq[Row] =>
        function(annotatorProperties.map(Annotation(_)))
      }
      dataset.withColumn(outputCol, func(col(column)).as(outputCol, meta))
    }
  }

  implicit class EachAnnotations(dataset: DataFrame) {

    import dataset.sparkSession.implicits._

    def eachAnnotationsCol[T: TypeTag](column: String, function: Seq[Annotation] => Unit): Unit = {
      dataset.select(column).as[Array[Annotation]].foreach(function(_))
    }
  }

  implicit class ExplodeAnnotations(dataset: DataFrame) {
    def explodeAnnotationsCol[T: TypeTag](column: String, outputCol: String): DataFrame = {
      val meta = dataset.schema(column).metadata
      dataset.
        withColumn(outputCol, explode(col(column))).
        withColumn(outputCol, array(col(outputCol)).as(outputCol, meta))
    }
  }
}
Example 28
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

  @Since("2.0.0")
  def setBinary(value: Boolean): this.type = set(binary, value)

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary))
    // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion.
    val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  @Since("1.4.0")
  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  @Since("1.4.1")
  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
}
Example 29
Source File: FilterTopFeaturesProcess.scala From incubator-s2graph with Apache License 2.0
package org.apache.s2graph.s2jobs.wal.process

import org.apache.s2graph.s2jobs.task.TaskConf
import org.apache.s2graph.s2jobs.wal.WalLogAgg
import org.apache.s2graph.s2jobs.wal.transformer.{DefaultTransformer, Transformer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import play.api.libs.json.{JsObject, Json}

object FilterTopFeaturesProcess {
  private var validFeatureHashKeys: Set[Long] = null

  def getValidFeatureHashKeys(validFeatureHashKeysBCast: Broadcast[Array[Long]]): Set[Long] = {
    if (validFeatureHashKeys == null) {
      validFeatureHashKeys = validFeatureHashKeysBCast.value.toSet
    }
    validFeatureHashKeys
  }

  def collectDistinctFeatureHashes(ss: SparkSession, filteredDict: DataFrame): Array[Long] = {
    import ss.implicits._

    val featureHashUDF = udf((dim: String, value: String) => WalLogAgg.toFeatureHash(dim, value))

    filteredDict.withColumn("featureHash", featureHashUDF(col("dim"), col("value")))
      .select("featureHash")
      .distinct().as[Long].collect()
  }

  def filterTopKsPerDim(dict: DataFrame,
                        maxRankPerDim: Broadcast[Map[String, Int]],
                        defaultMaxRank: Int): DataFrame = {
    val filterUDF = udf((dim: String, rank: Long) => {
      rank < maxRankPerDim.value.getOrElse(dim, defaultMaxRank)
    })

    dict.filter(filterUDF(col("dim"), col("rank")))
  }

  def filterWalLogAgg(ss: SparkSession,
                      walLogAgg: Dataset[WalLogAgg],
                      transformers: Seq[Transformer],
                      validFeatureHashKeysBCast: Broadcast[Array[Long]]) = {
    import ss.implicits._
    walLogAgg.mapPartitions { iter =>
      val validFeatureHashKeys = getValidFeatureHashKeys(validFeatureHashKeysBCast)

      iter.map { walLogAgg =>
        WalLogAgg.filterProps(walLogAgg, transformers, validFeatureHashKeys)
      }
    }
  }
}

class FilterTopFeaturesProcess(taskConf: TaskConf) extends org.apache.s2graph.s2jobs.task.Process(taskConf) {

  import FilterTopFeaturesProcess._

  override def execute(ss: SparkSession, inputMap: Map[String, DataFrame]): DataFrame = {
    import ss.implicits._

    val maxRankPerDim = taskConf.options.get("maxRankPerDim").map { s =>
      Json.parse(s).as[JsObject].fields.map { case (k, jsValue) =>
        k -> jsValue.as[Int]
      }.toMap
    }
    val maxRankPerDimBCast = ss.sparkContext.broadcast(maxRankPerDim.getOrElse(Map.empty))

    val defaultMaxRank = taskConf.options.get("defaultMaxRank").map(_.toInt)

    val featureDict = inputMap(taskConf.options("featureDict"))
    val walLogAgg = inputMap(taskConf.options("walLogAgg")).as[WalLogAgg]

    val transformers = TaskConf.parseTransformers(taskConf)

    val filteredDict = filterTopKsPerDim(featureDict, maxRankPerDimBCast, defaultMaxRank.getOrElse(Int.MaxValue))
    val validFeatureHashKeys = collectDistinctFeatureHashes(ss, filteredDict)
    val validFeatureHashKeysBCast = ss.sparkContext.broadcast(validFeatureHashKeys)

    filterWalLogAgg(ss, walLogAgg, transformers, validFeatureHashKeysBCast).toDF()
  }

  override def mandatoryOptions: Set[String] = Set("featureDict", "walLogAgg")
}
Example 30
Source File: Grok.scala From incubator-s2graph with Apache License 2.0
package org.apache.s2graph.s2jobs.udfs

import org.apache.s2graph.s2jobs.utils.GrokHelper
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DataType, StructType}
import play.api.libs.json.{JsValue, Json}

class Grok extends Udf {
  import org.apache.spark.sql.functions.udf

  def register(ss: SparkSession, name: String, options: Map[String, String]) = {
    // grok
    val patternDir = options.getOrElse("patternDir", "/tmp")
    val patternFiles = options.getOrElse("patternFiles", "").split(",").toSeq
    val patterns = Json.parse(options.getOrElse("patterns", "{}")).asOpt[Map[String, String]].getOrElse(Map.empty)
    val compilePattern = options("compilePattern")
    val schemaOpt = options.get("schema")

    patternFiles.foreach { patternFile =>
      ss.sparkContext.addFile(s"${patternDir}/${patternFile}")
    }

    implicit val grok = GrokHelper.getGrok(name, patternFiles, patterns, compilePattern)

    val f = if (schemaOpt.isDefined) {
      val schema = DataType.fromJson(schemaOpt.get)
      implicit val keys: Array[String] = schema.asInstanceOf[StructType].fieldNames
      udf(GrokHelper.grokMatchWithSchema _, schema)
    } else {
      udf(GrokHelper.grokMatch _)
    }

    ss.udf.register(name, f)
  }
}
Example 31
Source File: FlintTestData.scala From flint with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }

  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }

  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }
  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }

  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }

  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
}
Example 32
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector}
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class ReebDiagramTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")
  val cover = new Cover()
    .setExploding(true)
    .setInputCols("double", "integer")
    .setOutputCol("cover_id")

  property("argument topTreeSize must be positive") {
    intercept[IllegalArgumentException] {
      val reeb = new ReebDiagram()
//        .setIdCol("id")
//        .setCoverCol("cover_id")
//        .setFeaturesCol("vector")
//        .setOutputCol("cluster_id")
        .setTopTreeSize(0)
    }
  }

  property("placeholder") {
    val reeb = new ReebDiagram()
      .setK(15)
      .setIdCol("id")
      .setCoverCol("cover_id")
      .setFeaturesCol("vector")
      .setOutputCol("cluster_id")
    forAll(dataframeGen.arbitrary) { df =>
      val assembled = assembler.transform(df)
      whenever(
        assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) {
        val transformed = cover
          .fit(assembled)
          .transform(assembled)
        val result = reeb
          .setTopTreeSize(1)
          .fit(transformed)
          .transform(transformed)
//        result.show()
      }
    }
  }
}
Example 33
Source File: CoverTest.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.functions.{col, explode, udf}
import org.scalatest.{PropSpec, Matchers, GivenWhenThen}
import org.scalatest.prop.GeneratorDrivenPropertyChecks

class CoverTest
    extends FeaturePropSpec
    with GivenWhenThen
    with GeneratorDrivenPropertyChecks
    with Matchers {
  val assembler = new VectorAssembler()
    .setInputCols(Array("double", "integer"))
    .setOutputCol("vector")

  property("argument numSplits must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setNumSplits(0)
    }
  }

  property("argument overlapRatio must be positive") {
    intercept[IllegalArgumentException] {
      val cover = new Cover()
        .setInputCols("double")
        .setOutputCol("cover_ids")
        .setOverlapRatio(0.0)
    }
  }

  property("cover estimator changes nothing with the original dataframe") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        val covered = cover
          .fit(transformed)
          .transform(transformed)
          .drop("cover_ids")
          .except(transformed)
          .count() should be(0)
      }
    }
  }

  property("generated cover covers all range of specified columns") {
    val cover = new Cover()
      .setInputCols("double", "integer", "vector")
      .setOutputCol("cover_ids")
    val uncovered = udf { xs: Seq[Long] => xs.length == 0 }
    forAll(dataframeGen.arbitrary) { df =>
      val transformed = assembler.transform(df)
      whenever(
        transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) {
        cover
          .fit(transformed)
          .transform(transformed)
          .where(uncovered(col("cover_ids")))
          .count() should be(0)
      }
    }
  }

  property("Cover is readable/writable") {
    val cover = new Cover()
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    testDefaultReadWrite(cover)
  }

  property("CoverModel is readable/writable") {
    val model = new CoverModel("myCoverModel",
                               Vectors.dense(-1.0, 0.0),
                               Vectors.dense(1.0, 10.0))
      .setInputCols("double", "integer")
      .setOutputCol("cover_ids")
    val newModel = testDefaultReadWrite(model)
    assert(newModel.min === model.min)
    assert(newModel.max === model.max)
  }
}
Example 34
Source File: MathUnary.scala From mleap with Apache License 2.0
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf

    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString
      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }
}
Example 35
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
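Both udfs above take an ml Vector column and return an array. A small sketch of the same "rank a probability vector and emit labels" idea with a hand-rolled ranking in place of the MultinomialLabelerModel; the labels, data and object name are illustrative assumptions.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object TopLabelsSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("top-labels").getOrCreate()
    import spark.implicits._

    val labels = Array("cat", "dog", "bird")
    val df = Seq(Tuple1(Vectors.dense(0.1, 0.7, 0.2))).toDF("probabilities")

    // Rank the class probabilities and return the matching labels, best first.
    val topLabels = udf { v: Vector =>
      v.toArray.zipWithIndex.sortBy(-_._1).map { case (_, i) => labels(i) }
    }

    df.withColumn("labels", topLabels(col("probabilities"))).show(false)
    spark.stop()
  }
}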
Example 36
Source File: PointSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan import com.fasterxml.jackson.databind.ObjectMapper import org.apache.spark.sql.types._ import org.scalatest.FunSuite class PointSuite extends FunSuite with TestSparkContext { test("bounding box") { val point = Point(1.0, 1.0) val BoundingBox(xmin, ymin, xmax, ymax) = point.boundingBox assert(xmin === 1.0) assert(ymin === 1.0) assert(xmax === 1.0) assert(ymax === 1.0) } test("serialization") { val point = Point(1.0, 1.0) val pointUDT = new PointUDT val BoundingBox(xmin, ymin, xmax, ymax) = point.boundingBox val row = pointUDT.serialize(point) assert(row.getInt(0) === point.getType()) assert(row.getDouble(1) === xmin) assert(row.getDouble(2) === ymin) assert(row.getDouble(3) === xmax) assert(row.getDouble(4) === ymax) val serializedPoint = pointUDT.deserialize(row) assert(point.equals(serializedPoint)) } test("point udf") { val sqlContext = this.sqlContext import sqlContext.implicits._ val points = sc.parallelize(Seq((-1.0, -1.0), (-1.0, 1.0), (1.0, -1.0))).toDF("x", "y") import org.apache.spark.sql.functions.udf val toPointUDF = udf{(x:Double,y:Double) => Point(x,y) } val point = points.withColumn("point", toPointUDF('x, 'y)) .select('point) .first()(0) .asInstanceOf[Point] assert(point.getX() === -1.0) assert(point.getY() === -1.0) } test("jackson serialization") { val s = new ObjectMapper().writeValueAsString(Point(1.0, 1.0)) assert(s.contains("boundingBox")) assert(s.contains("x")) assert(s.contains("y")) } test("within circle") { assert(Point(0.0, 0.0) withinCircle (Point(0.5, 0.5), 0.75)) assert(!(Point(0.0, 0.0) withinCircle (Point(0.5, 0.5), 0.5))) } test("buffer point") { val polygon = Point(0.0, 1.0).buffer(0.5) assert(polygon.getNumRings() === 1) // check that [0.0, 0.75] is within this polygon assert(polygon.contains(Point(0.0, 0.75))) // check that [0.4, 1.0] is within this polygon assert(polygon.contains(Point(0.4, 1.0))) // check that [0.6, 1.0] is outside this polygon assert(!polygon.contains(Point(0.6, 1.0))) } }
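The "point udf" test relies on a udf that returns a custom UDT-backed type. The same column-to-object pattern works with any case class, since Spark maps a udf's case-class return value to a struct column. A small sketch with a plain case class standing in for magellan's Point; the names and data are illustrative assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

// A plain case class standing in for a geometry type; Spark renders it as a struct column.
case class XY(x: Double, y: Double)

object PointUdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("point-udf").getOrCreate()
    import spark.implicits._

    val points = Seq((-1.0, -1.0), (-1.0, 1.0), (1.0, -1.0)).toDF("x", "y")

    val toPoint = udf { (x: Double, y: Double) => XY(x, y) }

    points.withColumn("point", toPoint($"x", $"y")).show(false)
    spark.stop()
  }
}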
Example 37
Source File: ShortestPaths.scala From graphframes with Apache License 2.0 | 5 votes |
package org.graphframes.lib import java.util import scala.collection.JavaConverters._ import org.apache.spark.graphx.{lib => graphxlib} import org.apache.spark.sql.{Column, DataFrame, Row} import org.apache.spark.sql.api.java.UDF1 import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{IntegerType, MapType} import org.graphframes.GraphFrame def landmarks(value: util.ArrayList[Any]): this.type = { landmarks(value.asScala) } def run(): DataFrame = { ShortestPaths.run(graph, check(lmarks, "landmarks")) } } private object ShortestPaths { private def run(graph: GraphFrame, landmarks: Seq[Any]): DataFrame = { val idType = graph.vertices.schema(GraphFrame.ID).dataType val longIdToLandmark = landmarks.map(l => GraphXConversions.integralId(graph, l) -> l).toMap val gx = graphxlib.ShortestPaths.run( graph.cachedTopologyGraphX, longIdToLandmark.keys.toSeq.sorted).mapVertices { case (_, m) => m.toSeq } val g = GraphXConversions.fromGraphX(graph, gx, vertexNames = Seq(DISTANCE_ID)) val distanceCol: Column = if (graph.hasIntegralIdType) { // It seems there are no easy way to convert a sequence of pairs into a map val mapToLandmark = udf { distances: Seq[Row] => distances.map { case Row(k: Long, v: Int) => k -> v }.toMap } mapToLandmark(g.vertices(DISTANCE_ID)) } else { val func = new UDF1[Seq[Row], Map[Any, Int]] { override def call(t1: Seq[Row]): Map[Any, Int] = { t1.map { case Row(k: Long, v: Int) => longIdToLandmark(k) -> v }.toMap } } val mapToLandmark = udf(func, MapType(idType, IntegerType, false)) mapToLandmark(col(DISTANCE_ID)) } val cols = graph.vertices.columns.map(col) :+ distanceCol.as(DISTANCE_ID) g.vertices.select(cols: _*) } private val DISTANCE_ID = "distances" }
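The interesting udf here folds a column of (id, distance) structs into a MapType column, reading each struct as a Row. That trick in isolation, as a hedged sketch; the data and object name are illustrative assumptions.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.{col, udf}

object PairsToMapSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("pairs-to-map").getOrCreate()
    import spark.implicits._

    // Distances come back as a sequence of (landmarkId, distance) pairs per vertex.
    val df = Seq((1L, Seq((2L, 1), (3L, 2)))).toDF("id", "distances")

    // Fold the pair structs into a map column; structs arrive in the udf as Rows.
    val toMap = udf { pairs: Seq[Row] =>
      pairs.map { case Row(k: Long, v: Int) => k -> v }.toMap
    }

    df.withColumn("distances", toMap(col("distances"))).show(false)
    spark.stop()
  }
}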
Example 38
Source File: HashingTF.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
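The transform above bridges the old mllib HashingTF into a DataFrame pipeline by wrapping it in a udf over the token array. A stripped-down sketch of the same hashing-trick idea without the mllib dependency; the bucket count, hashing scheme, column names and object name are illustrative assumptions, not Spark's actual implementation.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}

object HashingTfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("hashing-tf-sketch").getOrCreate()
    import spark.implicits._

    val numFeatures = 16
    val df = Seq(Seq("spark", "udf", "spark")).toDF("terms")

    // Hash each term into a bucket, count occurrences, and emit a sparse vector.
    val hashTf = udf { terms: Seq[String] =>
      val counts = terms
        .groupBy(t => ((t.hashCode % numFeatures) + numFeatures) % numFeatures)
        .map { case (idx, ts) => (idx, ts.size.toDouble) }
        .toSeq.sortBy(_._1)
      Vectors.sparse(numFeatures, counts)
    }

    df.withColumn("features", hashTf(col("terms"))).show(false)
    spark.stop()
  }
}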
Example 39
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
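The Cleaner wraps per-line normalization (traditional/simplified and full/half-width conversion) in a single string-to-string udf and then filters out short lines. A dependency-free sketch of the same shape, with trivial whitespace normalization standing in for HanLP and BCConvert; the data, threshold and object name are illustrative assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, length, udf}

object CleanerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("cleaner-sketch").getOrCreate()
    import spark.implicits._

    val minLineLen = 3
    val df = Seq("  Hello World  ", "ok", "  Spark  UDFs  ").toDF("text")

    // Placeholder normalization: trim and collapse whitespace (HanLP/BCConvert would go here).
    val clean = udf { line: String => line.trim.replaceAll("\\s+", " ") }

    df.withColumn("cleaned", clean(col("text")))
      .filter(length(col("cleaned")) >= minLineLen)
      .show(false)
    spark.stop()
  }
}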
Example 40
Source File: HigherOrderKind.scala From cleanframes with Apache License 2.0 | 5 votes |
package cleanframes.instances import cleanframes.Cleaner import org.apache.spark.sql.functions.udf import scala.reflect.runtime.universe.TypeTag trait HigherOrderKind { implicit def stringHigherOrder[A, B[_]](implicit aTag: TypeTag[A], bTag: TypeTag[B[A]], func: String => B[A]): Cleaner[B[A]] = Cleaner.materialize { (frame, name, alias) => List( udf(func).apply(frame.col(name.get)) as alias.get ) } }
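The instance above builds a Cleaner by lifting an implicit `String => B[A]` function into a udf. The Spark piece it relies on is simply that `udf` accepts any suitably typed function value, so a total string-to-Option parser becomes a null-tolerant column transform. A small sketch of that piece on its own; the parser, data and object name are illustrative assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}
import scala.util.Try

object LiftedParserSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lifted-parser").getOrCreate()
    import spark.implicits._

    // A total String => Option[Int]: malformed input becomes None instead of failing the job.
    val parseInt: String => Option[Int] = s => Try(s.trim.toInt).toOption

    val df = Seq("1", "2", "oops").toDF("raw")

    // udf accepts the function value directly; None surfaces as null in the output column.
    df.withColumn("parsed", udf(parseInt).apply(col("raw"))).show()
    spark.stop()
  }
}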
Example 41
Source File: ServingUDFs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package org.apache.spark.sql.execution.streaming import com.microsoft.ml.spark.io.http.HTTPResponseData import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response} import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{lit, struct, to_json, udf} import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, Row} import scala.util.Try object ServingUDFs { private def jsonReply(c: Column) = string_to_response(to_json(c)) def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = { dt match { case NullType => empty_response(code, reason) case StringType => string_to_response(data, code, reason) case BinaryType => binary_to_response(data) case _: StructType => jsonReply(data) case _: MapType => jsonReply(data) case at: ArrayType => at.elementType match { case _: StructType => jsonReply(data) case _: MapType => jsonReply(data) case _ => jsonReply(struct(data)) } case _ => jsonReply(struct(data)) } } private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = { if (Option(reply).isEmpty || Option(id).isEmpty) { null.asInstanceOf[Boolean] //scalastyle:ignore null } else { Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply))) .toOption.isDefined } } def sendReplyUDF: UserDefinedFunction = { val toData = HTTPResponseData.makeFromRowConverter udf(sendReplyHelper(toData) _, BooleanType) } }
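`sendReplyUDF` uses the `udf(f, dataType)` overload, which skips TypeTag-based inference and declares the return type explicitly; that is handy when the function consumes raw `Row`s. A minimal sketch of that overload; the columns, data and object name are illustrative assumptions.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.sql.types.BooleanType

object ExplicitTypeUdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("explicit-type-udf").getOrCreate()
    import spark.implicits._

    val df = Seq((200, "Success"), (500, "Boom")).toDF("code", "reason")

    // The function sees the packed struct as a Row, so the return type must be declared explicitly.
    val isOk = udf((r: Row) => r.getInt(0) >= 200 && r.getInt(0) < 300, BooleanType)

    df.withColumn("ok", isOk(struct(col("code"), col("reason")))).show()
    spark.stop()
  }
}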
Example 42
Source File: PageSplitter.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.featurize.text

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable}
import org.apache.spark.ml._
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

object PageSplitter extends DefaultParamsReadable[PageSplitter]

class PageSplitter(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol with Wrappable with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("PageSplitter"))

  setDefault(outputCol, uid + "_output")

  val maximumPageLength = new IntParam(this, "maximumPageLength",
    "the maximum number of characters to be in a page")

  def setMaximumPageLength(v: Int): this.type = set(maximumPageLength, v)

  def getMaximumPageLength: Int = $(maximumPageLength)

  val minimumPageLength = new IntParam(this, "minimumPageLength",
    "the minimum number of characters to have on a page in order to preserve word boundaries")

  def setMinimumPageLength(v: Int): this.type = set(minimumPageLength, v)

  def getMinimumPageLength: Int = $(minimumPageLength)

  val boundaryRegex = new Param[String](this, "boundaryRegex", "how to split into words")

  def setBoundaryRegex(v: String): this.type = set(boundaryRegex, v)

  def getBoundaryRegex: String = $(boundaryRegex)

  setDefault(maximumPageLength -> 5000, minimumPageLength -> 4500, boundaryRegex -> "\\s")

  def split(textOpt: String): Seq[String] = {
    Option(textOpt).map { text =>
      if (text.length < getMaximumPageLength) {
        Seq(text)
      } else {
        val lengths = text
          .split(getBoundaryRegex)
          .map(_.length)
          .flatMap(l => List(l, 1))
          .dropRight(1)
        val indices = lengths.scanLeft((0, 0, Nil: List[Int])) { case ((total, count, _), l) =>
          if (count + l < getMaximumPageLength) {
            (total + l, count + l, Nil)
          } else if (count > getMinimumPageLength) {
            (total + l, l, List(total))
          } else {
            val firstPageChars = getMaximumPageLength - count
            val firstPage = firstPageChars + total
            val remainingChars = l - firstPageChars

            val numPages = remainingChars / getMaximumPageLength
            val remainder = remainingChars - getMaximumPageLength * numPages
            val pages = List(firstPage) :::
              (1 to numPages).map(i => total + firstPageChars + getMaximumPageLength * i).toList
            (total + l, remainder, pages)
          }
        }.flatMap(_._3)

        val pages = (List(0) ::: indices.toList ::: List(text.length))
          .sliding(2)
          .map { case List(start, end) => text.substring(start, end) }
          .toSeq
        pages
      }
    }.orNull
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.toDF().withColumn(getOutputCol, udf(split _, ArrayType(StringType))(col(getInputCol)))
  }

  override def copy(extra: ParamMap): PageSplitter = defaultCopy(extra)

  def transformSchema(schema: StructType): StructType = {
    assert(schema(getInputCol).dataType == StringType)
    schema.add(getOutputCol, ArrayType(StringType))
  }
}
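Most of the class above is paging arithmetic; the Spark-facing part is just `udf(split _, ArrayType(StringType))` applied to the input column. A reduced sketch that pages purely on a fixed width shows the wiring; the page size, data and object name are illustrative assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StringType}

object FixedWidthPagerSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("fixed-width-pager").getOrCreate()
    import spark.implicits._

    val pageLength = 10

    // Naive paging: cut every pageLength characters, ignoring word boundaries; null stays null.
    def split(text: String): Seq[String] =
      Option(text).map(_.grouped(pageLength).toList).orNull

    val df = Seq("the quick brown fox jumps over the lazy dog").toDF("text")

    df.withColumn("pages", udf(split _, ArrayType(StringType))(col("text"))).show(false)
    spark.stop()
  }
}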
Example 43
Source File: Lambda.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.SparkContext import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object Lambda extends ComplexParamsReadable[Lambda] { def apply(f: Dataset[_] => DataFrame): Lambda = { new Lambda().setTransform(f) } } class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("Lambda")) val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function") def setTransform(f: Dataset[_] => DataFrame): this.type = { set(transformFunc, udf(f, StringType)) } def getTransform: Dataset[_] => DataFrame = { $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame] } val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation") def setTransformSchema(f: StructType => StructType): this.type = { set(transformSchemaFunc, udf(f, StringType)) } def getTransformSchema: StructType => StructType = { $(transformSchemaFunc).f.asInstanceOf[StructType => StructType] } override def transform(dataset: Dataset[_]): DataFrame = { getTransform(dataset) } def transformSchema(schema: StructType): StructType = { if (get(transformSchemaFunc).isEmpty) { val sc = SparkContext.getOrCreate() val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema) transform(df).schema } else { getTransformSchema(schema) } } def copy(extra: ParamMap): Lambda = defaultCopy(extra) }
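Here `udf(...)` is used only as a serializable container: the UDFParam holds an arbitrary `Dataset[_] => DataFrame` closure rather than a column function. Usage is just passing a lambda, roughly as sketched below; the data and the dropDuplicates step are illustrative assumptions.

import com.microsoft.ml.spark.stages.Lambda
import org.apache.spark.sql.SparkSession

object LambdaUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("lambda-usage").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (1, "a"), (2, "b")).toDF("id", "value")

    // Wrap an arbitrary DataFrame-to-DataFrame step so it can sit inside an ML pipeline.
    val dedupe = Lambda(ds => ds.toDF().dropDuplicates())

    dedupe.transform(df).show()
    spark.stop()
  }
}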
Example 44
Source File: udfs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.Column import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.DoubleType import scala.collection.mutable //scalastyle:off object udfs { def get_value_at(colName: String, i: Int): Column = { udf({ vec: org.apache.spark.ml.linalg.Vector => vec(i) }, DoubleType)(col(colName)) } val to_vector: UserDefinedFunction = udf({ arr: Seq[Double] => Vectors.dense(arr.toArray) }, VectorType) def to_vector(colName: String): Column = to_vector(col(colName)) }
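These helpers are meant to be dropped straight into `select`/`withColumn` expressions. A short usage sketch under that assumption; the column names, data and object name are illustrative.

import com.microsoft.ml.spark.stages.udfs.{get_value_at, to_vector}
import org.apache.spark.sql.SparkSession

object UdfsUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("udfs-usage").getOrCreate()
    import spark.implicits._

    val df = Seq(Seq(1.0, 2.0, 3.0)).toDF("values")

    df.withColumn("vec", to_vector("values"))       // array<double> -> ml Vector
      .withColumn("second", get_value_at("vec", 1)) // pull out element 1 as a double
      .show(false)
    spark.stop()
  }
}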
Example 45
Source File: VowpalWabbitClassifier.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.schema.DatasetExtensions import org.apache.spark.ml.ComplexParamsReadable import org.apache.spark.ml.param._ import org.apache.spark.ml.util._ import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql._ import org.apache.spark.sql.functions.{col, udf} import org.vowpalwabbit.spark.VowpalWabbitExample import com.microsoft.ml.spark.core.schema.DatasetExtensions._ import scala.math.exp object VowpalWabbitClassifier extends DefaultParamsReadable[VowpalWabbitClassifier] @InternalWrapper class VowpalWabbitClassifier(override val uid: String) extends ProbabilisticClassifier[Row, VowpalWabbitClassifier, VowpalWabbitClassificationModel] with VowpalWabbitBase { def this() = this(Identifiable.randomUID("VowpalWabbitClassifier")) // to support Grid search we need to replicate the parameters here... val labelConversion = new BooleanParam(this, "labelConversion", "Convert 0/1 Spark ML style labels to -1/1 VW style labels. Defaults to true.") setDefault(labelConversion -> true) def getLabelConversion: Boolean = $(labelConversion) def setLabelConversion(value: Boolean): this.type = set(labelConversion, value) override protected def train(dataset: Dataset[_]): VowpalWabbitClassificationModel = { val model = new VowpalWabbitClassificationModel(uid) .setFeaturesCol(getFeaturesCol) .setAdditionalFeatures(getAdditionalFeatures) .setPredictionCol(getPredictionCol) .setProbabilityCol(getProbabilityCol) .setRawPredictionCol(getRawPredictionCol) val finalDataset = if (!getLabelConversion) dataset else { val inputLabelCol = dataset.withDerivativeCol("label") dataset .withColumnRenamed(getLabelCol, inputLabelCol) .withColumn(getLabelCol, col(inputLabelCol) * 2 - 1) } trainInternal(finalDataset, model) } override def copy(extra: ParamMap): VowpalWabbitClassifier = defaultCopy(extra) } // Preparation for multi-class learning, though it no fun as numClasses is spread around multiple reductions @InternalWrapper class VowpalWabbitClassificationModel(override val uid: String) extends ProbabilisticClassificationModel[Row, VowpalWabbitClassificationModel] with VowpalWabbitBaseModel { def numClasses: Int = 2 override def transform(dataset: Dataset[_]): DataFrame = { val df = transformImplInternal(dataset) // which mode one wants to use depends a bit on how this should be deployed // 1. if you stay in spark w/o link=logistic is probably more convenient as it also returns the raw prediction // 2. 
    //    if you want to export the model *and* get probabilities at scoring time, w/ link=logistic is preferable

    // convert raw prediction to probability (if needed)
    val probabilityUdf = if (vwArgs.getArgs.contains("--link logistic"))
      udf { (pred: Double) => Vectors.dense(Array(1 - pred, pred)) }
    else
      udf { (pred: Double) => {
        val prob = 1.0 / (1.0 + exp(-pred))
        Vectors.dense(Array(1 - prob, prob))
      } }

    val df2 = df.withColumn($(probabilityCol), probabilityUdf(col($(rawPredictionCol))))

    // convert probability to prediction
    val probability2predictionUdf = udf(probability2prediction _)
    df2.withColumn($(predictionCol), probability2predictionUdf(col($(probabilityCol))))
  }

  override def copy(extra: ParamMap): this.type = defaultCopy(extra)

  protected override def predictRaw(features: Row): Vector = {
    throw new NotImplementedError("Not implemented")
  }

  protected override def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    throw new NotImplementedError("Not implemented")
  }
}

object VowpalWabbitClassificationModel extends ComplexParamsReadable[VowpalWabbitClassificationModel]
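The probability step above is just a per-row sigmoid applied when the model emits raw margins. That piece in isolation, as a hedged sketch; the column names, data and object name are illustrative assumptions.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, udf}
import scala.math.exp

object SigmoidUdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("sigmoid-udf").getOrCreate()
    import spark.implicits._

    val df = Seq(-2.0, 0.0, 2.0).toDF("rawPrediction")

    // Map a raw margin to a two-class probability vector, as in the non-logistic-link branch above.
    val toProbability = udf { pred: Double =>
      val p = 1.0 / (1.0 + exp(-pred))
      Vectors.dense(1 - p, p)
    }

    df.withColumn("probability", toProbability(col("rawPrediction"))).show(false)
    spark.stop()
  }
}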
Example 46
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        } else {
          val idx1 = idx * fnvPrime
          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType
        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
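The interaction udf above receives several Vector columns packed into one struct and walks them as a Row. A much smaller sketch of that pattern, combining just two vectors feature-wise; the combination rule here is a plain outer product, not VW's FNV hashing, and the data and object name are illustrative assumptions.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.{col, struct, udf}

object VectorPairUdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("vector-pair-udf").getOrCreate()
    import spark.implicits._

    val df = Seq((Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0))).toDF("a", "b")

    // Pack the vector columns into a struct and read them back out of the Row inside the udf.
    val crossValues = udf { r: Row =>
      val a = r.getAs[Vector](0)
      val b = r.getAs[Vector](1)
      for (x <- a.toArray; y <- b.toArray) yield x * y
    }

    df.withColumn("crossed", crossValues(struct(col("a"), col("b")))).show(false)
    spark.stop()
  }
}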
Example 47
Source File: HTTPTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.http import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasOutputCol, Wrappable} import com.microsoft.ml.spark.io.http.HandlingUtils.HandlerFunc import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.concurrent.ExecutionContext import scala.concurrent.duration.Duration trait HasHandler extends Params { val handler: UDFParam = new UDFParam( this, "handler", "Which strategy to use when handling requests") override def transform(dataset: Dataset[_]): DataFrame = { val df = dataset.toDF() val enc = RowEncoder(transformSchema(df.schema)) val colIndex = df.schema.fieldNames.indexOf(getInputCol) val fromRow = HTTPRequestData.makeFromRowConverter val toRow = HTTPResponseData.makeToRowConverter df.mapPartitions { it => if (!it.hasNext) { Iterator() }else{ val c = clientHolder.get val responsesWithContext = c.sendRequestsWithContext(it.map{row => c.RequestWithContext(Option(row.getStruct(colIndex)).map(fromRow), Some(row)) }) responsesWithContext.map { rwc => Row.merge(rwc.context.get.asInstanceOf[Row], Row(rwc.response.flatMap(Option(_)).map(toRow).orNull)) } } }(enc) } def copy(extra: ParamMap): HTTPTransformer = defaultCopy(extra) def transformSchema(schema: StructType): StructType = { assert(schema(getInputCol).dataType == HTTPSchema.Request) schema.add(getOutputCol, HTTPSchema.Response, nullable=true) } }
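Unlike the other examples, this transformer does its work at partition level: `mapPartitions` with an explicit `RowEncoder` lets it batch HTTP calls and merge responses back onto each input row. The encoder wiring in isolation looks roughly like the sketch below, with a trivial uppercasing step standing in for the HTTP round trip; the columns, data and object name are illustrative assumptions.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.encoders.RowEncoder
import org.apache.spark.sql.types.StringType

object MapPartitionsEncoderSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("map-partitions-encoder").getOrCreate()
    import spark.implicits._

    val df = Seq("ping", "pong").toDF("request")
    val outSchema = df.schema.add("response", StringType, nullable = true)
    val enc = RowEncoder(outSchema)

    // Per-partition processing: in HTTPTransformer this is where requests are sent and answers merged back.
    val out = df.mapPartitions { it =>
      it.map(row => Row.merge(row, Row(row.getString(0).toUpperCase)))
    }(enc)

    out.show(false)
    spark.stop()
  }
}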
Example 48
Source File: SparkBindingsTest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.core.schema import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.spark.sql.Row import org.apache.spark.sql.functions.{col, udf} case class Foo(a: Int, b: String, c: Seq[Bar]) object Foo extends SparkBindings[Foo] case class Bar(a: Int, c: Seq[Byte]) object Bar extends SparkBindings[Bar] class SparkBindingsTest2 extends TestBase { import session.implicits._ test("Test to make sure there are no strange memory leaks") { (1 to 40).foreach { i => val foos = (0 to 40).map(i => Tuple1(Foo(i, i.toString, Seq(Bar(i, "foo".getBytes))))) val converter = Foo.makeFromRowConverter val df = foos.toDF("foos") .repartition(2) .withColumn("mapped2", udf({ r: Row => converter(r) }, Foo.schema)(col("foos"))) val results = df.collect().toList println(results.head) } } }
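The test exercises `udf(f, schema)` where the function consumes a struct column as a Row and the output schema comes from the SparkBindings companion rather than type inference. The same mechanics with a hand-written schema; the columns, data and object name are illustrative assumptions.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object StructUdfSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("struct-udf").getOrCreate()
    import spark.implicits._

    val df = Seq((1, "a"), (2, "b")).toDF("id", "name")

    val outSchema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType)))

    // Reshape the struct row by row; the output schema is declared rather than inferred.
    val relabel = udf({ r: Row => Row(r.getInt(0) + 100, r.getString(1).toUpperCase) }, outSchema)

    df.withColumn("mapped", relabel(struct(col("id"), col("name")))).show(false)
    spark.stop()
  }
}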
Example 49
Source File: ParserSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.io.split1 import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import com.microsoft.ml.spark.io.http._ import org.apache.http.client.methods.HttpPost import org.apache.spark.ml.Transformer import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, SparkSession} trait ParserUtils extends WithServer { def sampleDf(spark: SparkSession): DataFrame = { val df = spark.createDataFrame((1 to 10).map(Tuple1(_))) .toDF("data") val df2 = new JSONInputParser().setInputCol("data") .setOutputCol("parsedInput").setUrl(url) .transform(df) .withColumn("unparsedOutput", udf({ x: Int => HTTPResponseData( Array(), Some(EntityData( "{\"foo\": \"here\"}".getBytes, None, None, None, false, false, false)), StatusLineData(ProtocolVersionData("foo", 1, 1), 200, "bar"), "en") }).apply(col("data")) ) new JSONOutputParser() .setDataType(new StructType().add("foo", StringType)) .setInputCol("unparsedOutput") .setOutputCol("parsedOutput") .transform(df2) } def makeTestObject[T <: Transformer](t: T, session: SparkSession): Seq[TestObject[T]] = { Seq(new TestObject(t, sampleDf(session))) } } class JsonInputParserSuite extends TransformerFuzzing[JSONInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONInputParser]] = makeTestObject( new JSONInputParser().setInputCol("data").setOutputCol("out") .setUrl(url), session) override def reader: MLReadable[_] = JSONInputParser } class JsonOutputParserSuite extends TransformerFuzzing[JSONOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[JSONOutputParser]] = makeTestObject( new JSONOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setDataType(new StructType().add("foo", StringType)), session) override def reader: MLReadable[_] = JSONOutputParser } class StringOutputParserSuite extends TransformerFuzzing[StringOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[StringOutputParser]] = makeTestObject( new StringOutputParser().setInputCol("unparsedOutput").setOutputCol("out"), session) override def reader: MLReadable[_] = StringOutputParser } class CustomInputParserSuite extends TransformerFuzzing[CustomInputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomInputParser]] = makeTestObject( new CustomInputParser().setInputCol("data").setOutputCol("out") .setUDF({ x: Int => new HttpPost(s"http://$x") }), session) override def reader: MLReadable[_] = CustomInputParser } class CustomOutputParserSuite extends TransformerFuzzing[CustomOutputParser] with ParserUtils { override def testObjects(): Seq[TestObject[CustomOutputParser]] = makeTestObject( new CustomOutputParser().setInputCol("unparsedOutput").setOutputCol("out") .setUDF({ x: HTTPResponseData => x.locale }), session) override def reader: MLReadable[_] = CustomOutputParser }
Example 50
Source File: HashingTF.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} @Since("2.0.0") def setBinary(value: Boolean): this.type = set(binary, value) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)).setBinary($(binary)) // TODO: Make the hashingTF.transform natively in ml framework to avoid extra conversion. val t = udf { terms: Seq[_] => hashingTF.transform(terms).asML } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } @Since("1.4.0") override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } @Since("1.4.1") override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
Example 51
Source File: SchemaColumnSelection.scala From data-faker with MIT License | 5 votes |
package com.dunnhumby.datafaker.schema.table.columns import scala.reflect.runtime.universe.TypeTag import java.sql.{Date, Timestamp} import com.dunnhumby.datafaker.YamlParser.YamlParserProtocol import org.apache.spark.sql.Column import org.apache.spark.sql.functions.{rand, udf} case class SchemaColumnSelection[T](override val name: String, values: List[T])(implicit tag: TypeTag[T]) extends SchemaColumn { override def column(rowID: Option[Column] = None): Column = { val intToSelectionUDF = udf((index: Int) => { values(index) }) intToSelectionUDF(rand() * values.length % values.length) } } object SchemaColumnSelectionProtocol extends SchemaColumnSelectionProtocol trait SchemaColumnSelectionProtocol extends YamlParserProtocol { import net.jcazevedo.moultingyaml._ implicit object SchemaColumnSelectionFormat extends YamlFormat[SchemaColumnSelection[_]] { override def read(yaml: YamlValue): SchemaColumnSelection[_] = { val fields = yaml.asYamlObject.fields val YamlString(dataType) = fields.getOrElse(YamlString("data_type"), deserializationError("data_type not set")) val YamlString(name) = fields.getOrElse(YamlString("name"), deserializationError("name not set")) val values = fields.getOrElse(YamlString("values"), deserializationError("selection values not set")) dataType match { case SchemaColumnDataType.Int => SchemaColumnSelection(name, values.convertTo[List[Int]]) case SchemaColumnDataType.Long => SchemaColumnSelection(name, values.convertTo[List[Long]]) case SchemaColumnDataType.Float => SchemaColumnSelection(name, values.convertTo[List[Float]]) case SchemaColumnDataType.Double => SchemaColumnSelection(name, values.convertTo[List[Double]]) case SchemaColumnDataType.Date => SchemaColumnSelection(name, values.convertTo[List[Date]]) case SchemaColumnDataType.Timestamp => SchemaColumnSelection(name, values.convertTo[List[Timestamp]]) case SchemaColumnDataType.String => SchemaColumnSelection(name, values.convertTo[List[String]]) case _ => deserializationError(s"unsupported data_type: $dataType for ${SchemaColumnType.Selection}") } } override def write(obj: SchemaColumnSelection[_]): YamlValue = ??? } }
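The column generator above drives a selection udf with `rand()`: a random index is produced per row and the udf looks the value up in the configured list. The core trick in isolation, as a hedged sketch; the value list, column names and object name are illustrative assumptions.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{floor, rand, udf}

object RandomSelectionSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("random-selection").getOrCreate()
    import spark.implicits._

    val values = List("red", "green", "blue")
    val df = spark.range(5).toDF("rowID")

    // rand() is uniform on [0, 1); scaling and flooring gives a valid index into the list.
    val pick = udf { index: Int => values(index) }

    df.withColumn("colour", pick(floor(rand() * values.length).cast("int"))).show()
    spark.stop()
  }
}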