org.apache.spark.sql.types.FloatType Scala Examples
The following examples show how to use org.apache.spark.sql.types.FloatType.
The original project, source file, and license are noted above each example.
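As a warm-up before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the column name and values are made up) that declares a FloatType column in a schema and casts an existing column to FloatType:

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.{FloatType, StructField, StructType}

object FloatTypeBasics {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("FloatTypeBasics").master("local[2]").getOrCreate()

    // Declare a single-precision column explicitly in the schema.
    val schema = StructType(Seq(StructField("price", FloatType, nullable = false)))
    val rows = java.util.Arrays.asList(Row(1.5f), Row(2.25f))
    val df = spark.createDataFrame(rows, schema)

    // Casting an existing column works the same way.
    val casted = df.withColumn("price_f", col("price").cast(FloatType))
    casted.printSchema() // both price and price_f are float

    spark.stop()
  }
}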
Example 1
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Note: this excerpt starts mid-class; the RegressionEvaluator class declaration
// and its earlier members (metricName param, other setters) are omitted.

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
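Because evaluate first checks the prediction column against Seq(DoubleType, FloatType) and then casts everything to DoubleType, predictions stored in single precision are accepted as-is. The following usage sketch is not part of the drizzle-spark source; the data and column names are invented for illustration:

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.FloatType

val spark = SparkSession.builder.master("local[2]").appName("regEvalFloat").getOrCreate()
import spark.implicits._

// A prediction column kept in single precision still passes checkColumnTypes(Seq(DoubleType, FloatType)).
val scored = Seq((2.0, 2.5), (3.0, 2.75), (5.0, 4.5))
  .toDF("label", "prediction")
  .withColumn("prediction", col("prediction").cast(FloatType))

val rmse = new RegressionEvaluator()
  .setMetricName("rmse")
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .evaluate(scored)

println(s"rmse = $rmse")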
Example 2
Source File: ExtremeSummarizerSpec.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries.summarize.summarizer

import com.twosigma.flint.rdd.function.summarize.summarizer.Summarizer
import com.twosigma.flint.timeseries.row.Schema
import com.twosigma.flint.timeseries.summarize.{ SummarizerFactory, SummarizerSuite }
import com.twosigma.flint.timeseries.{ CSV, Summarizers, TimeSeriesRDD, TimeSeriesSuite }
import org.apache.spark.sql.types.{ DataType, DoubleType, FloatType, IntegerType, LongType, StructType }
import java.util.Random
import org.apache.spark.sql.Row

class ExtremeSummarizerSpec extends SummarizerSuite {

  override val defaultResourceDir: String = "/timeseries/summarize/summarizer/meansummarizer"

  private def test[T](
    dataType: DataType,
    randValue: Row => Any,
    summarizer: String => SummarizerFactory,
    reduceFn: (T, T) => T,
    inputColumn: String,
    outputColumn: String
  ): Unit = {
    val priceTSRdd = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType)).addColumns(
      inputColumn -> dataType -> randValue
    )

    val data = priceTSRdd.collect().map { row => row.getAs[T](inputColumn) }
    val trueExtreme = data.reduceLeft[T] { case (x, y) => reduceFn(x, y) }

    val result = priceTSRdd.summarize(summarizer(inputColumn))

    val extreme = result.first().getAs[T](outputColumn)
    val outputType = result.schema(outputColumn).dataType

    assert(outputType == dataType, s"$outputType")
    assert(trueExtreme === extreme, s"extreme: $extreme, trueExtreme: $trueExtreme, data: ${data.toSeq}")
  }

  "MaxSummarizer" should "compute double max correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute long max correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute float max correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.max, math.max, "x", "x_max")
  }

  it should "compute int max correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.max, math.max, "x", "x_max")
  }

  "MinSummarizer" should "compute double min correctly" in {
    val rand = new Random()
    test[Double](DoubleType, { _: Row => rand.nextDouble() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute long min correctly" in {
    val rand = new Random()
    test[Long](LongType, { _: Row => rand.nextLong() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute float min correctly" in {
    val rand = new Random()
    test[Float](FloatType, { _: Row => rand.nextFloat() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "compute int min correctly" in {
    val rand = new Random()
    test[Int](IntegerType, { _: Row => rand.nextInt() }, Summarizers.min, math.min, "x", "x_min")
  }

  it should "pass summarizer property test" in {
    summarizerPropertyTest(AllProperties)(Summarizers.max("x1"))
    summarizerPropertyTest(AllProperties)(Summarizers.min("x2"))
  }

  it should "ignore null values" in {
    val input = fromCSV("Price.csv", Schema("id" -> IntegerType, "price" -> DoubleType))
    val inputWithNull = insertNullRows(input, "price")
    assertEquals(
      input.summarize(Summarizers.min("price")),
      inputWithNull.summarize(Summarizers.min("price"))
    )
  }
}
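The key assertion in this spec is that the summarizer's output column keeps the input FloatType instead of being widened to double. The same behaviour can be observed with plain Spark SQL aggregates; this standalone sketch (data invented, independent of the flint test harness) checks it for max and min:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, max, min}
import org.apache.spark.sql.types.FloatType

val spark = SparkSession.builder.master("local[2]").appName("floatExtremes").getOrCreate()
import spark.implicits._

val df = Seq(1.5f, -0.25f, 3.75f).toDF("x")

// max and min keep the input type, so the result columns are still FloatType.
val extremes = df.agg(max(col("x")).as("x_max"), min(col("x")).as("x_min"))
assert(extremes.schema("x_max").dataType == FloatType)
assert(extremes.schema("x_min").dataType == FloatType)
extremes.show()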
Example 3
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException

class HttpStreamServerClientTest {
  val ROWS1 = Array(
    Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
    Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
    Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte))

  val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world"))

  @Test
  def testHttpStreamIO() {
    // starts an http server
    val kryoSerializer = new KryoSerializer(new SparkConf())
    val server = HttpStreamServer.start("/xxxx", 8080)

    val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
      .getOrCreate()
    spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/")
    val sqlContext = spark.sqlContext
    import spark.implicits._

    // adds a local message buffer to the server, with 2 topics registered
    server.withBuffer()
      .addListener(new StreamPrinter())
      .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
      .createTopic[String]("topic-2")

    val client = HttpStreamClient.connect("http://localhost:8080/xxxx")

    // tests the schema of the topics
    val schema1 = client.fetchSchema("topic-1")
    Assert.assertArrayEquals(
      Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
      schema1.fields.map(_.dataType).asInstanceOf[Array[Object]])

    val schema2 = client.fetchSchema("topic-2")
    Assert.assertArrayEquals(
      Array[Object](StringType),
      schema2.fields.map(_.dataType).asInstanceOf[Array[Object]])

    // prepares to consume messages
    val sid1 = client.subscribe("topic-1")._1
    val sid2 = client.subscribe("topic-2")._1

    // produces some data
    client.sendRows("topic-1", 1, ROWS1)

    val sid4 = client.subscribe("topic-1")._1
    val sid5 = client.subscribe("topic-2")._1

    client.sendRows("topic-2", 1, ROWS2)

    // consumes data
    val fetched = client.fetchStream(sid1).map(_.originalRow)
    Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]])
    // it is empty now
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]])
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]])
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]])
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]])
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]])

    client.unsubscribe(sid4)
    try {
      client.fetchStream(sid4)
      // an exception should be thrown, because the subscriber id is invalidated
      Assert.assertTrue(false)
    } catch {
      case e: Throwable =>
        e.printStackTrace()
        Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass)
    }

    server.stop()
  }
}
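For topic-1, the tuple type (String, Int, Boolean, Float, Double, Long, Byte) is what produces the FloatType field asserted above. Spelled out as an explicit StructType it would look roughly like the sketch below; the field names are placeholders (assumed, not taken from spark-http-stream), since only the sequence of data types matters to the assertion:

import org.apache.spark.sql.types._

// Roughly the schema fetchSchema("topic-1") is expected to return.
val topic1Schema = StructType(Seq(
  StructField("_1", StringType),
  StructField("_2", IntegerType),
  StructField("_3", BooleanType),
  StructField("_4", FloatType),
  StructField("_5", DoubleType),
  StructField("_6", LongType),
  StructField("_7", ByteType)))

println(topic1Schema.fields.map(_.dataType).mkString(", "))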
Example 4
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Note: this excerpt starts mid-class; the RegressionEvaluator class declaration
// and its earlier members (metricName param, other setters) are omitted.

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 5
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}

trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] =
    sparkContext.broadcast {
      StructType(
        Seq(
          StructField($(groupByCol).get, FloatType),
          StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))))
    }

  protected def xGBoostEvaluation(
      row: Row,
      model: Booster,
      broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
      id: G,
      parameters: ParamMap): ModelParamEvaluation[G] = {
    val featuresArray = row
      .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME)
      .map { vec =>
        val values = vec.toArray.map(DataTransformer.toFloat)
        LabeledPoint(values.head, null, values.tail)
      }
    val features = new DMatrix(featuresArray.toIterator)
    log.warn(s"Evaluating forecast for id $id, with xgboost")
    val prediction = model.predict(features).flatten
    val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length)
    val toBeValidated = featuresArray.zip(forecastToBeValidated)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f => (f._1.label.toDouble, f._2.toDouble)))
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
}
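Two FloatType-related details carry this example: the groupBy column is declared as FloatType in the broadcast training schema, and every vector value is squeezed to Float via DataTransformer.toFloat before being handed to xgboost4j's LabeledPoint. The helper below is a hypothetical stand-in for that kind of conversion (it is not uberdata's implementation), shown only to illustrate the coercion involved:

// Hypothetical stand-in for a toFloat conversion: coerce common numeric
// (and boolean) values to the Float that xgboost4j's LabeledPoint expects.
def anyToFloat(value: Any): Float = value match {
  case f: Float   => f
  case d: Double  => d.toFloat
  case i: Int     => i.toFloat
  case l: Long    => l.toFloat
  case b: Boolean => if (b) 1f else 0f
  case other      => throw new IllegalArgumentException(s"Cannot convert $other to Float")
}

println(anyToFloat(3.14)) // prints 3.14
println(anyToFloat(42))   // prints 42.0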
Example 6
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Note: this excerpt starts mid-class; the RegressionEvaluator class declaration
// and its earlier members (metricName param, other setters) are omitted.

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 7
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Note: this excerpt starts mid-class; the RegressionEvaluator class declaration
// and its earlier members (metricName param, other setters) are omitted.

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))
    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
Example 8
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Note: this excerpt starts mid-class; the RegressionEvaluator class declaration
// and its earlier members (metricName param, other setters) are omitted.

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("1.4.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    val predictionColName = $(predictionCol)
    val predictionType = schema($(predictionCol)).dataType
    require(predictionType == FloatType || predictionType == DoubleType,
      s"Prediction column $predictionColName must be of type float or double, " +
        s" but not $predictionType")
    val labelColName = $(labelCol)
    val labelType = schema($(labelCol)).dataType
    require(labelType == FloatType || labelType == DoubleType,
      s"Label column $labelColName must be of type float or double, but not $labelType")

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
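Unlike the newer versions above, this variant validates the column types by hand with require rather than SchemaUtils.checkColumnTypes. A stripped-down version of that guard, usable on any DataFrame (a sketch, not BigDatalog code), looks like this:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{DoubleType, FloatType}

// Standalone version of the manual guard above: accept a column only if it is
// single- or double-precision.
def requireFloatOrDouble(df: DataFrame, colName: String): Unit = {
  val dt = df.schema(colName).dataType
  require(dt == FloatType || dt == DoubleType,
    s"Column $colName must be of type float or double, but not $dt")
}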
Example 9
Source File: package.scala From spark-iqmulus with Apache License 2.0
package fr.ign.spark.iqmulus

import org.apache.spark.sql.{ SQLContext, DataFrameReader, DataFrameWriter, DataFrame, Row }
import org.apache.spark.sql.types.{ FloatType, StructType }

package object xyz {

  implicit class XyzDataFrameReader(reader: DataFrameReader) {
    def xyz: String => DataFrame = reader.format("fr.ign.spark.iqmulus.xyz").load
  }

  implicit class XyzDataFrame(df: DataFrame) {
    def saveAsXyz(location: String) = {
      val df_id = df.drop("id")
      require(df_id.schema.fieldNames.take(3) sameElements Array("x", "y", "z"))
      require(df_id.schema.fields.map(_.dataType).take(3).forall(_ == FloatType))
      val saver = (key: Int, iter: Iterator[Row]) => Iterator(iter.saveXyz(s"$location/$key.xyz"))
      df_id.rdd.mapPartitionsWithIndex(saver, true).collect
    }
  }

  implicit class XyzRowIterator(iter: Iterator[Row]) {
    def saveXyz(filename: String) = {
      val path = new org.apache.hadoop.fs.Path(filename)
      val fs = path.getFileSystem(new org.apache.hadoop.conf.Configuration)
      val f = fs.create(path)
      val dos = new java.io.DataOutputStream(f)
      var count = 0L
      iter.foreach(row => { count += 1; dos.writeBytes(row.mkString("", "\t", "\n")) })
      dos.close
      (filename, count)
    }
  }
}
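saveAsXyz refuses anything whose first three fields (after dropping id) are not x, y, z of type FloatType, so callers typically cast before saving. Below is a usage sketch built on the implicits above; the paths are placeholders and the input is assumed to expose numeric x, y, z columns:

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.FloatType
import fr.ign.spark.iqmulus.xyz._

val spark = SparkSession.builder.master("local[2]").appName("xyzDemo").getOrCreate()

// Hypothetical input: any DataFrame exposing numeric x, y, z columns.
val points = spark.read.parquet("/path/to/points.parquet")
  .select(
    col("x").cast(FloatType).as("x"),
    col("y").cast(FloatType).as("y"),
    col("z").cast(FloatType).as("z"))

// One .xyz file is written per partition under the target directory.
points.saveAsXyz("/path/to/output")

// Reading is symmetric: the XyzDataFrameReader implicit adds an xyz method to spark.read.
val loaded = spark.read.xyz("/path/to/some.xyz")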
Example 10
Source File: DatasetUtil.scala From sona with Apache License 2.0
package org.apache.spark.util

import org.apache.spark.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, DoubleType, FloatType, Metadata}
import org.apache.spark.sql.{Column, DataFrame, Dataset}

object DatasetUtil {

  def withColumns[T](ds: Dataset[T],
                     colNames: Seq[String],
                     cols: Seq[Column],
                     metadata: Seq[Metadata]): DataFrame = {
    require(colNames.size == cols.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of columns: ${cols.size}")
    require(colNames.size == metadata.size,
      s"The size of column names: ${colNames.size} isn't equal to " +
        s"the size of metadata elements: ${metadata.size}")

    val sparkSession = ds.sparkSession
    val queryExecution = ds.queryExecution
    val resolver = sparkSession.sessionState.analyzer.resolver
    val output = queryExecution.analyzed.output

    checkColumnNameDuplication(colNames,
      "in given column names",
      sparkSession.sessionState.conf.caseSensitiveAnalysis)

    val columnMap = colNames.zip(cols).zip(metadata).map { case ((colName: String, col: Column), metadata: Metadata) =>
      colName -> col.as(colName, metadata)
    }.toMap

    val replacedAndExistingColumns = output.map { field =>
      columnMap.find { case (colName, _) =>
        resolver(field.name, colName)
      } match {
        case Some((colName: String, col: Column)) => col.as(colName)
        case _ => new Column(field)
      }
    }

    val newColumns = columnMap.filter { case (colName, col) =>
      !output.exists(f => resolver(f.name, colName))
    }.map { case (colName, col) => col.as(colName) }

    ds.select(replacedAndExistingColumns ++ newColumns: _*)
  }

  def withColumn[T](ds: Dataset[T], colName: String, col: Column, metadata: Metadata): DataFrame = {
    withColumns(ds, Seq(colName), Seq(col), Seq(metadata))
  }

  private def checkColumnNameDuplication(columnNames: Seq[String],
                                         colType: String,
                                         caseSensitiveAnalysis: Boolean): Unit = {
    val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
    if (names.distinct.length != names.length) {
      val duplicateColumns = names.groupBy(identity).collect {
        case (x, ys) if ys.length > 1 => s"`$x`"
      }
      throw new Exception(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
    }
  }

  /**
   * Cast a column in a Dataset to Vector type.
   *
   * The supported data types of the input column are
   * - Vector
   * - float/double type Array.
   *
   * Note: The returned column does not have Metadata.
   *
   * @param dataset input DataFrame
   * @param colName column name.
   * @return Vector column
   */
  def columnToVector(dataset: Dataset[_], colName: String): Column = {
    val columnDataType = dataset.schema(colName).dataType
    columnDataType match {
      case _: VectorUDT => col(colName)
      case fdt: ArrayType =>
        val transferUDF = fdt.elementType match {
          case _: FloatType => udf(f = (vector: Seq[Float]) => {
            val inputArray = Array.fill[Double](vector.size)(0.0)
            vector.indices.foreach(idx => inputArray(idx) = vector(idx).toDouble)
            Vectors.dense(inputArray)
          })
          case _: DoubleType => udf((vector: Seq[Double]) => {
            Vectors.dense(vector.toArray)
          })
          case other => throw new IllegalArgumentException(s"Array[$other] column cannot be cast to Vector")
        }
        transferUDF(col(colName))
      case other => throw new IllegalArgumentException(s"$other column cannot be cast to Vector")
    }
  }
}
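columnToVector is the piece that lets an Array[Float] column feed code expecting ML vectors: each float element is widened to Double and packed into a dense vector. A usage sketch with invented data; it assumes the sona artifact is on the classpath, since DatasetUtil and the linalg classes above live in its forked org.apache.spark namespaces:

import org.apache.spark.sql.SparkSession
import org.apache.spark.util.DatasetUtil

val spark = SparkSession.builder.master("local[2]").appName("toVector").getOrCreate()
import spark.implicits._

// An Array[Float] column: each element is widened to Double inside the dense vector.
val df = Seq(
  (1L, Array(0.1f, 0.2f, 0.3f)),
  (2L, Array(0.4f, 0.5f, 0.6f))
).toDF("id", "features")

val withVectors = df.withColumn("features_vec", DatasetUtil.columnToVector(df, "features"))
withVectors.printSchema()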
Example 11
Source File: RegressionEvaluator.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.evaluation

import com.tencent.angel.sona.ml.evaluation.evaluating.RegressionSummaryImpl
import com.tencent.angel.sona.ml.param.{Param, ParamMap, ParamValidators}
import com.tencent.angel.sona.ml.param.shared.{HasLabelCol, HasPredictionCol}
import com.tencent.angel.sona.ml.util._
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, FloatType}
import org.apache.spark.sql.util.SONASchemaUtils

/**
 * :: Experimental ::
 * Evaluator for regression, which expects two input columns: prediction and label.
 */
final class RegressionEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("regEval"))

  /**
   * Param for metric name in evaluation. Supports:
   *  - `"rmse"` (default): root mean squared error
   *  - `"mse"`: mean squared error
   *  - `"r2"`: R^2^ metric
   *  - `"mae"`: mean absolute error
   *
   * @group param
   */
  val metricName: Param[String] = {
    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams)
  }

  def getMetricName: String = $(metricName)

  def setMetricName(value: String): this.type = set(metricName, value)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SONASchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SONASchemaUtils.checkNumericType(schema, $(labelCol))

    val summary = new RegressionSummaryImpl(dataset.toDF(), $(predictionCol), $(labelCol))
    val metrics = summary.regMetrics

    val metric = $(metricName) match {
      case "rmse" => summary.rmse
      case "mse" => summary.mse
      case "r2" => summary.r2
      case "mae" => summary.absDiff
    }
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {
  override def load(path: String): RegressionEvaluator = super.load(path)
}