org.apache.spark.ml.param.ParamMap Scala Examples
The following examples show how to use org.apache.spark.ml.param.ParamMap.
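Before diving into the examples: a ParamMap is an immutable map from Param[T] keys to values, used to supply or override parameter settings when fitting, copying, or tuning a pipeline stage. As a minimal sketch of the core API (assuming a plain Spark LogisticRegression and a trainingDF that are not part of the examples below):

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

val lr = new LogisticRegression()

// Build a ParamMap explicitly; put() adds or overrides entries.
val overrides = ParamMap(lr.maxIter -> 30)
  .put(lr.regParam -> 0.1)

// Passing the map to fit() applies the overrides for that call only,
// without mutating the estimator's own parameter values.
// val model = lr.fit(trainingDF, overrides)  // trainingDF is assumed to exist

// extractParamMap() returns the stage's current settings merged with its defaults.
println(lr.extractParamMap())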
Example 1
Source File: MNISTBenchmark.scala From spark-knn with Apache License 2.0 | 6 votes |
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable

object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]) {
    val ns = if (args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if (args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if (args.length >= 3) args(2).toInt else 10
    val models = if (args.length >= 4) args(3).split(',') else Array("tree", "naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    // read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset = MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() // force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if (models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if (models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame =
    dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
}
Example 2
Source File: CompareParamGrid.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl

import org.apache.spark.ml.param.ParamMap
import org.scalatest.{Assertions, Matchers}

trait CompareParamGrid extends Matchers with Assertions {

  def gridCompare(g1: Array[ParamMap], g2: Array[ParamMap]): Unit = {
    val g1values = g1.toSet[ParamMap].map(_.toSeq.toSet)
    val g2values = g2.toSet[ParamMap].map(_.toSeq.toSet)
    matchTwoSets(g1values, g2values)
  }

  private def matchTwoSets[T](actual: Set[T], expected: Set[T]): Unit = {
    def stringify(set: Set[T]): String = {
      val list = set.toList
      val chunk = list take 10
      val strings = chunk.map(_.toString).sorted
      // append an ellipsis only when the set was truncated to its first 10 elements
      if (list.size > chunk.size) strings.mkString(", ") + ", ..." else strings.mkString(", ")
    }
    val missing = stringify(expected -- actual)
    val extra = stringify(actual -- expected)
    withClue(s"Missing:\n $missing\nExtra:\n$extra") {
      actual shouldBe expected
    }
  }
}
Example 3
Source File: NameAssigner.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}

class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*): this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols)

    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))
      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if (x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
}
Example 4
Source File: URLElimminator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, Params} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} def setInputCol(value: String): this.type = set(inputCol, value) def this() = this(Identifiable.randomUID("URLEliminator")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputCol) != $(outputCol)) { schema.add($(outputCol), StringType) } else { schema } } } object URLElimminator extends DefaultParamsReadable[URLElimminator] { override def load(path: String): URLElimminator = super.load(path) }
Example 5
Source File: RegexpReplaceTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{StringType, StructType} def setInputCol(value: String): this.type = set(inputCol, value) def this() = this(Identifiable.randomUID("RegexpReplaceTransformer")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputCol) equals $(outputCol)) { val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol))) SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType) } else { SchemaUtils.appendColumn(schema, $(outputCol), StringType) } } } object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] { override def load(path: String): RegexpReplaceTransformer = super.load(path) }
Example 6
Source File: NGramExtractor.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{ArrayType, StringType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1)) override def transform(dataset: Dataset[_]): DataFrame = { val lowerBound = $(lowerN) val upperBound = $(upperN) val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound)) dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputCol) != $(outputCol)) { schema.add($(outputCol), new ArrayType(StringType, true)) } else { schema } } } object NGramExtractor extends DefaultParamsReadable[NGramExtractor] { override def load(path: String): NGramExtractor = super.load(path) }
Example 7
Source File: LanguageDetectorTransformer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import com.google.common.base.Optional import com.optimaize.langdetect.LanguageDetector import com.optimaize.langdetect.i18n.LdLocale import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import scala.collection.Map def setOutputCol(value: String): this.type = set(outputCol, value) def this() = this(Identifiable.randomUID("languageDetector")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), StringType) } @transient object languageDetectorWrapped extends Serializable { val languageDetector: LanguageDetector = LanguageDetectorUtils.buildLanguageDetector( LanguageDetectorUtils.readListLangsBuiltIn(), $(minimalConfidence), $(languagePriors).toMap) } }
Example 8
Source File: LanguageAwareAnalyzer.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import org.apache.lucene.analysis.util.StopwordAnalyzerBase import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.HasOutputCol import org.apache.spark.ml.param.{Param, ParamMap, Params} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{ArrayType, StringType, StructType} def setOutputCol(value: String): this.type = set(outputCol, value) override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } def this() = this(Identifiable.randomUID("languageAnalyzer")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF } @DeveloperApi override def transformSchema(schema: StructType): StructType = { if ($(inputColText) equals $(outputCol)) { val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText))) SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true)) } else { SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true)) } } } object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] { override def load(path: String): LanguageAwareAnalyzer = super.load(path) }
Example 9
Source File: ParamGridBuilderSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m => // m is a ParamMap, e.g. (10, "input0"), (10, "input1")
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    // addGrid registers the parameters whose best values we want to search over;
    // ParamGridBuilder builds the candidate combinations (e.g. logistic regression's regParam)
    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    // expected values
    val expected0 = mutable.Set(
      (10, "input0"), (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20)) // overrides the baseOn value
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1"))
    validateGrid(maps1, expected1)
  }
}
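The Array[ParamMap] built this way is what Spark's tuning APIs consume. A minimal sketch of wiring such a grid into CrossValidator, assuming a LogisticRegression estimator and a binary-labelled trainingDF that are not part of the test above:

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LogisticRegression()

// Each ParamMap in the grid is one candidate parameter combination.
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .addGrid(lr.maxIter, Array(10, 50))
  .build()

val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(new BinaryClassificationEvaluator())
  .setEstimatorParamMaps(grid) // Array[ParamMap]
  .setNumFolds(3)
// val cvModel = cv.fit(trainingDF) // trainingDF is assumed to exist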
Example 10
Source File: RegressionEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

// the default metric is root mean squared error
setDefault(metricName -> "rmse")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
    .map { case Row(prediction: Double, label: Double) => (prediction, label) }
  val metrics = new RegressionMetrics(predictionAndLabels)
  val metric = $(metricName) match {
    case "rmse" => metrics.rootMeanSquaredError // root mean squared error
    case "mse" => metrics.meanSquaredError // mean squared error
    case "r2" => metrics.r2
    case "mae" => metrics.meanAbsoluteError // mean absolute error
  }
  metric
}

override def isLargerBetter: Boolean = $(metricName) match {
  case "rmse" => false // root mean squared error
  case "mse" => false // mean squared error
  case "r2" => true // coefficient of determination
  case "mae" => false // mean absolute error
}

override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
Example 11
Source File: MulticlassClassificationEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType

def setLabelCol(value: String): this.type = set(labelCol, value)

// F1-Measure is a combined metric derived from both precision and recall
setDefault(metricName -> "f1")

override def evaluate(dataset: DataFrame): Double = {
  val schema = dataset.schema
  SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
  SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)
  val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
    .map { case Row(prediction: Double, label: Double) => (prediction, label) }
  val metrics = new MulticlassMetrics(predictionAndLabels)
  val metric = $(metricName) match {
    case "f1" => metrics.weightedFMeasure // combined precision/recall metric
    case "precision" => metrics.precision // precision
    case "recall" => metrics.recall // recall
    case "weightedPrecision" => metrics.weightedPrecision // weighted precision
    case "weightedRecall" => metrics.weightedRecall // weighted recall
  }
  metric
}

override def isLargerBetter: Boolean = $(metricName) match {
  case "f1" => true // combined precision/recall metric
  case "precision" => true // precision
  case "recall" => true // recall
  case "weightedPrecision" => true // weighted precision
  case "weightedRecall" => true // weighted recall
}

override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
Example 12
Source File: HashingTF.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}

def setNumFeatures(value: Int): this.type = set(numFeatures, value)

override def transform(dataset: DataFrame): DataFrame = {
  val outputSchema = transformSchema(dataset.schema)
  val hashingTF = new feature.HashingTF($(numFeatures))
  val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
  val metadata = outputSchema($(outputCol)).metadata
  dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
}

override def transformSchema(schema: StructType): StructType = {
  val inputType = schema($(inputCol)).dataType
  require(inputType.isInstanceOf[ArrayType],
    s"The input column must be ArrayType, but got $inputType.")
  val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
  SchemaUtils.appendColumn(schema, attrGroup.toStructField())
}

override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
Example 13
Source File: OpRegressionEvaluatorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.evaluators import com.salesforce.op.features.types._ import com.salesforce.op.stages.impl.classification.OpLogisticRegression import com.salesforce.op.stages.impl.regression.{OpLinearRegression, RegressionModelSelector} import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class OpRegressionEvaluatorTest extends FlatSpec with TestSparkContext { val (ds, rawLabel, features) = TestFeatureBuilder[RealNN, OPVector]( Seq( (10.0, Vectors.dense(1.0, 4.3, 1.3)), (20.0, Vectors.dense(2.0, 0.3, 0.1)), (30.0, Vectors.dense(3.0, 3.9, 4.3)), (40.0, Vectors.dense(4.0, 1.3, 0.9)), (50.0, Vectors.dense(5.0, 4.7, 1.3)), (10.0, Vectors.dense(1.0, 4.3, 1.3)), (20.0, Vectors.dense(2.0, 0.3, 0.1)), (30.0, Vectors.dense(3.0, 3.9, 4.3)), (40.0, Vectors.dense(4.0, 1.3, 0.9)), (50.0, Vectors.dense(5.0, 4.7, 1.3)) ).map(v => v._1.toRealNN -> v._2.toOPVector) ) val label = rawLabel.copy(isResponse = true) val lr = new OpLogisticRegression() val lrParams = new ParamGridBuilder().addGrid(lr.regParam, Array(0.0)).build() val testEstimator = RegressionModelSelector.withTrainValidationSplit(dataSplitter = None, trainRatio = 0.5, modelsAndParameters = Seq(lr -> lrParams)) .setInput(label, features) val prediction = testEstimator.getOutput() val testEvaluator = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction) val testEstimator2 = new OpLinearRegression().setInput(label, features) val prediction2 = testEstimator2.getOutput() val testEvaluator2 = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction2) Spec[OpRegressionEvaluator] should "copy" in { val testEvaluatorCopy = testEvaluator.copy(ParamMap()) testEvaluatorCopy.uid shouldBe testEvaluator.uid } it should "evaluate the metrics from a model selector" in { val model = testEstimator.fit(ds) val transformedData = model.setInput(label, features).transform(ds) val metrics = testEvaluator.evaluateAll(transformedData).toMetadata() assert(metrics.getDouble(RegressionEvalMetrics.RootMeanSquaredError.toString) <= 1E-12, "rmse should be close to 0") assert(metrics.getDouble(RegressionEvalMetrics.MeanSquaredError.toString) <= 1E-24, "mse should be close to 0") assert(metrics.getDouble(RegressionEvalMetrics.R2.toString) == 1.0, "R2 should equal 1.0") assert(metrics.getDouble(RegressionEvalMetrics.MeanAbsoluteError.toString) <= 1E-12, "mae should be close to 0") } it should "evaluate the metrics from a single model" in { val model = testEstimator2.fit(ds) val transformedData = model.setInput(label, features).transform(ds) val metrics = testEvaluator2.evaluateAll(transformedData).toMetadata() assert(metrics.getDouble(RegressionEvalMetrics.RootMeanSquaredError.toString) <= 1E-12, "rmse should be close to 0") assert(metrics.getDouble(RegressionEvalMetrics.MeanSquaredError.toString) <= 1E-24, "mse should be close to 0") assert(metrics.getDouble(RegressionEvalMetrics.R2.toString) == 1.0, "R2 should equal 1.0") assert(metrics.getDouble(RegressionEvalMetrics.MeanAbsoluteError.toString) <= 1E-12, "mae should be close to 0") } }
Example 14
Source File: RegressionEvaluator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl class RegressionEvaluator(override val uid: String) extends Evaluator[RegressionEvaluator](uid) { val throughOrigin = new BooleanParam(this, "throughOrigin", "True if the regression is through the origin. For example, in " + "linear regression, it will be true without fitting intercept.") def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value) def getThroughOrigin: Boolean = $(throughOrigin) def this() = this(Identifiable.randomUID("regressionEvaluator")) override def transform(dataset: Dataset[_]): DataFrame = { try { val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol)) .rdd.map { case Row(score: Double, label: Double) => (score, label) } val metrics = Try(new RegressionMetrics(predictions)) val rows = metrics.toOption.map(m => Seq( "r2" -> m.r2, "rmse" -> m.rootMeanSquaredError, "explainedVariance" -> m.explainedVariance, "meanAbsoluteError" -> m.meanAbsoluteError, "meanSquaredError" -> m.meanSquaredError ).map(Row.fromTuple)).getOrElse(Seq()) SparkSqlUtils.reflectionLock.synchronized( dataset.sqlContext.createDataFrame( dataset.sparkSession.sparkContext.parallelize(rows, 1), transformSchema(dataset.schema))) } catch { // Most probably evaluation dataset is empty case e: Exception => logWarning("Failed to calculate metrics due to " + e.getMessage) SparkSqlUtils.reflectionLock.synchronized( dataset.sqlContext.createDataFrame( dataset.sparkSession.sparkContext.emptyRDD[Row], transformSchema(dataset.schema))) } } override def copy(extra: ParamMap): RegressionEvaluator = { copyValues(new RegressionEvaluator(), extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { new StructType() .add("metric", StringType, nullable = false) .add("value", DoubleType, nullable = false) } }
Example 15
Source File: RichParamMap.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.spark

import com.salesforce.op.features.TransientFeature
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.types.StructType

object RichParamMap {

  // `params` refers to the wrapped ParamMap; the enclosing wrapper declaration is omitted in this excerpt
  def getAsMap(): Map[String, Any] = {
    val mapped = params.toSeq.map(pp => pp.param.name -> pp.value).toMap
    mapped.map {
      case (k, v: Array[_]) =>
        if (v.headOption.exists(_.isInstanceOf[TransientFeature])) {
          k -> v.map(_.asInstanceOf[TransientFeature].toJsonString()).toList
        } else k -> v.toList
      case (k, v: StructType) => k -> v.toString()
      case (k, v: PipelineStage) => k -> v.getClass.getName
      case (k, Some(v: PipelineStage)) => k -> v.getClass.getName
      case (k, v) => k -> v
    }
  }
}
Example 16
Source File: ModelSelectorFactory.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.selector import com.salesforce.op.evaluators.{EvaluationMetrics, OpEvaluatorBase} import com.salesforce.op.stages.impl.ModelsToTry import com.salesforce.op.stages.impl.selector.ModelSelectorNames.{EstimatorType, ModelType} import com.salesforce.op.stages.impl.tuning.{OpValidator, Splitter, ValidatorParamDefaults} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.ParamGridBuilder import scala.concurrent.duration.Duration protected def selector( validator: OpValidator[ModelType, EstimatorType], splitter: Option[Splitter], trainTestEvaluators: Seq[OpEvaluatorBase[_ <: EvaluationMetrics]], modelTypesToUse: Seq[ModelsToTry], modelsAndParameters: Seq[(EstimatorType, Array[ParamMap])], modelDefaults: ModelDefaults[_ <: ModelsToTry] ): ModelSelector[ModelType, EstimatorType] = { val modelTypeNames = modelTypesToUse.map(_.entryName).toSet val modelsToUse = { // if no models are specified use the defaults and filter by the named models to use if (modelsAndParameters.isEmpty) { modelDefaults.modelsAndParams .collect { case (e, grid) if modelTypeNames(e.getClass.getSimpleName) => e -> grid.build() } } // if models to use has been specified and the models have been specified - filter the models by the names else if (modelTypesToUse.toSet != modelDefaults.modelTypesToUse.toSet) { modelsAndParameters.filter { case (e, p) => modelTypeNames(e.getClass.getSimpleName) } } // else just use the specified models else modelsAndParameters } new ModelSelector( validator = validator, splitter = splitter, models = modelsToUse, evaluators = trainTestEvaluators ) } }
Example 17
Source File: OpTransformerWrapper.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryTransformer
import org.apache.spark.ml.SparkMLSharedParamConstants
import org.apache.spark.ml.SparkMLSharedParamConstants.InOutTransformer
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag

class OpTransformerWrapper[I <: FeatureType, O <: FeatureType, T <: InOutTransformer]
(
  val transformer: T,
  uid: String = UID[OpTransformerWrapper[I, O, T]]
)(
  implicit tti: TypeTag[I],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwUnaryTransformer[I, O, T](
  inputParamName = SparkMLSharedParamConstants.InputColName,
  outputParamName = SparkMLSharedParamConstants.OutputColName,
  operationName = transformer.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying transformer outside the wrapper
  sparkMlStageIn = Option(transformer).map(_.copy(ParamMap.empty).asInstanceOf[T]),
  uid = uid
)
Example 18
Source File: OpEstimatorWrapper.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryEstimator
import org.apache.spark.ml._
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag

class OpEstimatorWrapper[I <: FeatureType, O <: FeatureType, E <: Estimator[M], M <: Model[M]]
(
  val estimator: E,
  uid: String = UID[OpEstimatorWrapper[I, O, E, M]]
)(
  implicit tti: TypeTag[I],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwUnaryEstimator[I, O, M, E](
  inputParamName = SparkMLSharedParamConstants.InputColName,
  outputParamName = SparkMLSharedParamConstants.OutputColName,
  operationName = estimator.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying classifier outside the wrapper
  sparkMlStageIn = Option(estimator).map(_.copy(ParamMap.empty).asInstanceOf[E]),
  uid = uid
)
Example 19
Source File: OpBinaryEstimatorWrapper.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwBinaryEstimator
import org.apache.spark.ml._
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag

class OpBinaryEstimatorWrapper[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType,
E <: Estimator[M], M <: Model[M]]
(
  val estimator: E,
  uid: String = UID[OpBinaryEstimatorWrapper[I1, I2, O, E, M]]
)(implicit
  tti1: TypeTag[I1],
  tti2: TypeTag[I2],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwBinaryEstimator[I1, I2, O, M, E](
  inputParam1Name = SparkMLSharedParamConstants.LabelColName,
  inputParam2Name = SparkMLSharedParamConstants.FeaturesColName,
  outputParamName = SparkMLSharedParamConstants.PredictionColName,
  operationName = estimator.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying classifier outside the wrapper
  sparkMlStageIn = Option(estimator).map(_.copy(ParamMap.empty).asInstanceOf[E]),
  uid = uid
) {
  final protected def getSparkStage: E = getSparkMlStage().get
}
Example 20
Source File: UnaryEstimatorTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.base.unary import com.salesforce.op.UID import com.salesforce.op.features.Feature import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType} import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] { val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal) } class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator]) extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) { def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = { val grouped = dataset.groupBy() val maxVal = grouped.max().first().getDouble(0) val minVal = grouped.min().first().getDouble(0) new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid) } } final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String) extends UnaryModel[Real, Real](operationName = operationName, uid = uid) { def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal }
Example 21
Source File: EstimatorWrapper.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.StructType import org.apache.spark.{ml, sql} import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperables.{Transformer, Estimator} import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import ai.deepsense.sparkutils.ML class EstimatorWrapper( executionContext: ExecutionContext, estimator: Estimator[Transformer]) extends ML.Estimator[TransformerWrapper] { override def fitDF(dataset: sql.DataFrame): TransformerWrapper = { new TransformerWrapper( executionContext, estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))) } override def copy(extra: ParamMap): EstimatorWrapper = { val params = ParamTransformer.transform(extra) val estimatorCopy = estimator.replicate().set(params: _*) new EstimatorWrapper(executionContext, estimatorCopy) } override def transformSchema(schema: StructType): StructType = { schema } override lazy val params: Array[ml.param.Param[_]] = { estimator.params.map(new ParamWrapper(uid, _)) } override val uid: String = Identifiable.randomUID("EstimatorWrapper") }
Example 22
Source File: EvaluatorWrapper.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.evaluation import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperables.Evaluator import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import ai.deepsense.sparkutils.ML class EvaluatorWrapper( context: ExecutionContext, evaluator: Evaluator) extends ML.Evaluator { override def evaluateDF(dataset: sql.DataFrame): Double = { evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value } override def copy(extra: ParamMap): evaluation.Evaluator = { val params = ParamTransformer.transform(extra) val evaluatorCopy = evaluator.replicate().set(params: _*) new EvaluatorWrapper(context, evaluatorCopy) } override lazy val params: Array[Param[_]] = { evaluator.params.map(new ParamWrapper(uid, _)) } override def isLargerBetter: Boolean = evaluator.isLargerBetter override val uid: String = Identifiable.randomUID("EvaluatorWrapper") }
Example 23
Source File: TransformerWrapper.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql import org.apache.spark.sql.types.StructType import ai.deepsense.deeplang.ExecutionContext import ai.deepsense.deeplang.doperables.Transformer import ai.deepsense.deeplang.doperables.dataframe.DataFrame import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import ai.deepsense.sparkutils.ML class TransformerWrapper( executionContext: ExecutionContext, transformer: Transformer) extends ML.Model[TransformerWrapper] { override def copy(extra: ParamMap): TransformerWrapper = { val params = ParamTransformer.transform(extra) val transformerCopy = transformer.replicate().set(params: _*) new TransformerWrapper(executionContext, transformerCopy) } override def transformDF(dataset: sql.DataFrame): sql.DataFrame = { transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())) .sparkDataFrame } override def transformSchema(schema: StructType): StructType = { transformer._transformSchema(schema).get } override lazy val params: Array[Param[_]] = { transformer.params.map(new ParamWrapper(uid, _)) } override val uid: String = Identifiable.randomUID("TransformerWrapper") }
Example 24
Source File: SerializableSparkModel.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.serialization import org.apache.spark.ml.Model import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import ai.deepsense.sparkutils.ML class SerializableSparkModel[M <: Model[M]](val sparkModel: M) extends ML.Model[SerializableSparkModel[M]] with MLWritable { override def copy(extra: ParamMap): SerializableSparkModel[M] = new SerializableSparkModel(sparkModel.copy(extra)) override def write: MLWriter = { sparkModel match { case w: MLWritable => w.write case _ => new DefaultMLWriter(this) } } override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset) override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema) override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae" } // This class may seem unused, but it is used reflectively by spark deserialization mechanism object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] { override def read: MLReader[SerializableSparkModel[_]] = { new DefaultMLReader[SerializableSparkModel[_]]() } }
Example 25
Source File: SerializableSparkEstimator.scala From seahorse with Apache License 2.0 | 5 votes |
package ai.deepsense.deeplang.doperables.serialization import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.sql import org.apache.spark.sql.types.StructType import ai.deepsense.sparkutils.ML class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E) extends ML.Estimator[SerializableSparkModel[T]] with MLWritable { override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0" override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = { val result: T = sparkEstimator.fit(dataset) new SerializableSparkModel[T](result) } override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] = new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E]) override def write: MLWriter = new DefaultMLWriter(this) override def transformSchema(schema: StructType): StructType = sparkEstimator.transformSchema(schema) }
Example 26
Source File: ParamGridBuilderSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"), (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1"))
    validateGrid(maps1, expected1)
  }
}
Example 27
Source File: LogisticRegressionRecommender.scala From wordpress-posts-recommender with Apache License 2.0 | 5 votes |
package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
    .put(lr.regParam -> 0.01)
    .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row =>
        row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
      .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
}
Example 28
Source File: EstimatorWrapper.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.StructType import org.apache.spark.{ml, sql} import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperables.{Transformer, Estimator} import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import io.deepsense.sparkutils.ML class EstimatorWrapper( executionContext: ExecutionContext, estimator: Estimator[Transformer]) extends ML.Estimator[TransformerWrapper] { override def fitDF(dataset: sql.DataFrame): TransformerWrapper = { new TransformerWrapper( executionContext, estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))) } override def copy(extra: ParamMap): EstimatorWrapper = { val params = ParamTransformer.transform(extra) val estimatorCopy = estimator.replicate().set(params: _*) new EstimatorWrapper(executionContext, estimatorCopy) } override def transformSchema(schema: StructType): StructType = { schema } override lazy val params: Array[ml.param.Param[_]] = { estimator.params.map(new ParamWrapper(uid, _)) } override val uid: String = Identifiable.randomUID("EstimatorWrapper") }
Example 29
Source File: EvaluatorWrapper.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.evaluation import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperables.Evaluator import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import io.deepsense.sparkutils.ML class EvaluatorWrapper( context: ExecutionContext, evaluator: Evaluator) extends ML.Evaluator { override def evaluateDF(dataset: sql.DataFrame): Double = { evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value } override def copy(extra: ParamMap): evaluation.Evaluator = { val params = ParamTransformer.transform(extra) val evaluatorCopy = evaluator.replicate().set(params: _*) new EvaluatorWrapper(context, evaluatorCopy) } override lazy val params: Array[Param[_]] = { evaluator.params.map(new ParamWrapper(uid, _)) } override def isLargerBetter: Boolean = evaluator.isLargerBetter override val uid: String = Identifiable.randomUID("EvaluatorWrapper") }
Example 30
Source File: TransformerWrapper.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.wrappers import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql import org.apache.spark.sql.types.StructType import io.deepsense.deeplang.ExecutionContext import io.deepsense.deeplang.doperables.Transformer import io.deepsense.deeplang.doperables.dataframe.DataFrame import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper import io.deepsense.sparkutils.ML class TransformerWrapper( executionContext: ExecutionContext, transformer: Transformer) extends ML.Model[TransformerWrapper] { override def copy(extra: ParamMap): TransformerWrapper = { val params = ParamTransformer.transform(extra) val transformerCopy = transformer.replicate().set(params: _*) new TransformerWrapper(executionContext, transformerCopy) } override def transformDF(dataset: sql.DataFrame): sql.DataFrame = { transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())) .sparkDataFrame } override def transformSchema(schema: StructType): StructType = { transformer._transformSchema(schema).get } override lazy val params: Array[Param[_]] = { transformer.params.map(new ParamWrapper(uid, _)) } override val uid: String = Identifiable.randomUID("TransformerWrapper") }
Example 31
Source File: SerializableSparkModel.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.serialization import org.apache.spark.ml.Model import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.DataFrame import org.apache.spark.sql.types.StructType import io.deepsense.sparkutils.ML class SerializableSparkModel[M <: Model[M]](val sparkModel: M) extends ML.Model[SerializableSparkModel[M]] with MLWritable { override def copy(extra: ParamMap): SerializableSparkModel[M] = new SerializableSparkModel(sparkModel.copy(extra)) override def write: MLWriter = { sparkModel match { case w: MLWritable => w.write case _ => new DefaultMLWriter(this) } } override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset) override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema) override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae" } // This class may seem unused, but it is used reflectively by spark deserialization mechanism object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] { override def read: MLReader[SerializableSparkModel[_]] = { new DefaultMLReader[SerializableSparkModel[_]]() } }
Example 32
Source File: SerializableSparkEstimator.scala From seahorse-workflow-executor with Apache License 2.0 | 5 votes |
package io.deepsense.deeplang.doperables.serialization import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model} import org.apache.spark.sql import org.apache.spark.sql.types.StructType import io.deepsense.sparkutils.ML class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E) extends ML.Estimator[SerializableSparkModel[T]] with MLWritable { override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0" override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = { val result: T = sparkEstimator.fit(dataset) new SerializableSparkModel[T](result) } override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] = new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E]) override def write: MLWriter = new DefaultMLWriter(this) override def transformSchema(schema: StructType): StructType = sparkEstimator.transformSchema(schema) }
Example 33
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuilder def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val schema = dataset.schema val assembleFunc = udf { r: Row => SimpleVectorAssembler.assemble(r.toSeq: _*) } val args = $(inputCols).map { c => schema(c).dataType match { case DoubleType => dataset(c) case _: VectorUDT => dataset(c) case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") } } dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol))) } override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) val inputDataTypes = inputColNames.map(name => schema(name).dataType) inputDataTypes.foreach { case _: NumericType | BooleanType => case t if t.isInstanceOf[VectorUDT] => case other => throw new IllegalArgumentException(s"Data type $other is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) } override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra) } object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] { override def load(path: String): SimpleVectorAssembler = super.load(path) def assemble(vv: Any*): Vector = { val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case null => // TODO: output Double.NaN? throw new SparkException("Values to assemble cannot be null.") case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } Vectors.sparse(cur, indices.result(), values.result()).compressed } }
Example 34
Source File: UserRepoTransformer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ class UserRepoTransformer(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("userRepoTransformer")) } val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) override def transformSchema(schema: StructType): StructType = { $(inputCols).foreach((inputColName: String) => { require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.") }) val newFields: Array[StructField] = Array( StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false), StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false) ) StructType(schema.fields ++ newFields) } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) import dataset.sparkSession.implicits._ dataset .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages")) } override def copy(extra: ParamMap): UserRepoTransformer = { defaultCopy(extra) } } object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer]
Example 35
Source File: HanLPTokenizer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import java.util import com.hankcs.hanlp.HanLP import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary import com.hankcs.hanlp.seg.common.Term import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.{BooleanParam, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types._ import scala.collection.JavaConverters._ class HanLPTokenizer(override val uid: String) extends UnaryTransformer[String, Seq[String], HanLPTokenizer] with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("hanLPTokenizer")) } val shouldRemoveStopWords = new BooleanParam(this, "shouldRemoveStopWords", "Whether to remove stop words") def getShouldRemoveStopWords: Boolean = $(shouldRemoveStopWords) def setShouldRemoveStopWords(value: Boolean): this.type = set(shouldRemoveStopWords, value) setDefault(shouldRemoveStopWords -> true) override def createTransformFunc: String => Seq[String] = { originStr => HanLP.Config.ShowTermNature = false HanLP.Config.Normalization = false val segment = HanLP.newSegment() val termList: util.List[Term] = segment.seg(HanLP.convertToSimplifiedChinese(originStr.toLowerCase)) if ($(shouldRemoveStopWords)) { CoreStopWordDictionary.apply(termList) } val LanguageRE = """(c|r|c\+\+|c#|f#)""".r val OneCharExceptCJKRE = """([^\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}])""".r termList .asScala .flatMap((term: Term) => { val word = term.word word match { case LanguageRE(language) => Array(language) case OneCharExceptCJKRE(_) => Array.empty[String] case _ => """([\w\.\-_\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}]+)""".r.findAllIn(word).toList } }) } override def validateInputType(inputType: DataType): Unit = { require(inputType == StringType, s"Input type must be string type but got $inputType.") } override def outputDataType: DataType = { new ArrayType(StringType, false) } override def copy(extra: ParamMap): HanLPTokenizer = { defaultCopy(extra) } } object HanLPTokenizer extends DefaultParamsReadable[HanLPTokenizer]
Example 36
Source File: SnowballStemmer.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.types.{ArrayType, DataType, StringType} import org.tartarus.snowball.ext.EnglishStemmer class SnowballStemmer(override val uid: String) extends UnaryTransformer[Seq[String], Seq[String], SnowballStemmer] with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("snowballStemmer")) } override def createTransformFunc: Seq[String] => Seq[String] = { strings => val stemmer = new EnglishStemmer() strings.map((str: String) => { try { stemmer.setCurrent(str) stemmer.stem() stemmer.getCurrent() } catch { case _: Exception => str } }) } override def validateInputType(inputType: DataType): Unit = { require(inputType == ArrayType(StringType), s"Input type must be string type but got $inputType.") } override def outputDataType: DataType = { ArrayType(StringType) } override def copy(extra: ParamMap): SnowballStemmer = { defaultCopy(extra) } } object SnowballStemmer extends DefaultParamsReadable[SnowballStemmer]
Example 37
Source File: RankingMetricFormatter.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} import ws.vinta.albedo.closures.UDFs._ import ws.vinta.albedo.evaluators.RankingEvaluator._ class RankingMetricFormatter(override val uid: String, val sourceType: String) extends Transformer with DefaultParamsWritable { def this(sourceType: String) = { this(Identifiable.randomUID("rankingMetricFormatter"), sourceType) } val userCol = new Param[String](this, "userCol", "User column name") def getUserCol: String = $(userCol) def setUserCol(value: String): this.type = set(userCol, value) setDefault(userCol -> "user") val itemCol = new Param[String](this, "itemCol", "Item column name") def getItemCol: String = $(itemCol) def setItemCol(value: String): this.type = set(itemCol, value) setDefault(itemCol -> "item") val predictionCol = new Param[String](this, "predictionCol", "Prediction column name") def getPredictionCol: String = $(predictionCol) def setPredictionCol(value: String): this.type = set(predictionCol, value) setDefault(predictionCol -> "prediction") val topK = new IntParam(this, "topK", "Recommend top-k items for every user") def getTopK: Int = $(topK) def setTopK(value: Int): this.type = set(topK, value) setDefault(topK -> 15) override def transformSchema(schema: StructType): StructType = { Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType) .foreach{ case(columnName: String, expectedDataType: DataType) => { val actualDataType = schema(columnName).dataType require(actualDataType.equals(expectedDataType), s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.") } } schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) sourceType match { case "als" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK))) case "lr" => dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK))) } } override def copy(extra: ParamMap): RankingMetricFormatter = { val copied = new RankingMetricFormatter(uid, sourceType) copyValues(copied, extra) } } object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter]
Example 38
Source File: IntermediateCacher.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.transformers import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{ParamMap, StringArrayParam} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} class IntermediateCacher(override val uid: String) extends Transformer with DefaultParamsWritable { def this() = { this(Identifiable.randomUID("intermediateCacher")) } val inputCols = new StringArrayParam(this, "inputCols", "Input column names") def getInputCols: Array[String] = $(inputCols) def setInputCols(value: Array[String]): this.type = set(inputCols, value) setDefault(inputCols -> Array.empty[String]) override def transformSchema(schema: StructType): StructType = { schema } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema) val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*) intermediateDF.cache() } override def copy(extra: ParamMap): IntermediateCacher = { defaultCopy(extra) } } object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher]
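The cacher is meant to sit between pipeline stages so columns produced by expensive upstream stages are materialized once. A sketch under the assumption that featureStages is an existing Array[PipelineStage] and lr is the downstream estimator; the column names are illustrative.

import org.apache.spark.ml.Pipeline
import ws.vinta.albedo.transformers.IntermediateCacher

// keep only the columns the downstream estimator needs, and cache them
val cacher = new IntermediateCacher()
  .setInputCols(Array("user_id", "repo_id", "features", "label"))

val pipeline = new Pipeline()
  .setStages(featureStages ++ Array(cacher, lr))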
Example 39
Source File: RegressionEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.Experimental import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.DoubleType def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => -metrics.rootMeanSquaredError case "mse" => -metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => -metrics.meanAbsoluteError } metric } override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) }
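Note that this Spark 1.4-era variant negates rmse, mse and mae so that a larger value is always better during model selection. A usage sketch, assuming a predictions DataFrame with "prediction" and "label" columns, the standard two-argument Evaluator.evaluate overload, and the setPredictionCol/metricName members of the full class that the excerpt above truncates.

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.param.ParamMap

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")

// default metric "rmse" (returned negated by this implementation)
val negRmse = evaluator.evaluate(predictions)

// override the metric for a single call via a ParamMap
val r2 = evaluator.evaluate(predictions, ParamMap(evaluator.metricName -> "r2"))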
Example 40
Source File: ParamGridBuilderSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning import scala.collection.mutable import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.{ParamMap, TestParams} class ParamGridBuilderSuite extends SparkFunSuite { val solver = new TestParams() import solver.{inputCol, maxIter} test("param grid builder") { def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = { assert(maps.size === expected.size) maps.foreach { m => val tuple = (m(maxIter), m(inputCol)) assert(expected.contains(tuple)) expected.remove(tuple) } assert(expected.isEmpty) } val maps0 = new ParamGridBuilder() .baseOn(maxIter -> 10) .addGrid(inputCol, Array("input0", "input1")) .build() val expected0 = mutable.Set( (10, "input0"), (10, "input1")) validateGrid(maps0, expected0) val maps1 = new ParamGridBuilder() .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten .addGrid(maxIter, Array(10, 20)) .addGrid(inputCol, Array("input0", "input1")) .build() val expected1 = mutable.Set( (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1")) validateGrid(maps1, expected1) } }
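Outside of the test, a grid produced by ParamGridBuilder can be handed straight to an estimator, which then trains one model per ParamMap. A short sketch with a stock LogisticRegression; the training DataFrame is assumed to have the usual "label" and "features" columns.

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.tuning.ParamGridBuilder

val lr = new LogisticRegression()

val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .addGrid(lr.maxIter, Array(10, 50))
  .build()

// one fitted model per ParamMap in the grid
val models = lr.fit(training, grid)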
Example 41
Source File: RegressionEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("1.4.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema val predictionColName = $(predictionCol) val predictionType = schema($(predictionCol)).dataType require(predictionType == FloatType || predictionType == DoubleType, s"Prediction column $predictionColName must be of type float or double, " + s" but not $predictionType") val labelColName = $(labelCol) val labelType = schema($(labelCol)).dataType require(labelType == FloatType || labelType == DoubleType, s"Label column $labelColName must be of type float or double, but not $labelType") val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 42
Source File: MulticlassClassificationEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Row, DataFrame} import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("1.5.0") override def evaluate(dataset: DataFrame): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType) val predictionAndLabels = dataset.select($(predictionCol), $(labelCol)) .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "precision" => metrics.precision case "recall" => metrics.recall case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall } metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "f1" => true case "precision" => true case "recall" => true case "weightedPrecision" => true case "weightedRecall" => true } @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 43
Source File: SQLTransformer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkContext import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.param.{ParamMap, Param} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{SQLContext, DataFrame, Row} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("1.6.0") override def transform(dataset: DataFrame): DataFrame = { val tableName = Identifiable.randomUID(uid) dataset.registerTempTable(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val outputDF = dataset.sqlContext.sql(realStatement) outputDF } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val sc = SparkContext.getOrCreate() val sqlContext = SQLContext.getOrCreate(sc) val dummyRDD = sc.parallelize(Seq(Row.empty)) val dummyDF = sqlContext.createDataFrame(dummyRDD, schema) dummyDF.registerTempTable(tableIdentifier) val outputSchema = sqlContext.sql($(statement)).schema outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
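A typical statement uses the __THIS__ placeholder to stand in for the input DataFrame; setStatement belongs to the full class, which the excerpt above truncates. The column names v1/v2 and the input df are assumptions.

import org.apache.spark.ml.feature.SQLTransformer

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

val transformed = sqlTrans.transform(df)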
Example 44
Source File: HashingTF.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.{Since, Experimental} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) } @Since("1.6.0") object HashingTF extends DefaultParamsReadable[HashingTF] { @Since("1.6.0") override def load(path: String): HashingTF = super.load(path) }
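Usage mirrors the other feature transformers: pick the input/output columns and a hash-table size. The column names, the 2^18 table size and the input df are illustrative.

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}

val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("rawFeatures")
  .setNumFeatures(1 << 18)

val featurized = hashingTF.transform(tokenizer.transform(df))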
Example 45
Source File: PredictorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasWeightCol import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and weights, and not support other types") { val df = spark.createDataFrame(Seq( (0, 1, Vectors.dense(0, 2, 3)), (1, 2, Vectors.dense(0, 3, 9)), (0, 3, Vectors.dense(0, 2, 6)) )).toDF("label", "weight", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor().setWeightCol("weight") types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol { def this() = this(Identifiable.randomUID("mockpredictor")) def setWeightCol(value: String): this.type = set(weightCol, value) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) require(dataset.schema("weight").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 46
Source File: ParamGridBuilderSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning import scala.collection.mutable import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.{ParamMap, TestParams} class ParamGridBuilderSuite extends SparkFunSuite { val solver = new TestParams() import solver.{inputCol, maxIter} test("param grid builder") { def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = { assert(maps.size === expected.size) maps.foreach { m => val tuple = (m(maxIter), m(inputCol)) assert(expected.contains(tuple)) expected.remove(tuple) } assert(expected.isEmpty) } val maps0 = new ParamGridBuilder() .baseOn(maxIter -> 10) .addGrid(inputCol, Array("input0", "input1")) .build() val expected0 = mutable.Set( (10, "input0"), (10, "input1")) validateGrid(maps0, expected0) val maps1 = new ParamGridBuilder() .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten .addGrid(maxIter, Array(10, 20)) .addGrid(inputCol, Array("input0", "input1")) .build() val expected1 = mutable.Set( (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1")) validateGrid(maps1, expected1) } }
Example 47
Source File: RegressionEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 48
Source File: MulticlassClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
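This newer variant adds the "accuracy" metric. A common pattern is to score a fitted classifier's output; predictions below is assumed to be the result of model.transform(test) with "label" and "prediction" columns.

import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator

val evaluator = new MulticlassClassificationEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("accuracy")

val accuracy = evaluator.evaluate(predictions)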
Example 49
Source File: SQLTransformer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset. dataset.sparkSession.sessionState.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
Example 50
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl import odkl.analysis.spark.util.collection.OpenHashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.odkl.SparkSqlUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, functions} class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable { val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.") def setValueCol(value: String) : this.type = set(valueCol, value) setDefault(valueCol -> "value") def this() = this(Identifiable.randomUID("vectorExplode")) override def transform(dataset: Dataset[_]): DataFrame = { val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT]) val resultSchema = StructType(Seq( StructField($(valueCol), StringType, nullable = false)) ++ vectors.map(f => StructField(f.name, DoubleType, nullable = true)) ) val arraySize = resultSchema.size - 1 val names: Array[Map[Int, String]] = vectors.map( f => { AttributeGroup.fromStructField(f).attributes .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap) .getOrElse(Map()) }) val maxCapacity = names.map(_.size).max val explodeVectors : (Row => Array[Row]) = (r: Row ) => { val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity) for(i <- 0 until r.length) { val vector = r.getAs[Vector](i) vector.foreachActive((index, value) => { val name = names(i).getOrElse(index, s"${vectors(i).name}_$index") accumulator.changeValue( name, Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN}, v => {v(i) = value; v}) }) } accumulator.map(x => new GenericRowWithSchema( (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray, resultSchema)).toArray } val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*) val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType))) val expression = functions.explode(explodeUDF(vectorsStruct)) dataset .withColumn(uid, expression) .select( dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++ resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(schema.fields.map(x => x.dataType match { case vector: VectorUDT => StructField(x.name, typeFromVector(x)) case _ => x } )) def typeFromVector(field: StructField): StructType = { val attributes = AttributeGroup.fromStructField(field) StructType(attributes.attributes .map(_.map(a => a.name.getOrElse(s"_${a.index.get}"))) .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" }) .map(name => StructField(name, DoubleType, nullable = false))) } }
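A hypothetical way to use it: explode every vector column of a metrics DataFrame into one row per named vector element, which is convenient for SQL aggregation or plotting. metricsDF and the value-column name are assumptions; the vector columns are expected to carry ML attribute metadata so elements get readable names.

import org.apache.spark.ml.odkl.VectorExplode

val explode = new VectorExplode().setValueCol("metric")

// one output row per vector element, with one double column per source vector
val longFormDF = explode.transform(metricsDF)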
Example 51
Source File: CrossValidation.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mlpipeline import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel} import org.apache.spark.ml.{Model, Pipeline, PipelineStage} import org.apache.spark.sql._ @throws(classOf[IllegalArgumentException]) protected def apply( trainDf: DataFrame, stages: Array[PipelineStage], grid: Array[ParamMap] ): CrossValidatorModel = { require(stages.size > 0, "Cannot cross-validate pipeline without stages") require(grid.size > 0, "Cannot cross-validate with undefined grid") val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator)) new CrossValidator() .setEstimator(pipeline) .setEstimatorParamMaps(grid) .setEvaluator(new BinaryClassificationEvaluator) .setNumFolds(numFolds) .fit(trainDf) } protected def evaluate( trainDf: DataFrame, stages: Array[PipelineStage], grid: Array[ParamMap] ): Evaluator = this(trainDf, stages, grid).getEvaluator }
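The helper above is a thin wrapper over the stock CrossValidator; the equivalent direct call looks like the sketch below. The assembler, the logistic-regression stage, the fold count and trainDF are assumptions for illustration.

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val assembler = new VectorAssembler()
  .setInputCols(Array("f1", "f2", "f3"))
  .setOutputCol("features")
val lr = new LogisticRegression()

val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1, 1.0))
  .addGrid(lr.maxIter, Array(20, 50))
  .build()

val cv = new CrossValidator()
  .setEstimator(new Pipeline().setStages(Array(assembler, lr)))
  .setEstimatorParamMaps(grid)
  .setEvaluator(new BinaryClassificationEvaluator)
  .setNumFolds(3)

val cvModel = cv.fit(trainDF)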
Example 52
Source File: ArimaBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

class ArimaBestModel[L, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(L, M)],
  val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])]
) extends Model[ArimaBestModel[L, M]] with TimeSeriesBestModelFinderParam[L] {

  //TODO evaluate whether this is still needed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): ArimaBestModel[L, M] = {
    val copied = new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
}
Example 53
Source File: XGBoostBigModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberXGBoostModel import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint} import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)]) extends ForecastBaseModel[XGBoostBigModel[I]] with HasLabelCol with HasIdCol { def setLabelcol(label: String): this.type = set(labelCol, label) def setIdcol(id: String): this.type = set(idCol, id) override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) ) } .join(prediction) .map { case (id, (features, predictValue)) => Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } protected def predict(dataSet: Dataset[_]) = { val features = dataSet.rdd.map { case (row: Row) => val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME) val id = row.getAs[I]($(idCol)) SparkLabeledPoint(DataTransformer.toFloat(id), features) }.cache val (_, model) = models.head UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(getPredictionSchema) protected def getPredictionSchema: Array[StructField] = { Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) } }
Example 54
Source File: TimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} import scala.reflect.ClassTag def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataSet: Dataset[_]): DataFrame = { val rdd = dataSet.rdd val sparkContext = dataSet.sqlContext.sparkContext val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get)) val labelColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get)) val featuresColIndex = sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol))) val grouped = rdd.map { case (row: Row) => val timeColRow = IUberdataForecastUtil.convertColumnToLong(row, index.value) convertColumnToDouble(timeColRow, featuresColIndex) }.groupBy { row => row.getAs[L](labelColIndex.value) }.map { case (key, values) => val toBeUsed = values.toArray.sortBy(row => row.getAs[Long](index.value)) (key, toBeUsed) } val toBeTrained = grouped.map { case (key, values) => org.apache.spark.sql.Row( key, Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value))) ) } val trainSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema) } override def transformSchema(schema: StructType): StructType = { val labelIndex = schema.fieldIndex($(groupByCol).get) StructType( Seq( schema.fields(labelIndex), StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT) ) ) } override def copy(extra: ParamMap): TimeSeriesGenerator[L] = defaultCopy(extra) } object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] { override def load(path: String): TimeSeriesGenerator[_] = super.load(path) }
Example 55
Source File: XGBoostBaseBestModel.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.{Booster, DMatrix} import ml.dmlc.xgboost4j.LabeledPoint import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.Row import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType} trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]] extends BestModelFinder[G, M] with HasGroupByCol { protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast { StructType( Seq( StructField($(groupByCol).get, FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT)))) } protected def xGBoostEvaluation(row: Row, model: Booster, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]], id: G, parameters: ParamMap): ModelParamEvaluation[G] = { val featuresArray = row .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME) .map { vec => val values = vec.toArray.map(DataTransformer.toFloat) LabeledPoint(values.head, null, values.tail) } val features = new DMatrix(featuresArray.toIterator) log.warn(s"Evaluating forecast for id $id, with xgboost") val prediction = model.predict(features).flatten val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length) val toBeValidated = featuresArray.zip(forecastToBeValidated) val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f => (f._1.label.toDouble, f._2.toDouble))) val metricName = broadcastEvaluator.value.getMetricName new ModelParamEvaluation[G]( id, metric, parameters, Some(metricName), SupportedAlgorithm.XGBoostAlgorithm) } }
Example 56
Source File: XGBoost.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import eleflow.uberdata.models.UberXGBOOSTModel import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType} import scala.reflect.ClassTag class XGBoost[I](override val uid: String, val models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))])( implicit kt: ClassTag[I], ord: Ordering[I] = null) extends ForecastBaseModel[XGBoostSmallModel[I]] with HasInputCol with HasOutputCol with DefaultParamsWritable with HasFeaturesCol with HasNFutures with HasGroupByCol { def this( models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))] )(implicit kt: ClassTag[I], ord: Ordering[I] ) = this(Identifiable.randomUID("xgboost"), models) override def transform(dataSet: Dataset[_]): DataFrame = { val schema = dataSet.schema val predSchema = transformSchema(schema) val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)}) val predictions = joined.map { case (id, ((bestModel, metrics), row)) => val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]]( IUberdataForecastUtil.FEATURES_COL_NAME ) val label = DataTransformer.toFloat(row.getAs($(featuresCol))) val labelPoint = features.map { vec => val array = vec.toArray.map(_.toFloat) LabeledPoint(label, null, array) } val matrix = new DMatrix(labelPoint.toIterator) val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance .predict(matrix) .flatMap(_.map(_.toDouble)) .splitAt(features.length) Row( row.toSeq :+ Vectors .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _* ) } dataSet.sqlContext.createDataFrame(predictions, predSchema) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra) }
Example 57
Source File: PredictorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and not support other types") { val df = spark.createDataFrame(Seq( (0, Vectors.dense(0, 2, 3)), (1, Vectors.dense(0, 3, 9)), (0, Vectors.dense(0, 2, 6)) )).toDF("label", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor() types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictor")) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 58
Source File: ParamGridBuilderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning import scala.collection.mutable import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.{ParamMap, TestParams} class ParamGridBuilderSuite extends SparkFunSuite { val solver = new TestParams() import solver.{inputCol, maxIter} test("param grid builder") { def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = { assert(maps.size === expected.size) maps.foreach { m => val tuple = (m(maxIter), m(inputCol)) assert(expected.contains(tuple)) expected.remove(tuple) } assert(expected.isEmpty) } val maps0 = new ParamGridBuilder() .baseOn(maxIter -> 10) .addGrid(inputCol, Array("input0", "input1")) .build() val expected0 = mutable.Set( (10, "input0"), (10, "input1")) validateGrid(maps0, expected0) val maps1 = new ParamGridBuilder() .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten .addGrid(maxIter, Array(10, 20)) .addGrid(inputCol, Array("input0", "input1")) .build() val expected1 = mutable.Set( (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1")) validateGrid(maps1, expected1) } }
Example 59
Source File: RegressionEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 60
Source File: MulticlassClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 61
Source File: SQLTransformer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
Example 62
Source File: GaussianProcessRegression.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import breeze.linalg.{DenseVector => BDV, _} import org.apache.spark.internal.Logging import org.apache.spark.ml.commons._ import org.apache.spark.ml.commons.kernel.Kernel import org.apache.spark.ml.commons.util._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, Instrumentation} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset class GaussianProcessRegression(override val uid: String) extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with GaussianProcessParams with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging { def this() = this(Identifiable.randomUID("gaussProcessReg")) override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = { val instr = Instrumentation.create(this, dataset) val points: RDD[LabeledPoint] = getPoints(dataset).cache() val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache() val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient) expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters)) produceModel(instr, points, expertLabelsAndKernels, optimalHyperparameters) } private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = { val (y: BDV[Double], kernel : Kernel) = yAndK kernel.setHyperparameters(x) val (k, derivative) = kernel.trainingKernelAndDerivative() val (_, logdet, kinv) = logDetAndInv(k) val alpha = kinv * y val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet val alphaAlphaTMinusKinv = alpha * alpha.t alphaAlphaTMinusKinv -= kinv val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv)) (likelihood, BDV(gradient:_*)) } override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra) override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor) } class GaussianProcessRegressionModel private[regression](override val uid: String, private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor) extends RegressionModel[Vector, GaussianProcessRegressionModel] { override protected def predict(features: Vector): Double = { gaussianProjectedProcessRawPredictor.predict(features)._1 } override def copy(extra: ParamMap): GaussianProcessRegressionModel = { val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra) newModel.setParent(parent) } }
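Because the class extends Regressor, the standard Predictor setters plus fit/transform apply. trainDF and testDF are assumed DataFrames with a "features" Vector column and a Double "label" column; the kernel and expert-related parameters specific to spark-gp are left at their defaults here, which may or may not be sensible for a real dataset.

import org.apache.spark.ml.regression.GaussianProcessRegression

val gpr = new GaussianProcessRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")

val model  = gpr.fit(trainDF)          // GaussianProcessRegressionModel
val scored = model.transform(testDF)   // adds a "prediction" column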
Example 63
Source File: MLPipelineTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.spark.mlpipeline import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel} import org.apache.spark.ml.param.ParamMap import org.scalaml.Logging import org.scalaml.spark.ResourcesLoader import org.scalatest.{FlatSpec, Matchers} final class MLPipelineTest extends FlatSpec with Matchers with Logging { protected val name = "Spark ML pipeline" final val trainFile = "/data/spark/mlpipeline_training.csv" final val testFile = "/data/spark/mlpipeline_test.csv" final val columns = Array[String]("date", "asset", "region", "agent") it should s"$name simple predictor" in { show(s"$name simple predictor") (for { trainPath <- ResourcesLoader.getPath(trainFile) testPath <- ResourcesLoader.getPath(testFile) } yield { val predictor = new SimplePredictor[LogisticRegressionModel]( new LogisticRegression().setMaxIter(5).setRegParam(0.1), columns, trainPath ) (predictor, predictor.classify(predictor(), testPath)) }).map { case (predictor, output) => { output.printSchema val predictedValues = output.select("prediction").collect.map(_.getDouble(0)) output.show predictor.stop predictedValues(0) } should be(0.0) } } it should s"$name cross validation" in { show(s"$name cross validation") (for { trainPath <- ResourcesLoader.getPath(trainFile) testPath <- ResourcesLoader.getPath(testFile) } yield { val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.1) val paramsMap = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1) val validator = new ValidatedPredictor[LogisticRegressionModel](lr, columns, trainPath) val (f1, auROC) = validator.trainingWithSummary.getOrElse((Double.NaN, Double.NaN)) println(s"F1-measure = ${f1} auROC = ${auROC}") validator.stop f1 should be(0.025 +- 0.005) auROC should be(0.600 +- 0.005) }) } } // -------------------------------- EOF ---------------------------------------------
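The test builds its override map with chained put calls; the companion-object factory is equivalent, and either form can be passed to fit to apply the overrides for a single training run. The training DataFrame is an assumption.

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

val lr = new LogisticRegression()

val viaPut     = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1)
val viaFactory = ParamMap(lr.maxIter -> 30, lr.regParam -> 0.1)

// values in the ParamMap win over whatever is currently set on lr
val model = lr.fit(training, viaFactory)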
Example 64
Source File: MovingAverage.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.ml.param.{IntParam, ParamMap} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{VectorUDT, Vectors} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types._ def setOutputCol(value: String): this.type = set(outputCol, value) setDefault(windowSize -> 3) override def transform(dataSet: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataSet.schema) val sparkContext = dataSet.sqlContext.sparkContext val inputType = outputSchema($(inputCol)).dataType val inputTypeBr = sparkContext.broadcast(inputType) val dataSetRdd = dataSet.rdd val inputColName = sparkContext.broadcast($(inputCol)) val inputColIndex = dataSet.columns.indexOf($(inputCol)) val inputColIndexBr = sparkContext.broadcast(inputColIndex) val windowSizeBr = sparkContext.broadcast($(windowSize)) val maRdd = dataSetRdd.map { case (row: Row) => val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) { val vector = row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value) (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1))) } else { val iterable = row.getAs[Iterable[Double]](inputColName.value) (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1))) } val (before, after) = row.toSeq.splitAt(inputColIndexBr.value) Row( (before :+ rawValue) ++ after.tail :+ MovingAverageCalc .simpleMovingAverageArray(array, windowSizeBr.value): _* ) } dataSet.sqlContext.createDataFrame(maRdd, outputSchema) } override def transformSchema(schema: StructType): StructType = { schema.add(StructField($(outputCol), ArrayType(DoubleType))) } override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra) } object MovingAverageCalc { private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = { (for (i <- 1 to values.length) yield //TODO rollback this comment with the right size of features to make the meanaverage return // the features values for the first values of the calc if (i < period) 0d //values(i) else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d) } } object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] { override def load(path: String): MovingAverage[_] = super.load(path) }
Example 65
Source File: L9-15MLPipeline.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.runtime.universe import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.regression.RandomForestRegressor import org.apache.spark.sql.SQLContext import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.ml.param.ParamMap object MLPipelineApp { case class Activity(label: Double, accelXHand: Double, accelYHand: Double, accelZHand: Double, accelXChest: Double, accelYChest: Double, accelZChest: Double, accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double) def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val sqlC = new SQLContext(ssc.sparkContext) import sqlC.implicits._ val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) == "4" || f(1) == "5") .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .foreachRDD(rdd => { if (!rdd.isEmpty) { val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF() val split = accelerometer.randomSplit(Array(0.3, 0.7)) val test = split(0) val train = split(1) val assembler = new VectorAssembler() .setInputCols(Array( "accelXHand", "accelYHand", "accelZHand", "accelXChest", "accelYChest", "accelZChest", "accelXAnkle", "accelYAnkle", "accelZAnkle")) .setOutputCol("vectors") val normalizer = new Normalizer() .setInputCol(assembler.getOutputCol) .setOutputCol("features") val regressor = new RandomForestRegressor() val pipeline = new Pipeline() .setStages(Array(assembler, normalizer, regressor)) val pMap = ParamMap(normalizer.p -> 1.0) val model = pipeline.fit(train, pMap) val prediction = model.transform(test) prediction.show() } }) ssc.start() ssc.awaitTermination() } }
Example 66
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 67
Source File: WordLengthFilter.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.WordLengthFilterModel import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params} import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset} final def getWordLength: Int = $(wordLength) } class WordLengthFilter(override val uid: String) extends Transformer with WordLengthFilterParams with DefaultParamsWritable { val defaultLength = 3 var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3 def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words")) def this() = this(new WordLengthFilterModel) def setInputCol(value: String): this.type = set(inputCol, value) def setOutputCol(value: String): this.type = set(outputCol, value) def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value) override def transform(dataset: Dataset[_]): DataFrame = { if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength) val filterWordsUdf = udf { (words: Seq[String]) => model(words) } dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol)))) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { require(schema($(inputCol)).dataType.isInstanceOf[ArrayType], s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(outputCol)), s"Output column ${$(outputCol)} already exists.") StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true))) } } object WordLengthFilter extends DefaultParamsReadable[WordLengthFilter] { override def load(path: String): WordLengthFilter = super.load(path) }
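All three setters are defined above, so a plain transform call is enough. tokenizedDF is an assumed DataFrame whose "words" column holds Seq[String].

import org.apache.spark.ml.mleap.feature.WordLengthFilter

val filter = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("filteredWords")
  .setWordLength(4)   // keep only tokens that clear the length threshold

val filteredDF = filter.transform(tokenizedDF)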
Example 68
Source File: MathUnary.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf

    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }
}
Example 69
Source File: StringMap.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._

    private val className = classOf[StringMap].getName

    override def load(path: String): StringMap = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head()
      val labels = data.getAs[Map[String, Double]](0)
      val handleInvalid = HandleInvalid.fromString(data.getAs[String](1))
      val defaultValue = data.getAs[Double](2)

      val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue)
      val transformer = new StringMap(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }
}
Example 70
Source File: Sampler.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random

class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))
    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")}")
      }

    ss.stop()
  }
}
Example 71
Source File: DLEstimatorBase.scala From BigDL with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row}

abstract class DLEstimatorBase[Learner <: DLEstimatorBase[Learner, M],
    M <: DLTransformerBase[M]]
  extends Estimator[M] with HasLabelCol {

  protected def internalFit(dataFrame: DataFrame): M

  override def fit(dataFrame: DataFrame): M = {
    transformSchema(dataFrame.schema, logging = true)
    internalFit(dataFrame)
  }

  override def copy(extra: ParamMap): Learner = defaultCopy(extra)
}
Example 72
Source File: DLClassifier.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.dlframes import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric import com.intel.analytics.bigdl.{Criterion, Module} import org.apache.spark.ml.adapter.SchemaUtils import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types._ import scala.reflect.ClassTag @deprecated("`DLClassifierModel` is deprecated." + "com.intel.analytics.bigdl.dlframes is deprecated in BigDL 0.11, " + "and will be removed in future releases", "0.10.0") class DLClassifierModel[T: ClassTag]( @transient override val model: Module[T], featureSize : Array[Int], override val uid: String = "DLClassifierModel" )(implicit ev: TensorNumeric[T]) extends DLModel[T](model, featureSize) { protected override def outputToPrediction(output: Tensor[T]): Any = { if (output.size().deep == Array(1).deep) { val raw = ev.toType[Double](output.toArray().head) if (raw > 0.5) 1.0 else 0.0 } else { ev.toType[Double](output.max(1)._2.valueAt(1)) } } override def transformSchema(schema : StructType): StructType = { validateDataType(schema, $(featuresCol)) SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType) } }
Example 73
Source File: ParamGridBuilderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"), (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1"))
    validateGrid(maps1, expected1)
  }
}
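The suite above exercises ParamGridBuilder directly; in application code the resulting Array[ParamMap] is usually handed to a tuning estimator. The sketch below is illustrative only: LogisticRegression, BinaryClassificationEvaluator and trainingDf are assumed stand-ins, not part of the suite.

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}

val lr = new LogisticRegression().setMaxIter(20)

// Each ParamMap in the grid is one candidate configuration to be cross-validated.
val grid = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .addGrid(lr.elasticNetParam, Array(0.0, 0.5))
  .build()

val cv = new CrossValidator()
  .setEstimator(lr)
  .setEvaluator(new BinaryClassificationEvaluator())
  .setEstimatorParamMaps(grid)
  .setNumFolds(3)

// val cvModel = cv.fit(trainingDf)   // trainingDf: DataFrame with "label" and "features" columns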
Example 74
Source File: RegressionEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}

  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
}
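RegressionEvaluator reads its metric from the metricName param, so a ParamMap can override it for a single call via Evaluator.evaluate(dataset, paramMap) without mutating the evaluator. A small sketch, assuming a predictions DataFrame with the default "prediction" and "label" columns:

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.param.ParamMap

// Hypothetical usage; `predictions` is an assumed DataFrame, not defined in the listing above.
val evaluator = new RegressionEvaluator()   // metricName defaults to "rmse"

// val rmse = evaluator.evaluate(predictions)
// Override the metric for one call only; the evaluator keeps its default afterwards:
// val mae = evaluator.evaluate(predictions, ParamMap(evaluator.metricName -> "mae"))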
Example 75
Source File: MulticlassClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 76
Source File: UDFTransformer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable} import com.microsoft.ml.spark.core.env.InternalWrapper import com.microsoft.ml.spark.core.serialize.ComplexParam import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.execution.python.UserDefinedPythonFunction import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.types.{DataType, StructField, StructType} import org.apache.spark.sql.{Column, DataFrame, Dataset} import org.apache.spark.sql.functions.col object UDFTransformer extends ComplexParamsReadable[UDFTransformer] override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) if (isSet(inputCol)) { dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol))) } else { dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*)) } } def validateAndTransformSchema(schema: StructType): StructType = { if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*)) schema.add(StructField(getOutputCol, getDataType)) } def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema) def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra) }
Example 77
Source File: SQLTransformer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType

  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
}
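As the transform above shows, SQLTransformer substitutes the __THIS__ placeholder with a temp view over the input. A short, self-contained usage sketch; the column names v1/v2 are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.SQLTransformer

// Hypothetical driver code for SQLTransformer.
val spark = SparkSession.builder().master("local[*]").appName("sql-transformer").getOrCreate()
import spark.implicits._

val df = Seq((0, 1.0, 3.0), (1, 2.0, 5.0)).toDF("id", "v1", "v2")

val sqlTrans = new SQLTransformer()
  .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

// Adds the derived columns v3 and v4 while keeping the originals.
sqlTrans.transform(df).show()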
Example 78
Source File: HashingTF.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Experimental import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.mllib.feature import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StructType} def setNumFeatures(value: Int): this.type = set(numFeatures, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val hashingTF = new feature.HashingTF($(numFeatures)) val t = udf { terms: Seq[_] => hashingTF.transform(terms) } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.isInstanceOf[ArrayType], s"The input column must be ArrayType, but got $inputType.") val attrGroup = new AttributeGroup($(outputCol), $(numFeatures)) SchemaUtils.appendColumn(schema, attrGroup.toStructField()) } override def copy(extra: ParamMap): HashingTF = defaultCopy(extra) }
Example 79
Source File: PredictorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg._ import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Dataset import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext { import PredictorSuite._ test("should support all NumericType labels and not support other types") { val df = spark.createDataFrame(Seq( (0, Vectors.dense(0, 2, 3)), (1, Vectors.dense(0, 3, 9)), (0, Vectors.dense(0, 2, 6)) )).toDF("label", "features") val types = Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0)) val predictor = new MockPredictor() types.foreach { t => predictor.fit(df.select(col("label").cast(t), col("features"))) } intercept[IllegalArgumentException] { predictor.fit(df.select(col("label").cast(StringType), col("features"))) } } } object PredictorSuite { class MockPredictor(override val uid: String) extends Predictor[Vector, MockPredictor, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictor")) override def train(dataset: Dataset[_]): MockPredictionModel = { require(dataset.schema("label").dataType == DoubleType) new MockPredictionModel(uid) } override def copy(extra: ParamMap): MockPredictor = throw new NotImplementedError() } class MockPredictionModel(override val uid: String) extends PredictionModel[Vector, MockPredictionModel] { def this() = this(Identifiable.randomUID("mockpredictormodel")) override def predict(features: Vector): Double = throw new NotImplementedError() override def copy(extra: ParamMap): MockPredictionModel = throw new NotImplementedError() } }
Example 80
Source File: ParamGridBuilderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tuning import scala.collection.mutable import org.apache.spark.SparkFunSuite import org.apache.spark.ml.param.{ParamMap, TestParams} class ParamGridBuilderSuite extends SparkFunSuite { val solver = new TestParams() import solver.{inputCol, maxIter} test("param grid builder") { def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = { assert(maps.size === expected.size) maps.foreach { m => val tuple = (m(maxIter), m(inputCol)) assert(expected.contains(tuple)) expected.remove(tuple) } assert(expected.isEmpty) } val maps0 = new ParamGridBuilder() .baseOn(maxIter -> 10) .addGrid(inputCol, Array("input0", "input1")) .build() val expected0 = mutable.Set( (10, "input0"), (10, "input1")) validateGrid(maps0, expected0) val maps1 = new ParamGridBuilder() .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten .addGrid(maxIter, Array(10, 20)) .addGrid(inputCol, Array("input0", "input1")) .build() val expected1 = mutable.Set( (10, "input0"), (20, "input0"), (10, "input1"), (20, "input1")) validateGrid(maps1, expected1) } }
Example 81
Source File: RegressionEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.RegressionMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, FloatType} @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "rmse") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType)) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) .rdd .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) val metric = $(metricName) match { case "rmse" => metrics.rootMeanSquaredError case "mse" => metrics.meanSquaredError case "r2" => metrics.r2 case "mae" => metrics.meanAbsoluteError } metric } @Since("1.4.0") override def isLargerBetter: Boolean = $(metricName) match { case "rmse" => false case "mse" => false case "r2" => true case "mae" => false } @Since("1.5.0") override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra) } @Since("1.6.0") object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] { @Since("1.6.0") override def load(path: String): RegressionEvaluator = super.load(path) }
Example 82
Source File: MulticlassClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.5.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "f1") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType) SchemaUtils.checkNumericType(schema, $(labelCol)) val predictionAndLabels = dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new MulticlassMetrics(predictionAndLabels) val metric = $(metricName) match { case "f1" => metrics.weightedFMeasure case "weightedPrecision" => metrics.weightedPrecision case "weightedRecall" => metrics.weightedRecall case "accuracy" => metrics.accuracy } metric } @Since("1.5.0") override def isLargerBetter: Boolean = true @Since("1.5.0") override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object MulticlassClassificationEvaluator extends DefaultParamsReadable[MulticlassClassificationEvaluator] { @Since("1.6.0") override def load(path: String): MulticlassClassificationEvaluator = super.load(path) }
Example 83
Source File: SQLTransformer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.Transformer import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} import org.apache.spark.sql.types.StructType @Since("1.6.0") def getStatement: String = $(statement) private val tableIdentifier: String = "__THIS__" @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val tableName = Identifiable.randomUID(uid) dataset.createOrReplaceTempView(tableName) val realStatement = $(statement).replace(tableIdentifier, tableName) val result = dataset.sparkSession.sql(realStatement) dataset.sparkSession.catalog.dropTempView(tableName) result } @Since("1.6.0") override def transformSchema(schema: StructType): StructType = { val spark = SparkSession.builder().getOrCreate() val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty)) val dummyDF = spark.createDataFrame(dummyRDD, schema) val tableName = Identifiable.randomUID(uid) val realStatement = $(statement).replace(tableIdentifier, tableName) dummyDF.createOrReplaceTempView(tableName) val outputSchema = spark.sql(realStatement).schema spark.catalog.dropTempView(tableName) outputSchema } @Since("1.6.0") override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra) } @Since("1.6.0") object SQLTransformer extends DefaultParamsReadable[SQLTransformer] { @Since("1.6.0") override def load(path: String): SQLTransformer = super.load(path) }
Example 84
Source File: RepartitionSuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.Pipeline import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.MLReadable class RepartitionSuite extends TestBase with TransformerFuzzing[Repartition] { import session.implicits._ lazy val input = Seq( (0, "guitars", "drums"), (1, "piano", "trumpet"), (2, "bass", "cymbals"), (3, "guitars", "drums"), (4, "piano", "trumpet"), (5, "bass", "cymbals"), (6, "guitars", "drums"), (7, "piano", "trumpet"), (8, "bass", "cymbals"), (9, "guitars", "drums"), (10, "piano", "trumpet"), (11, "bass", "cymbals") ).toDF("numbers", "words", "more") test("Work for several values of n") { def test(n: Int): Unit = { val result = new Repartition() .setN(n) .transform(input) assert(result.rdd.getNumPartitions == n) () } List(1, 2, 3, 10).foreach(test) } test("Should allow a user to set the partitions specifically in pipeline transform") { val r = new Repartition().setN(1) val pipe = new Pipeline().setStages(Array(r)) val fitPipe = pipe.fit(input) assert(fitPipe.transform(input).rdd.getNumPartitions==1) assert(fitPipe.transform(input, ParamMap(r.n->5)).rdd.getNumPartitions ==5) } def testObjects(): Seq[TestObject[Repartition]] = List(new TestObject( new Repartition().setN(1), input)) def reader: MLReadable[_] = Repartition }
Example 85
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        }
        else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType
        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
Example 86
Source File: EvaluationUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.automl

import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.schema.SchemaConstants
import com.microsoft.ml.spark.train.{TrainClassifier, TrainRegressor, TrainedClassifierModel, TrainedRegressorModel}
import org.apache.spark.injections.RegressionUtils
import org.apache.spark.ml.classification.{ClassificationModel, Classifier}
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.regression._

object EvaluationUtils {
  val ModelTypeUnsupportedErr = "Model type not supported for evaluation"

  // Find type of trained models
  def getModelType(model: PipelineStage): String = {
    model match {
      case _: TrainRegressor => SchemaConstants.RegressionKind
      case _: TrainClassifier => SchemaConstants.ClassificationKind
      case _: Classifier[_, _, _] => SchemaConstants.ClassificationKind
      case regressor: PipelineStage if RegressionUtils.isRegressor(regressor) => SchemaConstants.RegressionKind
      case _: DecisionTreeRegressor => SchemaConstants.RegressionKind
      case _: GBTRegressor => SchemaConstants.RegressionKind
      case _: RandomForestRegressor => SchemaConstants.RegressionKind
      case _: TrainedRegressorModel => SchemaConstants.RegressionKind
      case _: TrainedClassifierModel => SchemaConstants.ClassificationKind
      case evm: BestModel => getModelType(evm.getBestModel)
      case _: ClassificationModel[_, _] => SchemaConstants.ClassificationKind
      case _: RegressionModel[_, _] => SchemaConstants.RegressionKind
      case _ => throw new Exception(ModelTypeUnsupportedErr)
    }
  }

  def getMetricWithOperator(model: PipelineStage, evaluationMetric: String): (String, Ordering[Double]) = {
    val modelType = getModelType(model)
    getMetricWithOperator(modelType, evaluationMetric)
  }

  def getMetricWithOperator(modelType: String, evaluationMetric: String): (String, Ordering[Double]) = {
    val chooseHighest = Ordering.Double
    val chooseLowest = Ordering.Double.reverse
    val (evaluationMetricColumnName, operator): (String, Ordering[Double]) = modelType match {
      case SchemaConstants.RegressionKind =>
        evaluationMetric match {
          case MetricConstants.MseSparkMetric => (MetricConstants.MseColumnName, chooseLowest)
          case MetricConstants.RmseSparkMetric => (MetricConstants.RmseColumnName, chooseLowest)
          case MetricConstants.R2SparkMetric => (MetricConstants.R2ColumnName, chooseHighest)
          case MetricConstants.MaeSparkMetric => (MetricConstants.MaeColumnName, chooseLowest)
          case _ => throw new Exception("Metric is not supported for regressors")
        }
      case SchemaConstants.ClassificationKind =>
        evaluationMetric match {
          case MetricConstants.AucSparkMetric => (MetricConstants.AucColumnName, chooseHighest)
          case MetricConstants.PrecisionSparkMetric => (MetricConstants.PrecisionColumnName, chooseHighest)
          case MetricConstants.RecallSparkMetric => (MetricConstants.RecallColumnName, chooseHighest)
          case MetricConstants.AccuracySparkMetric => (MetricConstants.AccuracyColumnName, chooseHighest)
          case _ => throw new Exception("Metric is not supported for classifiers")
        }
      case _ => throw new Exception("Model type not supported for evaluation")
    }
    (evaluationMetricColumnName, operator)
  }

  def getModelParams(model: Transformer): ParamMap = {
    model match {
      case reg: TrainedRegressorModel => reg.getParamMap
      case cls: TrainedClassifierModel => cls.getParamMap
      case evm: BestModel => getModelParams(evm.getBestModel)
      case _ => throw new Exception("Model type not supported for evaluation")
    }
  }

  def modelParamsToString(model: Transformer): String =
    getModelParams(model).toSeq.map(pv => s"${pv.param.name}: ${pv.value}").sorted.mkString(", ")
}
Example 87
Source File: Lambda.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.stages import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.SparkContext import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.{ParamMap, UDFParam} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} object Lambda extends ComplexParamsReadable[Lambda] { def apply(f: Dataset[_] => DataFrame): Lambda = { new Lambda().setTransform(f) } } class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("Lambda")) val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function") def setTransform(f: Dataset[_] => DataFrame): this.type = { set(transformFunc, udf(f, StringType)) } def getTransform: Dataset[_] => DataFrame = { $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame] } val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation") def setTransformSchema(f: StructType => StructType): this.type = { set(transformSchemaFunc, udf(f, StringType)) } def getTransformSchema: StructType => StructType = { $(transformSchemaFunc).f.asInstanceOf[StructType => StructType] } override def transform(dataset: Dataset[_]): DataFrame = { getTransform(dataset) } def transformSchema(schema: StructType): StructType = { if (get(transformSchemaFunc).isEmpty) { val sc = SparkContext.getOrCreate() val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema) transform(df).schema } else { getTransformSchema(schema) } } def copy(extra: ParamMap): Lambda = defaultCopy(extra) }
Example 88
Source File: PipelineSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import org.mockito.Matchers.{any, eq => meq} import org.mockito.Mockito.when import org.scalatest.mock.MockitoSugar.mock import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.HashingTF import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.DataFrame class PipelineSuite extends SparkFunSuite { abstract class MyModel extends Model[MyModel] test("pipeline") { val estimator0 = mock[Estimator[MyModel]] val model0 = mock[MyModel] val transformer1 = mock[Transformer] val estimator2 = mock[Estimator[MyModel]] val model2 = mock[MyModel] val transformer3 = mock[Transformer] val dataset0 = mock[DataFrame] val dataset1 = mock[DataFrame] val dataset2 = mock[DataFrame] val dataset3 = mock[DataFrame] val dataset4 = mock[DataFrame] when(estimator0.copy(any[ParamMap])).thenReturn(estimator0) when(model0.copy(any[ParamMap])).thenReturn(model0) when(transformer1.copy(any[ParamMap])).thenReturn(transformer1) when(estimator2.copy(any[ParamMap])).thenReturn(estimator2) when(model2.copy(any[ParamMap])).thenReturn(model2) when(transformer3.copy(any[ParamMap])).thenReturn(transformer3) when(estimator0.fit(meq(dataset0))).thenReturn(model0) when(model0.transform(meq(dataset0))).thenReturn(dataset1) when(model0.parent).thenReturn(estimator0) when(transformer1.transform(meq(dataset1))).thenReturn(dataset2) when(estimator2.fit(meq(dataset2))).thenReturn(model2) when(model2.transform(meq(dataset2))).thenReturn(dataset3) when(model2.parent).thenReturn(estimator2) when(transformer3.transform(meq(dataset3))).thenReturn(dataset4) val pipeline = new Pipeline() .setStages(Array(estimator0, transformer1, estimator2, transformer3)) val pipelineModel = pipeline.fit(dataset0) assert(pipelineModel.stages.length === 4) assert(pipelineModel.stages(0).eq(model0)) assert(pipelineModel.stages(1).eq(transformer1)) assert(pipelineModel.stages(2).eq(model2)) assert(pipelineModel.stages(3).eq(transformer3)) val output = pipelineModel.transform(dataset0) assert(output.eq(dataset4)) } test("pipeline with duplicate stages") { val estimator = mock[Estimator[MyModel]] val pipeline = new Pipeline() .setStages(Array(estimator, estimator)) val dataset = mock[DataFrame] intercept[IllegalArgumentException] { pipeline.fit(dataset) } } test("PipelineModel.copy") { val hashingTF = new HashingTF() .setNumFeatures(100) val model = new PipelineModel("pipeline", Array[Transformer](hashingTF)) val copied = model.copy(ParamMap(hashingTF.numFeatures -> 10)) require(copied.stages(0).asInstanceOf[HashingTF].getNumFeatures === 10, "copy should handle extra stage params") } }
Example 89
Source File: IsolationForest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.isolationforest import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util._ import org.apache.spark.ml.{Estimator, Model} import com.linkedin.relevance.isolationforest.{IsolationForestParams, IsolationForest => IsolationForestSource, IsolationForestModel => IsolationForestModelSource} import com.microsoft.ml.spark.core.contracts.Wrappable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types.StructType object IsolationForest extends DefaultParamsReadable[IsolationForest] class IsolationForest(override val uid: String, val that: IsolationForestSource) extends Estimator[IsolationForestModel] with IsolationForestParams with DefaultParamsWritable with Wrappable { def this(uid: String) = this(uid, new IsolationForestSource(uid)) def this() = this(Identifiable.randomUID("IsolationForest")) override def copy(extra: ParamMap): IsolationForest = new IsolationForest(uid, that.copy(extra)) override def fit(data: Dataset[_]): IsolationForestModel = new IsolationForestModel(uid, that.fit(data)) override def transformSchema(schema: StructType): StructType = that.transformSchema(schema) } class IsolationForestModel(override val uid: String, val that: IsolationForestModelSource) extends Model[IsolationForestModel] with MLWritable { override def copy(extra: ParamMap): IsolationForestModel = new IsolationForestModel(uid, that.copy(extra)) override def transform(data: Dataset[_]): DataFrame = that.transform(data) override def transformSchema(schema: StructType): StructType = that.transformSchema(schema) override def write: MLWriter = that.write } class IsolationForestModelReader extends MLReader[IsolationForestModel] with Serializable { override def load(path: String): IsolationForestModel = { val that = IsolationForestModelSource.load(path) new IsolationForestModel(that.uid, that) } } object IsolationForestModel extends MLReadable[IsolationForestModel] { override def read: MLReader[IsolationForestModel] = new IsolationForestModelReader }
Example 90
Source File: S2CellTransformer.scala From spark-ext with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import com.google.common.geometry.{S2LatLng, S2CellId} import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.NominalAttribute import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.{DoubleType, StructType} class S2CellTransformer(override val uid: String) extends Transformer { def this() = this(Identifiable.randomUID("S2CellTransformer")) // Input/Output column names val latCol: Param[String] = new Param[String](this, "latCol", "latitude column") val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column") val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column") val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]", (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i)) // Default parameters setDefault( latCol -> "lat", lonCol -> "lon", cellCol -> "cell", level -> 10 ) def getLatCol: String = $(latCol) def getLonCol: String = $(lonCol) def getCellCol: String = $(cellCol) def getLevel: Int = $(level) def setLatCol(value: String): this.type = set(latCol, value) def setLonCol(value: String): this.type = set(lonCol, value) def setCellCol(value: String): this.type = set(cellCol, value) def setLevel(value: Int): this.type = set(level, value) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val currentLevel = $(level) val t = udf { (lat: Double, lon: Double) => val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon)) cellId.parent(currentLevel).toToken } val metadata = outputSchema($(cellCol)).metadata dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata)) } override def transformSchema(schema: StructType): StructType = { val latColumnName = $(latCol) val latDataType = schema(latColumnName).dataType require(latDataType == DoubleType, s"The latitude column $latColumnName must be Double type, " + s"but got $latDataType.") val lonColumnName = $(lonCol) val lonDataType = schema(lonColumnName).dataType require(lonDataType == DoubleType, s"The longitude column $lonColumnName must be Double type, " + s"but got $lonDataType.") val inputFields = schema.fields val outputColName = $(cellCol) require(inputFields.forall(_.name != outputColName), s"Output column $outputColName already exists.") val attr = NominalAttribute.defaultAttr.withName($(cellCol)) val outputFields = inputFields :+ attr.toStructField() StructType(outputFields) } override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra) }
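A minimal usage sketch for the S2 cell transformer above, assuming a local SparkSession and the com.google.common.geometry dependency on the classpath; the sample coordinates and the "city" column are illustrative only.

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.feature.S2CellTransformer

// Hypothetical driver code for S2CellTransformer.
val spark = SparkSession.builder().master("local[*]").appName("s2-cells").getOrCreate()
import spark.implicits._

val points = Seq(
  ("NYC", 40.7128, -74.0060),
  ("SF",  37.7749, -122.4194)
).toDF("city", "lat", "lon")

val s2 = new S2CellTransformer()
  .setLevel(12)          // finer granularity than the default level 10
  .setCellCol("s2_cell")

// Adds the "s2_cell" token column; latitude/longitude column names use the defaults "lat" and "lon".
s2.transform(points).show(false)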
Example 91
Source File: RecursivePipeline.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp import org.apache.spark.internal.Logging import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter} import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset} import scala.collection.mutable.ListBuffer class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline { def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty) def this(uid: String) = this(uid, Array.empty) def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages) this.setStages(baseStages) override def fit(dataset: Dataset[_]): PipelineModel = { transformSchema(dataset.schema, logging = true) val theStages = $(stages) var indexOfLastEstimator = -1 theStages.view.zipWithIndex.foreach { case (stage, index) => stage match { case _: Estimator[_] => indexOfLastEstimator = index case _ => } } var curDataset = dataset val transformers = ListBuffer.empty[Transformer] theStages.view.zipWithIndex.foreach { case (stage, index) => if (index <= indexOfLastEstimator) { val transformer = stage match { case estimator: HasRecursiveFit[_] => estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset)) case estimator: Estimator[_] => estimator.fit(curDataset) case t: Transformer => t case _ => throw new IllegalArgumentException( s"Does not support stage $stage of type ${stage.getClass}") } if (index < indexOfLastEstimator) { curDataset = transformer.transform(curDataset) } transformers += transformer } else { transformers += stage.asInstanceOf[Transformer] } } createPipeline(dataset, transformers.toArray) } } class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel) extends Model[RecursivePipelineModel] with MLWritable with Logging { def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline) // drops right at most because is itself included private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel = new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset) override def copy(extra: ParamMap): RecursivePipelineModel = { new RecursivePipelineModel(uid, innerPipeline.copy(extra)) } override def write: MLWriter = { innerPipeline.write } override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match { case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset)) case t: AnnotatorModel[_] if t.getLazyAnnotator => cur case t: Transformer => t.transform(cur) }) } override def transformSchema(schema: StructType): StructType = { innerPipeline.transformSchema(schema) } }
Example 92
Source File: AnnotatorApproach.scala From spark-nlp with Apache License 2.0 | 5 votes |
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable

  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
}
Example 93
Source File: Cleaner.scala From CkoocNLP with Apache License 2.0 | 5 votes |
package functions.clean import com.hankcs.hanlp.HanLP import config.paramconf.{HasOutputCol, HasInputCol} import functions.MySchemaUtils import functions.clean.chinese.BCConvert import org.apache.spark.ml.Transformer import org.apache.spark.ml.param.{IntParam, Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{StringType, StructType} import org.apache.spark.sql.{DataFrame, Dataset} setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1) override def transform(dataset: Dataset[_]): DataFrame = { val outputSchema = transformSchema(dataset.schema, logging = true) val cleanFunc = udf {line: String => var cleaned = "" getFanJian match { case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line) case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line) case _ => cleaned = line } getQuanBan match { case "q2b" => cleaned = BCConvert.qj2bj(cleaned) case "b2q" => cleaned = BCConvert.bj2qj(cleaned) case _ => cleaned = cleaned } cleaned } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record => val outputIndex = record.fieldIndex($(outputCol)) record.getString(outputIndex).length >= getMinLineLen } } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) override def transformSchema(schema: StructType): StructType = { val inputType = schema($(inputCol)).dataType require(inputType.typeName.equals(StringType.typeName), s"Input type must be StringType but got $inputType.") MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable) } } object Cleaner extends DefaultParamsReadable[Cleaner] { override def load(path: String): Cleaner = super.load(path) }
Example 94
Source File: HoltWintersBestModelFinder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasGroupByCol import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.rdd.RDD import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import scala.reflect.ClassTag class HoltWintersBestModelFinder[G]( override val uid: String )(implicit kt: ClassTag[G]) extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]] with DefaultParamsWritable with HasGroupByCol with TimeSeriesBestModelFinder { def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type = set(timeSeriesEvaluator, eval) def setEstimatorParamMaps(value: Array[ParamMap]): this.type = set(estimatorParamMaps, value) def setNFutures(value: Int): this.type = set(nFutures, value) override def setValidationCol(value: String): this.type = set(validationCol, value) def setLabelCol(label: String): this.type = set(labelCol, label) def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy)) def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima")) def modelEvaluation( idModels: RDD[(G, Row, Option[UberHoltWintersModel])] ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = { val eval = $(timeSeriesEvaluator) val broadcastEvaluator = idModels.context.broadcast(eval) idModels.filter(_._3.isDefined).map { case (id, row, models) => val evaluatedModels = models.map { model => holtWintersEvaluation(row, model, broadcastEvaluator, id) }.head log.warn(s"best model reach ${evaluatedModels._2.metricResult}") (id, evaluatedModels) } } override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = { val splitDs = split(dataSet, $(nFutures)) val idModels = splitDs.rdd.map(train) new HoltWintersModel[G](uid, modelEvaluation(idModels)) .setValidationCol($(validationCol)) .asInstanceOf[HoltWintersModel[G]] } def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = { val id = row.getAs[G]($(groupByCol).get) val result = try { val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol)) val ts:org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray); Some( UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures)) ) } catch { case e: Exception => log.error( s"Got the following Exception ${e.getLocalizedMessage} in id $id" ) None } (id, row, result) } } object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] { override def load(path: String): HoltWintersBestModelFinder[_] = super.load(path) }
Example 95
Source File: XGBoostBigModelTimeSeries.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import java.sql.Timestamp import eleflow.uberdata.IUberdataForecastUtil import eleflow.uberdata.core.data.DataTransformer import eleflow.uberdata.enums.SupportedAlgorithm import ml.dmlc.xgboost4j.scala.spark.XGBoostModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasTimeCol import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, _} class XGBoostBigModelTimeSeries[I](override val uid: String, override val models: Seq[(ParamMap, XGBoostModel)]) extends XGBoostBigModel[I](uid, models) with HasTimeCol{ def setTimecol(time: String): this.type = set(timeCol, Some(time)) override def transform(dataSet: Dataset[_]): DataFrame = { val prediction = predict(dataSet) val rows = dataSet.rdd .map { case (row: Row) => (DataTransformer.toFloat(row.getAs($(idCol))), (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME), row.getAs[java.sql.Timestamp]($(timeCol).get))) } .join(prediction) .map { case (id, ((features, time), predictValue)) => Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue) } dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(Array( StructField($(idCol), FloatType), StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT), StructField($(timeCol).get, TimestampType), StructField(IUberdataForecastUtil.ALGORITHM, StringType), StructField("prediction", FloatType) ) ) }
Example 96
Source File: HoltWintersEstimator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Dataset

class HoltWintersBestModel[T, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(T, M)],
  val validationMetrics: RDD[(T, ModelParamEvaluation[T])]
) extends Model[HoltWintersBestModel[T, M]]
  with TimeSeriesBestModelFinderParam[T] {

  //TODO look for this method usage to see if it can be removed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = {
    val copied = new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
}
Example 97
Source File: AllColumnsTimeSeriesGenerator.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag

  def setOutputCol(value: String): this.type = set(outputCol, value)

  //  override def transform(dataSet: DataFrame): DataFrame = {
  override def transform(dataSet: Dataset[_]): DataFrame = {
    val rdd = dataSet.rdd
    val sparkContext = dataSet.sqlContext.sparkContext
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol)))
    val keyValueDataSet = rdd.map { case (row: Row) =>
      Row(
        row.getAs[T](labelColIndex.value),
        row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
      )
    }
    val trainSchema = transformSchema(dataSet.schema)

    dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.filter(_.name == $(labelCol)).head +:
        Seq(StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT))
    )
  }

  override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] = defaultCopy(extra)
}

object AllColumnsTimeSeriesGenerator
  extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] {

  override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] = super.load(path)
}
Example 98
Source File: HoltWintersBestModelEvaluation.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import eleflow.uberdata.enums.SupportedAlgorithm import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import scala.reflect.ClassTag abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]]( implicit kt: ClassTag[L], ord: Ordering[L] = null ) extends BestModelFinder[L, M] with HoltWintersParams { protected def holtWintersEvaluation( row: Row, model: UberHoltWintersModel, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]], id: L ): (UberHoltWintersModel, ModelParamEvaluation[L]) = { val features = row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) log.warn( s"Evaluating forecast for id $id, with parameters " + s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}" ) val expectedResult = row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol) val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures))) model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray val toBeValidated = expectedResult.toArray.zip(forecastToBeValidated.toArray) val metric = broadcastEvaluator.value.evaluate(toBeValidated) val metricName = broadcastEvaluator.value.getMetricName val params = ParamMap().put( ParamPair(gamma, model.gamma), ParamPair(beta, model.beta), ParamPair(alpha, model.alpha) ) (model, new ModelParamEvaluation[L]( id, metric, params, Some(metricName), SupportedAlgorithm.HoltWinters )) } }
Example 99
Source File: VectorizeEncoder.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import eleflow.uberdata.core.data.DataTransformer import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.VectorUDT import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.Dataset import org.apache.spark.sql.types.{StructField, StructType} class VectorizeEncoder(override val uid: String) extends Transformer with HasIdCol with HasTimeCol with HasInputCols with HasLabelCol with HasGroupByCol with HasOutputCol with DefaultParamsWritable { def this() = this(Identifiable.randomUID("vectorizer")) def setIdCol(input: String) = set(idCol, input) def setLabelCol(input: String) = set(labelCol, input) def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy)) def setInputCol(input: Array[String]) = set(inputCols, input) def setTimeCol(time: String) = set(timeCol, Some(time)) def setOutputCol(output: String) = set(outputCol, output) override def transform(dataSet: Dataset[_]): DataFrame = { val context = dataSet.sqlContext.sparkContext val input = context.broadcast($(inputCols)) val allColumnNames = dataSet.schema.map(_.name) val nonInputColumnIndexes = context.broadcast( allColumnNames.zipWithIndex.filter( f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol) || f._1 == $(timeCol).getOrElse(""))) val result = dataSet.rdd.map { case (row: Row) => val rowSeq = row.toSeq val nonInputColumns = nonInputColumnIndexes.value.map { case (_, index) => rowSeq(index) } val size = input.value.length val (values, indices) = input.value .filter(col => row.getAs(col) != null) .map { column => DataTransformer.toDouble(row.getAs(column)) } .zipWithIndex .filter(f => f._1 != 0d) .unzip Row( nonInputColumns :+ org.apache.spark.ml.linalg.Vectors .sparse(size, indices.toArray, values.toArray): _* ) } val newSchema = transformSchema(dataSet.schema) dataSet.sqlContext.createDataFrame(result, newSchema) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType( schema.filter( col => !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol) || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("") ) ).add(StructField($(outputCol), new VectorUDT)) }
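A minimal usage sketch for the encoder above, based only on the setters shown in the listing. The DataFrame df and every column name here are assumptions for illustration, not part of the uberdata source.

// Hypothetical driver code for VectorizeEncoder; df is an assumed input DataFrame.
val encoder = new VectorizeEncoder()
  .setIdCol("id")
  .setTimeCol("time")
  .setLabelCol("label")
  .setGroupByCol("store")
  .setInputCol(Array("sales", "promo"))
  .setOutputCol("features")

// Non-input columns are carried through unchanged, while the input columns are
// packed into a single sparse vector column named "features".
// val encoded = encoder.transform(df)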