org.apache.spark.ml.param.ParamMap Scala Examples

The following examples show how to use org.apache.spark.ml.param.ParamMap. You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
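Before the project examples, here is a minimal sketch of the core ParamMap operations (the LogisticRegression stage and the trainingDF DataFrame are illustrative assumptions; any Params-based pipeline stage works the same way):

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.ml.param.ParamMap

val lr = new LogisticRegression()

// build a map explicitly, then extend or override entries with put(...)
val paramMap = ParamMap(lr.maxIter -> 10)
  .put(lr.regParam, 0.1)

// maps can be combined; on conflicts the right-hand side wins
val combined = paramMap ++ ParamMap(lr.maxIter -> 20)

// read a value back, and pass the map at fit time to override the stage's current settings
val iters = combined(lr.maxIter) // 20
// val model = lr.fit(trainingDF, combined)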
Example 1
Source File: MNISTBenchmark.scala    From spark-knn   with Apache License 2.0    6 votes
package com.github.saurfang.spark.ml.knn.examples

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.classification.{KNNClassifier, NaiveKNNClassifier}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.tuning.{Benchmarker, ParamGridBuilder}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
import org.apache.log4j

import scala.collection.mutable


object MNISTBenchmark {

  val logger = log4j.Logger.getLogger(getClass)

  def main(args: Array[String]): Unit = {
    val ns = if(args.isEmpty) (2500 to 10000 by 2500).toArray else args(0).split(',').map(_.toInt)
    val path = if(args.length >= 2) args(1) else "data/mnist/mnist.bz2"
    val numPartitions = if(args.length >= 3) args(2).toInt else 10
    val models = if(args.length >=4) args(3).split(',') else Array("tree","naive")

    val spark = SparkSession.builder().getOrCreate()
    val sc = spark.sparkContext
    import spark.implicits._

    //read in raw label and features
    val rawDataset = MLUtils.loadLibSVMFile(sc, path)
      .zipWithIndex()
      .filter(_._2 < ns.max)
      .sortBy(_._2, numPartitions = numPartitions)
      .keys
      .toDF()

    // convert "features" from mllib.linalg.Vector to ml.linalg.Vector
    val dataset =  MLUtils.convertVectorColumnsToML(rawDataset)
      .cache()
    dataset.count() //force persist

    val limiter = new Limiter()
    val knn = new KNNClassifier()
      .setTopTreeSize(numPartitions * 10)
      .setFeaturesCol("features")
      .setPredictionCol("prediction")
      .setK(1)
    val naiveKNN = new NaiveKNNClassifier()

    val pipeline = new Pipeline()
      .setStages(Array(limiter, knn))
    val naivePipeline = new Pipeline()
      .setStages(Array(limiter, naiveKNN))

    val paramGrid = new ParamGridBuilder()
      .addGrid(limiter.n, ns)
      .build()

    val bm = new Benchmarker()
      .setEvaluator(new MulticlassClassificationEvaluator)
      .setEstimatorParamMaps(paramGrid)
      .setNumTimes(3)

    val metrics = mutable.ArrayBuffer[String]()
    if(models.contains("tree")) {
      val bmModel = bm.setEstimator(pipeline).fit(dataset)
      metrics += s"knn: ${bmModel.avgTrainingRuntimes.toSeq} / ${bmModel.avgEvaluationRuntimes.toSeq}"
    }
    if(models.contains("naive")) {
      val naiveBMModel = bm.setEstimator(naivePipeline).fit(dataset)
      metrics += s"naive: ${naiveBMModel.avgTrainingRuntimes.toSeq} / ${naiveBMModel.avgEvaluationRuntimes.toSeq}"
    }
    logger.info(metrics.mkString("\n"))
  }
}

class Limiter(override val uid: String) extends Transformer {
  def this() = this(Identifiable.randomUID("limiter"))

  val n: IntParam = new IntParam(this, "n", "number of rows to limit")

  def setN(value: Int): this.type = set(n, value)

  // hack to maintain number of partitions (otherwise it collapses to 1 which is unfair for naiveKNN)
  override def transform(dataset: Dataset[_]): DataFrame = dataset.limit($(n)).repartition(dataset.rdd.partitions.length).toDF()

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = schema
} 
Example 2
Source File: CompareParamGrid.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.impl

import org.apache.spark.ml.param.ParamMap
import org.scalatest.{Assertions, Matchers}


trait CompareParamGrid extends Matchers with Assertions {

  
  // Compares two param grids as sets, ignoring the order of the maps and of the
  // param pairs within each map.
  def gridCompare(g1: Array[ParamMap], g2: Array[ParamMap]): Unit = {
    val g1values = g1.toSet[ParamMap].map(_.toSeq.toSet)
    val g2values = g2.toSet[ParamMap].map(_.toSeq.toSet)
    matchTwoSets(g1values, g2values)
  }

  private def matchTwoSets[T](actual: Set[T], expected: Set[T]): Unit = {
    def stringify(set: Set[T]): String = {
      val list = set.toList
      val chunk = list take 10
      val strings = chunk.map(_.toString).sorted
      // append an ellipsis only when the set was actually truncated to the first 10 elements
      if (list.size > chunk.size) strings.mkString(", ") + ", ..." else strings.mkString(", ")
    }
    val missing = stringify(expected -- actual)
    val extra = stringify(actual -- expected)
    withClue(s"Missing:\n $missing\nExtra:\n$extra") {
      actual shouldBe expected
    }
  }
} 
Example 3
Source File: NameAssigner.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasInputCols
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Dataset, functions}
import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}


// Replaces numeric indices in the input columns with attribute names taken from
// the columns' ML attribute metadata.
class NameAssigner(override val uid: String) extends Transformer with HasInputCols {

  def setInputCols(column: String*) : this.type = set(inputCols, column.toArray)

  def this() = this(Identifiable.randomUID("NameAssigner"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    $(inputCols).foldLeft(dataset.toDF)((data, column) => {
      val metadata: Metadata = dataset.schema(column).metadata
      val attributes = AttributeGroup.fromStructField(
        StructField(column, new VectorUDT, nullable = false, metadata = metadata))

      val map = attributes.attributes
        .map(arr => arr.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
        .getOrElse(Map())

      val func = functions.udf[String, Number](x => if(x == null) {
        null
      } else {
        val i = x.intValue()
        map.getOrElse(i, i.toString)
      })

      data.withColumn(column, func(data(column)).as(column, metadata))
    }).toDF
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.map(f => if ($(inputCols).contains(f.name)) {
      StructField(f.name, StringType, f.nullable, f.metadata)
    } else {
      f
    }))
} 
Example 4
Source File: URLElimminator.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.standard.UAX29URLEmailTokenizer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}


class URLElimminator(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol with DefaultParamsWritable {
  // Class header reconstructed: the listing omits the original declaration and the
  // filterTextUDF definition, which strips tokens that UAX29URLEmailTokenizer tags as URLs.

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("URLEliminator"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), filterTextUDF(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), StringType)
    } else {
      schema
    }
  }
}

object URLElimminator extends DefaultParamsReadable[URLElimminator] {
  override def load(path: String): URLElimminator = super.load(path)
} 
Example 5
Source File: RegexpReplaceTransformer.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{StringType, StructType}


class RegexpReplaceTransformer(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol with DefaultParamsWritable {
  // Class header and the two regexp params reconstructed; the listing truncates them.

  val regexpPattern = new Param[String](this, "regexpPattern", "the regexp pattern to search for")

  val regexpReplacement = new Param[String](this, "regexpReplacement", "the replacement string")

  def setInputCol(value: String): this.type = set(inputCol, value)

  def this() = this(Identifiable.randomUID("RegexpReplaceTransformer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), regexp_replace(dataset.col($(inputCol)), $(regexpPattern), $(regexpReplacement)))
  }
  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputCol)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), StringType)
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), StringType)
    }
  }

}

object RegexpReplaceTransformer extends DefaultParamsReadable[RegexpReplaceTransformer] {
  override def load(path: String): RegexpReplaceTransformer = super.load(path)
} 
Example 6
Source File: NGramExtractor.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl.texts

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamPair, ParamValidators, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


class NGramExtractor(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol with DefaultParamsWritable {
  // Class header, constructor and n-gram bound params reconstructed; the listing truncates
  // them. NGramUtils.nGramFun comes from the same package.

  val lowerN = new IntParam(this, "lowerN", "lower bound of n-gram sizes", ParamValidators.gtEq(1))

  val upperN = new IntParam(this, "upperN", "upper bound of n-gram sizes", ParamValidators.gtEq(1))

  def this() = this(Identifiable.randomUID("nGramExtractor"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(new ParamPair[Int](upperN, 2), new ParamPair[Int](lowerN, 1))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val lowerBound = $(lowerN)
    val upperBound = $(upperN)
    val nGramUDF = udf[Seq[String], Seq[String]](NGramUtils.nGramFun(_,lowerBound,upperBound))
    dataset.withColumn($(outputCol), nGramUDF(dataset.col($(inputCol))))
  }


  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputCol) != $(outputCol)) {
      schema.add($(outputCol), new ArrayType(StringType, true))
    } else {
      schema
    }
  }
}
object NGramExtractor extends DefaultParamsReadable[NGramExtractor] {
  override def load(path: String): NGramExtractor = super.load(path)
} 
Example 7
Source File: LanguageDetectorTransformer.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl.texts

import com.google.common.base.Optional
import com.optimaize.langdetect.LanguageDetector
import com.optimaize.langdetect.i18n.LdLocale
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}

import scala.collection.Map


class LanguageDetectorTransformer(override val uid: String) extends Transformer
  with HasInputCol with HasOutputCol {
  // Class header and params reconstructed (types inferred from usage); the listing also
  // omits the languageDetection UDF, which wraps languageDetectorWrapped.languageDetector.

  val minimalConfidence = new DoubleParam(this, "minimalConfidence", "minimal confidence to accept the detected language")

  val languagePriors = new Param[Seq[(String, Double)]](this, "languagePriors", "prior probabilities for languages")

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def this() = this(Identifiable.randomUID("languageDetector"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), languageDetection(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), StringType)
  }

  @transient object languageDetectorWrapped extends Serializable {
    val languageDetector: LanguageDetector =
      LanguageDetectorUtils.buildLanguageDetector(
        LanguageDetectorUtils.readListLangsBuiltIn(),
        $(minimalConfidence),
        $(languagePriors).toMap)
  }

} 
Example 8
Source File: LanguageAwareAnalyzer.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl.texts

import org.apache.lucene.analysis.util.StopwordAnalyzerBase
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.HasOutputCol
import org.apache.spark.ml.param.{Param, ParamMap, Params}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{ArrayType, StringType, StructType}


class LanguageAwareAnalyzer(override val uid: String) extends Transformer
  with HasOutputCol with DefaultParamsWritable {
  // Class header and input column params reconstructed (types inferred from usage); the
  // listing also omits stemmTextUDF, which runs a language-specific Lucene analyzer.

  val inputColLang = new Param[String](this, "inputColLang", "column with the detected language")

  val inputColText = new Param[String](this, "inputColText", "column with the text to analyze")

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  def this() = this(Identifiable.randomUID("languageAnalyzer"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.withColumn($(outputCol), stemmTextUDF(dataset.col($(inputColLang)), dataset.col($(inputColText)))).toDF
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    if ($(inputColText) equals $(outputCol)) {
      val schemaWithoutInput = new StructType(schema.fields.filterNot(_.name equals $(inputColText)))
      SchemaUtils.appendColumn(schemaWithoutInput, $(outputCol), ArrayType(StringType, true))
    } else {
      SchemaUtils.appendColumn(schema, $(outputCol), ArrayType(StringType, true))
    }
  }

}

object LanguageAwareAnalyzer extends DefaultParamsReadable[LanguageAwareAnalyzer] {
  override def load(path: String): LanguageAwareAnalyzer = super.load(path)
} 
Example 9
Source File: ParamGridBuilderSuite.scala    From spark1.52   with Apache License 2.0    5 votes
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {//参数网格生成器
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m => // m is a ParamMap
        // e.g. (10, input0), (10, input1)
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }
    // addGrid registers the parameters whose best values we want to search for;
    // ParamGridBuilder builds the candidate combinations (e.g. logistic regression's regParam)
    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    // expected values
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)
    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20)) // overrides the baseOn value
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
Example 10
Source File: RegressionEvaluator.scala    From spark1.52   with Apache License 2.0    5 votes
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType


@Experimental
final class RegressionEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol {
  // Class header, metricName param and the remaining setters reconstructed from
  // Spark 1.5.x; the listing truncates them.

  def this() = this(Identifiable.randomUID("regEval"))

  val metricName: Param[String] = {
    val allowedParams = ParamValidators.inArray(Array("mse", "rmse", "r2", "mae"))
    new Param(this, "metricName", "metric name in evaluation (mse|rmse|r2|mae)", allowedParams)
  }

  def setMetricName(value: String): this.type = set(metricName, value)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  def setLabelCol(value: String): this.type = set(labelCol, value)
  // default metric: root mean squared error
  setDefault(metricName -> "rmse")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) =>
        (prediction, label)
      }     
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      // root mean squared error
      case "rmse" => metrics.rootMeanSquaredError
      // mean squared error
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      // mean absolute error
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false // root mean squared error
    case "mse" => false // mean squared error
    case "r2" => true // coefficient of determination
    case "mae" => false // mean absolute error
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
} 
Example 11
Source File: MulticlassClassificationEvaluator.scala    From spark1.52   with Apache License 2.0    5 votes
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType


@Experimental
class MulticlassClassificationEvaluator(override val uid: String)
  extends Evaluator with HasPredictionCol with HasLabelCol {
  // Class header, metricName param and the remaining setters reconstructed from
  // Spark 1.5.x; the listing truncates them.

  def this() = this(Identifiable.randomUID("mcEval"))

  val metricName: Param[String] = {
    val allowedParams = ParamValidators.inArray(
      Array("f1", "precision", "recall", "weightedPrecision", "weightedRecall"))
    new Param(this, "metricName", "metric name in evaluation " +
      "(f1|precision|recall|weightedPrecision|weightedRecall)", allowedParams)
  }

  def setMetricName(value: String): this.type = set(metricName, value)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)

  def setLabelCol(value: String): this.type = set(labelCol, value)
  // F1 combines precision and recall into a single score; it is the default metric
  setDefault(metricName -> "f1")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) =>
        (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      // F1 combines precision and recall into a single score
      case "f1" => metrics.weightedFMeasure
      case "precision" => metrics.precision
      case "recall" => metrics.recall
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
    }
    metric
  }

  // all supported metrics are larger-is-better
  override def isLargerBetter: Boolean = $(metricName) match {
    case "f1" => true
    case "precision" => true
    case "recall" => true
    case "weightedPrecision" => true
    case "weightedRecall" => true
  }

  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
} 
Example 12
Source File: HashingTF.scala    From spark1.52   with Apache License 2.0    5 votes
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


@Experimental
class HashingTF(override val uid: String) extends Transformer with HasInputCol with HasOutputCol {
  // Class header, constructor, numFeatures param and column setters reconstructed from
  // Spark 1.5.x; the listing truncates them.

  def this() = this(Identifiable.randomUID("hashingTF"))

  val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)",
    ParamValidators.gt(0))

  setDefault(numFeatures -> (1 << 18))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
Example 13
Source File: OpRegressionEvaluatorTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.evaluators

import com.salesforce.op.features.types._
import com.salesforce.op.stages.impl.classification.OpLogisticRegression
import com.salesforce.op.stages.impl.regression.{OpLinearRegression, RegressionModelSelector}
import com.salesforce.op.stages.impl.selector.ModelSelectorNames.EstimatorType
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.ParamGridBuilder
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpRegressionEvaluatorTest extends FlatSpec with TestSparkContext {

  val (ds, rawLabel, features) = TestFeatureBuilder[RealNN, OPVector](
    Seq(
      (10.0, Vectors.dense(1.0, 4.3, 1.3)),
      (20.0, Vectors.dense(2.0, 0.3, 0.1)),
      (30.0, Vectors.dense(3.0, 3.9, 4.3)),
      (40.0, Vectors.dense(4.0, 1.3, 0.9)),
      (50.0, Vectors.dense(5.0, 4.7, 1.3)),
      (10.0, Vectors.dense(1.0, 4.3, 1.3)),
      (20.0, Vectors.dense(2.0, 0.3, 0.1)),
      (30.0, Vectors.dense(3.0, 3.9, 4.3)),
      (40.0, Vectors.dense(4.0, 1.3, 0.9)),
      (50.0, Vectors.dense(5.0, 4.7, 1.3))
    ).map(v => v._1.toRealNN -> v._2.toOPVector)
  )

  val label = rawLabel.copy(isResponse = true)

  val lr = new OpLogisticRegression()
  val lrParams = new ParamGridBuilder().addGrid(lr.regParam, Array(0.0)).build()

  val testEstimator = RegressionModelSelector.withTrainValidationSplit(dataSplitter = None, trainRatio = 0.5,
    modelsAndParameters = Seq(lr -> lrParams))
    .setInput(label, features)

  val prediction = testEstimator.getOutput()
  val testEvaluator = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction)

  val testEstimator2 = new OpLinearRegression().setInput(label, features)

  val prediction2 = testEstimator2.getOutput()
  val testEvaluator2 = new OpRegressionEvaluator().setLabelCol(label).setPredictionCol(prediction2)


  Spec[OpRegressionEvaluator] should "copy" in {
    val testEvaluatorCopy = testEvaluator.copy(ParamMap())
    testEvaluatorCopy.uid shouldBe testEvaluator.uid
  }

  it should "evaluate the metrics from a model selector" in {
    val model = testEstimator.fit(ds)
    val transformedData = model.setInput(label, features).transform(ds)
    val metrics = testEvaluator.evaluateAll(transformedData).toMetadata()

    assert(metrics.getDouble(RegressionEvalMetrics.RootMeanSquaredError.toString) <= 1E-12, "rmse should be close to 0")
    assert(metrics.getDouble(RegressionEvalMetrics.MeanSquaredError.toString) <= 1E-24, "mse should be close to 0")
    assert(metrics.getDouble(RegressionEvalMetrics.R2.toString) == 1.0, "R2 should equal 1.0")
    assert(metrics.getDouble(RegressionEvalMetrics.MeanAbsoluteError.toString) <= 1E-12, "mae should be close to 0")
  }

  it should "evaluate the metrics from a single model" in {
    val model = testEstimator2.fit(ds)
    val transformedData = model.setInput(label, features).transform(ds)
    val metrics = testEvaluator2.evaluateAll(transformedData).toMetadata()

    assert(metrics.getDouble(RegressionEvalMetrics.RootMeanSquaredError.toString) <= 1E-12, "rmse should be close to 0")
    assert(metrics.getDouble(RegressionEvalMetrics.MeanSquaredError.toString) <= 1E-24, "mse should be close to 0")
    assert(metrics.getDouble(RegressionEvalMetrics.R2.toString) == 1.0, "R2 should equal 1.0")
    assert(metrics.getDouble(RegressionEvalMetrics.MeanAbsoluteError.toString) <= 1E-12, "mae should be close to 0")
  }
} 
Example 14
Source File: RegressionEvaluator.scala    From pravda-ml   with Apache License 2.0    5 votes
package org.apache.spark.ml.odkl

// Imports restored (the listing omits them); the Evaluator base class and SparkSqlUtils
// live in this same org.apache.spark.ml.odkl package.
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{DoubleType, StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.util.Try

class RegressionEvaluator(override val uid: String) extends Evaluator[RegressionEvaluator](uid) {

  val throughOrigin = new BooleanParam(this, "throughOrigin",
    "True if the regression is through the origin. For example, in " +
      "linear regression, it will be true without fitting intercept.")

  def setThroughOrigin(value: Boolean): this.type = set(throughOrigin, value)

  def getThroughOrigin: Boolean = $(throughOrigin)

  def this() = this(Identifiable.randomUID("regressionEvaluator"))


  override def transform(dataset: Dataset[_]): DataFrame = {

    try {
      val predictions: RDD[(Double, Double)] = dataset.select($(predictionCol), $(labelCol))
        .rdd.map { case Row(score: Double, label: Double) => (score, label) }

      val metrics = Try(new RegressionMetrics(predictions))


      val rows = metrics.toOption.map(m => Seq(
        "r2" -> m.r2,
        "rmse" -> m.rootMeanSquaredError,
        "explainedVariance" -> m.explainedVariance,
        "meanAbsoluteError" -> m.meanAbsoluteError,
        "meanSquaredError" -> m.meanSquaredError
      ).map(Row.fromTuple)).getOrElse(Seq())

      SparkSqlUtils.reflectionLock.synchronized(
        dataset.sqlContext.createDataFrame(
          dataset.sparkSession.sparkContext.parallelize(rows, 1), transformSchema(dataset.schema)))
    } catch {
      // Most probably evaluation dataset is empty
      case e: Exception =>
        logWarning("Failed to calculate metrics due to " + e.getMessage)
        SparkSqlUtils.reflectionLock.synchronized(
          dataset.sqlContext.createDataFrame(
            dataset.sparkSession.sparkContext.emptyRDD[Row], transformSchema(dataset.schema)))
    }
  }

  override def copy(extra: ParamMap): RegressionEvaluator = {
    copyValues(new RegressionEvaluator(), extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    new StructType()
      .add("metric", StringType, nullable = false)
      .add("value", DoubleType, nullable = false)
  }
} 
Example 15
Source File: RichParamMap.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.utils.spark

import com.salesforce.op.features.TransientFeature
import org.apache.spark.ml.PipelineStage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.types.StructType

object RichParamMap {

  
  // Implicit class wrapper reconstructed; the listing truncates its declaration.
  implicit class RichParamMap(val params: ParamMap) extends AnyVal {

    def getAsMap(): Map[String, Any] = {
      val mapped = params.toSeq.map(pp => pp.param.name -> pp.value).toMap
      mapped.map {
        case (k, v: Array[_]) =>
          if (v.headOption.exists(_.isInstanceOf[TransientFeature])) {
            k -> v.map(_.asInstanceOf[TransientFeature].toJsonString()).toList
          } else k -> v.toList
        case (k, v: StructType) => k -> v.toString()
        case (k, v: PipelineStage) => k -> v.getClass.getName
        case (k, Some(v: PipelineStage)) => k -> v.getClass.getName
        case (k, v) => k -> v
      }
    }
  }

} 
Example 16
Source File: ModelSelectorFactory.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.impl.selector

import com.salesforce.op.evaluators.{EvaluationMetrics, OpEvaluatorBase}
import com.salesforce.op.stages.impl.ModelsToTry
import com.salesforce.op.stages.impl.selector.ModelSelectorNames.{EstimatorType, ModelType}
import com.salesforce.op.stages.impl.tuning.{OpValidator, Splitter, ValidatorParamDefaults}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.ParamGridBuilder

import scala.concurrent.duration.Duration


trait ModelSelectorFactory {
  // Trait declaration reconstructed; the listing truncates it. ModelDefaults (defined
  // alongside this factory) supplies default models and parameter grids per model type.

  protected def selector(
    validator: OpValidator[ModelType, EstimatorType],
    splitter: Option[Splitter],
    trainTestEvaluators: Seq[OpEvaluatorBase[_ <: EvaluationMetrics]],
    modelTypesToUse: Seq[ModelsToTry],
    modelsAndParameters: Seq[(EstimatorType, Array[ParamMap])],
    modelDefaults: ModelDefaults[_ <: ModelsToTry]
  ): ModelSelector[ModelType, EstimatorType] = {
    val modelTypeNames = modelTypesToUse.map(_.entryName).toSet
    val modelsToUse = {
      // if no models are specified use the defaults and filter by the named models to use
      if (modelsAndParameters.isEmpty) {
        modelDefaults.modelsAndParams
          .collect { case (e, grid) if modelTypeNames(e.getClass.getSimpleName) => e -> grid.build() }
      }
      // if models to use has been specified and the models have been specified - filter the models by the names
      else if (modelTypesToUse.toSet != modelDefaults.modelTypesToUse.toSet) {
        modelsAndParameters.filter { case (e, p) => modelTypeNames(e.getClass.getSimpleName) }
      }
      // else just use the specified models
      else modelsAndParameters
    }

    new ModelSelector(
      validator = validator,
      splitter = splitter,
      models = modelsToUse,
      evaluators = trainTestEvaluators
    )
  }

} 
Example 17
Source File: OpTransformerWrapper.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryTransformer
import org.apache.spark.ml.SparkMLSharedParamConstants
import org.apache.spark.ml.SparkMLSharedParamConstants.InOutTransformer
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag


class OpTransformerWrapper[I <: FeatureType, O <: FeatureType, T <: InOutTransformer]
(
  val transformer: T,
  uid: String = UID[OpTransformerWrapper[I, O, T]]
)(
  implicit tti: TypeTag[I],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwUnaryTransformer[I, O, T](
  inputParamName = SparkMLSharedParamConstants.InputColName,
  outputParamName = SparkMLSharedParamConstants.OutputColName,
  operationName = transformer.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying transformer outside the wrapper
  sparkMlStageIn = Option(transformer).map(_.copy(ParamMap.empty).asInstanceOf[T]),
  uid = uid
) 
Example 18
Source File: OpEstimatorWrapper.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwUnaryEstimator
import org.apache.spark.ml._
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag


class OpEstimatorWrapper[I <: FeatureType, O <: FeatureType, E <: Estimator[M], M <: Model[M]]
(
  val estimator: E,
  uid: String = UID[OpEstimatorWrapper[I, O, E, M]]
)(
  implicit tti: TypeTag[I],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwUnaryEstimator[I, O, M, E](
  inputParamName = SparkMLSharedParamConstants.InputColName,
  outputParamName = SparkMLSharedParamConstants.OutputColName,
  operationName = estimator.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying classifier outside the wrapper
  sparkMlStageIn = Option(estimator).map(_.copy(ParamMap.empty).asInstanceOf[E]),
  uid = uid
) 
Example 19
Source File: OpBinaryEstimatorWrapper.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.UID
import com.salesforce.op.features.types.FeatureType
import com.salesforce.op.stages.sparkwrappers.generic.SwBinaryEstimator
import org.apache.spark.ml._
import org.apache.spark.ml.param.ParamMap

import scala.reflect.runtime.universe.TypeTag


class OpBinaryEstimatorWrapper[I1 <: FeatureType, I2 <: FeatureType, O <: FeatureType, E <: Estimator[M], M <: Model[M]]
(
  val estimator: E,
  uid: String = UID[OpBinaryEstimatorWrapper[I1, I2, O, E, M]]
)(implicit tti1: TypeTag[I1],
  tti2: TypeTag[I2],
  tto: TypeTag[O],
  ttov: TypeTag[O#Value]
) extends SwBinaryEstimator[I1, I2, O, M, E](
  inputParam1Name = SparkMLSharedParamConstants.LabelColName,
  inputParam2Name = SparkMLSharedParamConstants.FeaturesColName,
  outputParamName = SparkMLSharedParamConstants.PredictionColName,
  operationName = estimator.getClass.getSimpleName,
  // cloning below to prevent parameter changes to the underlying classifier outside the wrapper
  sparkMlStageIn = Option(estimator).map(_.copy(ParamMap.empty).asInstanceOf[E]),
  uid = uid
) {
  final protected def getSparkStage: E = getSparkMlStage().get
} 
Example 20
Source File: UnaryEstimatorTest.scala    From TransmogrifAI   with BSD 3-Clause "New" or "Revised" License    5 votes
package com.salesforce.op.stages.base.unary

import com.salesforce.op.UID
import com.salesforce.op.features.Feature
import com.salesforce.op.features.types._
import com.salesforce.op.test.{OpEstimatorSpec, TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{DoubleType, MetadataBuilder, StructField, StructType}
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner


@RunWith(classOf[JUnitRunner])
class UnaryEstimatorTest extends OpEstimatorSpec[Real, UnaryModel[Real, Real], UnaryEstimator[Real, Real]] {

  
  // Test fixture reconstructed: the listing truncates inputData and estimator. The expected
  // output matches min-max normalisation of the assumed inputs 1.0, 5.0, 3.0, 2.0, 6.0.
  val (inputData, f1) = TestFeatureBuilder(Seq(1.0, 5.0, 3.0, 2.0, 6.0).map(_.toReal))

  val estimator = new MinMaxNormEstimator().setInput(f1)

  val expectedResult = Seq(0.0, 0.8, 0.4, 0.2, 1.0).map(_.toReal)

}

class MinMaxNormEstimator(uid: String = UID[MinMaxNormEstimator])
  extends UnaryEstimator[Real, Real](operationName = "minMaxNorm", uid = uid) {

  def fitFn(dataset: Dataset[Real#Value]): UnaryModel[Real, Real] = {
    val grouped = dataset.groupBy()
    val maxVal = grouped.max().first().getDouble(0)
    val minVal = grouped.min().first().getDouble(0)
    new MinMaxNormEstimatorModel(min = minVal, max = maxVal, operationName = operationName, uid = uid)
  }
}

final class MinMaxNormEstimatorModel private[op](val min: Double, val max: Double, operationName: String, uid: String)
  extends UnaryModel[Real, Real](operationName = operationName, uid = uid) {
  def transformFn: Real => Real = _.v.map(v => (v - min) / (max - min)).toReal
} 
Example 21
Source File: EstimatorWrapper.scala    From seahorse   with Apache License 2.0    5 votes
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.{Transformer, Estimator}
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
} 
Example 22
Source File: EvaluatorWrapper.scala    From seahorse   with Apache License 2.0    5 votes
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Evaluator
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
} 
Example 23
Source File: TransformerWrapper.scala    From seahorse   with Apache License 2.0    5 votes
package ai.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.deeplang.ExecutionContext
import ai.deepsense.deeplang.doperables.Transformer
import ai.deepsense.deeplang.doperables.dataframe.DataFrame
import ai.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import ai.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
} 
Example 24
Source File: SerializableSparkModel.scala    From seahorse   with Apache License 2.0    5 votes
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by Spark's deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 25
Source File: SerializableSparkEstimator.scala    From seahorse   with Apache License 2.0    5 votes
package ai.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import ai.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 26
Source File: ParamGridBuilderSuite.scala    From iolap   with Apache License 2.0    5 votes
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
Example 27
Source File: LogisticRegressionRecommender.scala    From wordpress-posts-recommender   with Apache License 2.0    5 votes
package wordpressworkshop

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

case class LogisticRegressionRecommender(training: DataFrame) {

  val lr = new LogisticRegression()
  val paramMap = ParamMap(lr.maxIter -> 20)
                 .put(lr.regParam -> 0.01)
                 .put(lr.probabilityCol -> "probability")

  val model: LogisticRegressionModel = lr.fit(training, paramMap)

  def metrics(testData: DataFrame) = {
    val predictionAndLabels: RDD[(Double, Double)] =
      model.transform(testData).map(row => row.getAs[Vector]("probability")(1) -> row.getAs[Double]("label"))

    new BinaryClassificationMetrics(predictionAndLabels)
  }

  def likeScores(testData: DataFrame): RDD[(Long, Long, Double)] =
    model.transform(testData)
    .map(row => (row.getAs[Long]("userId"), row.getAs[Long]("postId"), row.getAs[Vector]("probability")(1)))
} 
Example 28
Source File: EstimatorWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0    5 votes
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.StructType
import org.apache.spark.{ml, sql}

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.{Transformer, Estimator}
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EstimatorWrapper(
    executionContext: ExecutionContext,
    estimator: Estimator[Transformer])
  extends ML.Estimator[TransformerWrapper] {

  override def fitDF(dataset: sql.DataFrame): TransformerWrapper = {
    new TransformerWrapper(
      executionContext,
      estimator._fit(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF())))
  }

  override def copy(extra: ParamMap): EstimatorWrapper = {
    val params = ParamTransformer.transform(extra)
    val estimatorCopy = estimator.replicate().set(params: _*)
    new EstimatorWrapper(executionContext, estimatorCopy)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override lazy val params: Array[ml.param.Param[_]] = {
    estimator.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("EstimatorWrapper")
} 
Example 29
Source File: EvaluatorWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0    5 votes
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.evaluation
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Evaluator
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class EvaluatorWrapper(
    context: ExecutionContext,
    evaluator: Evaluator)
  extends ML.Evaluator {

  override def evaluateDF(dataset: sql.DataFrame): Double = {
    evaluator.evaluate(context)(())(DataFrame.fromSparkDataFrame(dataset.toDF())).value
  }

  override def copy(extra: ParamMap): evaluation.Evaluator = {
    val params = ParamTransformer.transform(extra)
    val evaluatorCopy = evaluator.replicate().set(params: _*)
    new EvaluatorWrapper(context, evaluatorCopy)
  }

  override lazy val params: Array[Param[_]] = {
    evaluator.params.map(new ParamWrapper(uid, _))
  }

  override def isLargerBetter: Boolean = evaluator.isLargerBetter

  override val uid: String = Identifiable.randomUID("EvaluatorWrapper")
} 
Example 30
Source File: TransformerWrapper.scala    From seahorse-workflow-executor   with Apache License 2.0    5 votes
package io.deepsense.deeplang.doperables.wrappers

import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.deeplang.ExecutionContext
import io.deepsense.deeplang.doperables.Transformer
import io.deepsense.deeplang.doperables.dataframe.DataFrame
import io.deepsense.deeplang.params.wrappers.deeplang.ParamWrapper
import io.deepsense.sparkutils.ML

class TransformerWrapper(
    executionContext: ExecutionContext,
    transformer: Transformer)
  extends ML.Model[TransformerWrapper] {

  override def copy(extra: ParamMap): TransformerWrapper = {
    val params = ParamTransformer.transform(extra)
    val transformerCopy = transformer.replicate().set(params: _*)
    new TransformerWrapper(executionContext, transformerCopy)
  }

  override def transformDF(dataset: sql.DataFrame): sql.DataFrame = {
    transformer._transform(executionContext, DataFrame.fromSparkDataFrame(dataset.toDF()))
      .sparkDataFrame
  }

  override def transformSchema(schema: StructType): StructType = {
    transformer._transformSchema(schema).get
  }

  override lazy val params: Array[Param[_]] = {
    transformer.params.map(new ParamWrapper(uid, _))
  }

  override val uid: String = Identifiable.randomUID("TransformerWrapper")
} 
Example 31
Source File: SerializableSparkModel.scala    From seahorse-workflow-executor   with Apache License 2.0    5 votes
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.Model
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkModel[M <: Model[M]](val sparkModel: M)
  extends ML.Model[SerializableSparkModel[M]]
  with MLWritable {

  override def copy(extra: ParamMap): SerializableSparkModel[M] =
    new SerializableSparkModel(sparkModel.copy(extra))

  override def write: MLWriter = {
    sparkModel match {
      case w: MLWritable => w.write
      case _ => new DefaultMLWriter(this)
    }
  }

  override def transformDF(dataset: DataFrame): DataFrame = sparkModel.transform(dataset)

  override def transformSchema(schema: StructType): StructType = sparkModel.transformSchema(schema)

  override val uid: String = "dc7178fe-b209-44f5-8a74-d3c4dafa0fae"
}

// This class may seem unused, but it is used reflectively by Spark's deserialization mechanism
object SerializableSparkModel extends MLReadable[SerializableSparkModel[_]] {
  override def read: MLReader[SerializableSparkModel[_]] = {
    new DefaultMLReader[SerializableSparkModel[_]]()
  }
} 
Example 32
Source File: SerializableSparkEstimator.scala    From seahorse-workflow-executor   with Apache License 2.0    5 votes
package io.deepsense.deeplang.doperables.serialization

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.sql
import org.apache.spark.sql.types.StructType

import io.deepsense.sparkutils.ML

class SerializableSparkEstimator[T <: Model[T], E <: Estimator[T]](val sparkEstimator: E)
  extends ML.Estimator[SerializableSparkModel[T]]
  with MLWritable {

  override val uid: String = "e2a121fe-da6e-4ef2-9c5e-56ee558c14f0"

  override def fitDF(dataset: sql.DataFrame): SerializableSparkModel[T] = {
    val result: T = sparkEstimator.fit(dataset)
    new SerializableSparkModel[T](result)
  }

  override def copy(extra: ParamMap): Estimator[SerializableSparkModel[T]] =
    new SerializableSparkEstimator[T, E](sparkEstimator.copy(extra).asInstanceOf[E])

  override def write: MLWriter = new DefaultMLWriter(this)

  override def transformSchema(schema: StructType): StructType =
    sparkEstimator.transformSchema(schema)
} 
Example 33
Source File: SimpleVectorAssembler.scala    From albedo   with MIT License    5 votes
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row}

import scala.collection.mutable.ArrayBuilder



class SimpleVectorAssembler(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with DefaultParamsWritable {
  // Class header reconstructed; the listing truncates it. This is essentially Spark's
  // VectorAssembler with the column metadata handling stripped out.

  def this() = this(Identifiable.randomUID("simpleVectorAssembler"))

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    val schema = dataset.schema
    val assembleFunc = udf { r: Row =>
      SimpleVectorAssembler.assemble(r.toSeq: _*)
    }
    val args = $(inputCols).map { c =>
      schema(c).dataType match {
        case DoubleType => dataset(c)
        case _: VectorUDT => dataset(c)
        case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid")
      }
    }

    dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputColNames = $(inputCols)
    val outputColName = $(outputCol)
    val inputDataTypes = inputColNames.map(name => schema(name).dataType)
    inputDataTypes.foreach {
      case _: NumericType | BooleanType =>
      case t if t.isInstanceOf[VectorUDT] =>
      case other =>
        throw new IllegalArgumentException(s"Data type $other is not supported.")
    }
    if (schema.fieldNames.contains(outputColName)) {
      throw new IllegalArgumentException(s"Output column $outputColName already exists.")
    }
    StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true))
  }

  override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra)
}

object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] {
  override def load(path: String): SimpleVectorAssembler = super.load(path)

  def assemble(vv: Any*): Vector = {
    val indices = ArrayBuilder.make[Int]
    val values = ArrayBuilder.make[Double]
    var cur = 0
    vv.foreach {
      case v: Double =>
        if (v != 0.0) {
          indices += cur
          values += v
        }
        cur += 1
      case vec: Vector =>
        vec.foreachActive { case (i, v) =>
          if (v != 0.0) {
            indices += cur + i
            values += v
          }
        }
        cur += vec.size
      case null =>
        // TODO: output Double.NaN?
        throw new SparkException("Values to assemble cannot be null.")
      case o =>
        throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.")
    }
    Vectors.sparse(cur, indices.result(), values.result()).compressed
  }
} 
Example 34
Source File: UserRepoTransformer.scala    From albedo   with MIT License    5 votes
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._

class UserRepoTransformer(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("userRepoTransformer"))
  }

  val inputCols: StringArrayParam = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)

  override def transformSchema(schema: StructType): StructType = {
    $(inputCols).foreach((inputColName: String) => {
      require(schema.fieldNames.contains(inputColName), s"Input column $inputColName must exist.")
    })

    val newFields: Array[StructField] = Array(
      StructField("repo_language_index_in_user_recent_repo_languages", IntegerType, nullable = false),
      StructField("repo_language_count_in_user_recent_repo_languages", IntegerType, nullable = false)
    )
    StructType(schema.fields ++ newFields)
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    import dataset.sparkSession.implicits._

    dataset
      .withColumn("repo_language_index_in_user_recent_repo_languages", repoLanguageIndexInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
      .withColumn("repo_language_count_in_user_recent_repo_languages", repoLanguageCountInUserRecentRepoLanguagesUDF($"repo_language", $"user_recent_repo_languages"))
  }

  override def copy(extra: ParamMap): UserRepoTransformer = {
    defaultCopy(extra)
  }
}

object UserRepoTransformer extends DefaultParamsReadable[UserRepoTransformer] 
Example 35
Source File: HanLPTokenizer.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import java.util

import com.hankcs.hanlp.HanLP
import com.hankcs.hanlp.dictionary.stopword.CoreStopWordDictionary
import com.hankcs.hanlp.seg.common.Term
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.{BooleanParam, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types._

import scala.collection.JavaConverters._

class HanLPTokenizer(override val uid: String)
  extends UnaryTransformer[String, Seq[String], HanLPTokenizer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("hanLPTokenizer"))
  }

  val shouldRemoveStopWords = new BooleanParam(this, "shouldRemoveStopWords", "Whether to remove stop words")

  def getShouldRemoveStopWords: Boolean = $(shouldRemoveStopWords)

  def setShouldRemoveStopWords(value: Boolean): this.type = set(shouldRemoveStopWords, value)
  setDefault(shouldRemoveStopWords -> true)

  override def createTransformFunc: String => Seq[String] = { originStr =>
    HanLP.Config.ShowTermNature = false
    HanLP.Config.Normalization = false
    val segment = HanLP.newSegment()
    val termList: util.List[Term] = segment.seg(HanLP.convertToSimplifiedChinese(originStr.toLowerCase))

    if ($(shouldRemoveStopWords)) {
      CoreStopWordDictionary.apply(termList)
    }

    val LanguageRE = """(c|r|c\+\+|c#|f#)""".r
    val OneCharExceptCJKRE = """([^\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}])""".r
    termList
      .asScala
      .flatMap((term: Term) => {
        val word = term.word
        word match {
          case LanguageRE(language) => Array(language)
          case OneCharExceptCJKRE(_) => Array.empty[String]
          case _ => """([\w\.\-_\p{InHiragana}\p{InKatakana}\p{InBopomofo}\p{InCJKCompatibilityIdeographs}\p{InCJKUnifiedIdeographs}]+)""".r.findAllIn(word).toList
        }
      })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == StringType, s"Input type must be string type but got $inputType.")
  }

  override def outputDataType: DataType = {
    new ArrayType(StringType, false)
  }

  override def copy(extra: ParamMap): HanLPTokenizer = {
    defaultCopy(extra)
  }
}

object HanLPTokenizer extends DefaultParamsReadable[HanLPTokenizer] 
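A minimal usage sketch (setInputCol and setOutputCol come from UnaryTransformer; the sentence below is toy data):

import org.apache.spark.sql.SparkSession

object HanLPTokenizerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq("Spark是一個快速的集群計算系統, and it speaks c++ and scala").toDF("text")

    val tokenizer = new HanLPTokenizer()
      .setInputCol("text")
      .setOutputCol("tokens")
      .setShouldRemoveStopWords(true)

    // Traditional Chinese is converted to simplified, stop words are dropped,
    // and single-character non-CJK tokens are filtered out (except c/r/c++/c#/f#).
    tokenizer.transform(df).show(truncate = false)
    spark.stop()
  }
}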
Example 36
Source File: SnowballStemmer.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.types.{ArrayType, DataType, StringType}
import org.tartarus.snowball.ext.EnglishStemmer

class SnowballStemmer(override val uid: String)
  extends UnaryTransformer[Seq[String], Seq[String], SnowballStemmer] with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("snowballStemmer"))
  }

  override def createTransformFunc: Seq[String] => Seq[String] = { strings =>
    val stemmer = new EnglishStemmer()

    strings.map((str: String) => {
      try {
        stemmer.setCurrent(str)
        stemmer.stem()
        stemmer.getCurrent()
      } catch {
        case _: Exception => str
      }
    })
  }

  override def validateInputType(inputType: DataType): Unit = {
    require(inputType == ArrayType(StringType), s"Input type must be ArrayType(StringType) but got $inputType.")
  }

  override def outputDataType: DataType = {
    ArrayType(StringType)
  }

  override def copy(extra: ParamMap): SnowballStemmer = {
    defaultCopy(extra)
  }
}

object SnowballStemmer extends DefaultParamsReadable[SnowballStemmer] 
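A minimal usage sketch (the input must already be tokenized, matching the ArrayType(StringType) check above):

import org.apache.spark.sql.SparkSession

object SnowballStemmerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(Seq("running", "jumps", "easily")).toDF("tokens")

    val stemmer = new SnowballStemmer()
      .setInputCol("tokens")
      .setOutputCol("stemmed")

    stemmer.transform(df).show(truncate = false) // e.g. [run, jump, easili]
    spark.stop()
  }
}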
Example 37
Source File: RankingMetricFormatter.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}
import ws.vinta.albedo.closures.UDFs._
import ws.vinta.albedo.evaluators.RankingEvaluator._

class RankingMetricFormatter(override val uid: String, val sourceType: String)
  extends Transformer with DefaultParamsWritable {

  def this(sourceType: String) = {
    this(Identifiable.randomUID("rankingMetricFormatter"), sourceType)
  }

  val userCol = new Param[String](this, "userCol", "User column name")

  def getUserCol: String = $(userCol)

  def setUserCol(value: String): this.type = set(userCol, value)
  setDefault(userCol -> "user")

  val itemCol = new Param[String](this, "itemCol", "Item column name")

  def getItemCol: String = $(itemCol)

  def setItemCol(value: String): this.type = set(itemCol, value)
  setDefault(itemCol -> "item")

  val predictionCol = new Param[String](this, "predictionCol", "Prediction column name")

  def getPredictionCol: String = $(predictionCol)

  def setPredictionCol(value: String): this.type = set(predictionCol, value)
  setDefault(predictionCol -> "prediction")

  val topK = new IntParam(this, "topK", "Recommend top-k items for every user")

  def getTopK: Int = $(topK)

  def setTopK(value: Int): this.type = set(topK, value)
  setDefault(topK -> 15)

  override def transformSchema(schema: StructType): StructType = {
    Map($(userCol) -> IntegerType, $(itemCol) -> IntegerType)
      .foreach { case (columnName: String, expectedDataType: DataType) =>
        val actualDataType = schema(columnName).dataType
        require(actualDataType.equals(expectedDataType),
          s"Column $columnName must be of type $expectedDataType but was actually $actualDataType.")
      }

    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    sourceType match {
      case "als" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), col($(predictionCol)).desc, $(topK)))
      case "lr" =>
        dataset.transform(intoUserPredictedItems(col($(userCol)), col($(itemCol)), toArrayUDF(col($(predictionCol))).getItem(1).desc, $(topK)))
      case _ =>
        throw new IllegalArgumentException(s"Unsupported sourceType: $sourceType.")
    }
  }

  override def copy(extra: ParamMap): RankingMetricFormatter = {
    val copied = new RankingMetricFormatter(uid, sourceType)
    copyValues(copied, extra)
  }
}

object RankingMetricFormatter extends DefaultParamsReadable[RankingMetricFormatter] 
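A minimal usage sketch (alsPredictions is a hypothetical DataFrame of ALS output with integer user/item columns and a prediction column; the actual reshaping is delegated to intoUserPredictedItems from albedo's RankingEvaluator):

val formatter = new RankingMetricFormatter("als")
  .setUserCol("user")
  .setItemCol("item")
  .setPredictionCol("prediction")
  .setTopK(30)

val userPredictedItems = formatter.transform(alsPredictions)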
Example 38
Source File: IntermediateCacher.scala    From albedo   with MIT License 5 votes vote down vote up
package ws.vinta.albedo.transformers

import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{ParamMap, StringArrayParam}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}

class IntermediateCacher(override val uid: String)
  extends Transformer with DefaultParamsWritable {

  def this() = {
    this(Identifiable.randomUID("intermediateCacher"))
  }

  val inputCols = new StringArrayParam(this, "inputCols", "Input column names")

  def getInputCols: Array[String] = $(inputCols)

  def setInputCols(value: Array[String]): this.type = set(inputCols, value)
  setDefault(inputCols -> Array.empty[String])

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema)

    val intermediateDF = if ($(inputCols).isEmpty) dataset.toDF() else dataset.select($(inputCols).map(col(_)): _*)
    intermediateDF.cache()
  }

  override def copy(extra: ParamMap): IntermediateCacher = {
    defaultCopy(extra)
  }
}

object IntermediateCacher extends DefaultParamsReadable[IntermediateCacher] 
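A minimal usage sketch: dropped between stages, it materializes and caches the intermediate DataFrame, optionally projecting to the listed columns first (the surrounding stages are hypothetical):

import org.apache.spark.ml.Pipeline

val cacher = new IntermediateCacher()
  .setInputCols(Array("user_id", "features"))

val pipeline = new Pipeline()
  .setStages(Array(expensiveFeatureStage, cacher, estimator))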
Example 39
Source File: RegressionEvaluator.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.DoubleType


  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) =>
        (prediction, label)
      }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" =>
        -metrics.rootMeanSquaredError
      case "mse" =>
        -metrics.meanSquaredError
      case "r2" =>
        metrics.r2
      case "mae" =>
        -metrics.meanAbsoluteError
    }
    metric
  }

  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
} 
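Note that this older iolap variant negates rmse, mse and mae inside evaluate, so that "larger is better" holds uniformly during model selection (it predates isLargerBetter). A minimal usage sketch, assuming the setters from the shared params elided above and a hypothetical predictions DataFrame:

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
  .setMetricName("rmse")

val score = evaluator.evaluate(predictions) // -RMSE: closer to zero is better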
Example 40
Source File: ParamGridBuilderSuite.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
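The grid built here is the cross product of all addGrid values (2 maxIter values x 2 inputCol values = 4 ParamMaps in maps1), and baseOn entries act as defaults that a grid over the same param overwrites. A hedged sketch of feeding such a grid into model selection (pipeline and evaluator are hypothetical):

import org.apache.spark.ml.tuning.CrossValidator

val cv = new CrossValidator()
  .setEstimator(pipeline)        // hypothetical Pipeline
  .setEvaluator(evaluator)       // hypothetical Evaluator
  .setEstimatorParamMaps(maps1)  // the 4-element grid built above
  .setNumFolds(3)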
Example 41
Source File: RegressionEvaluator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("1.4.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    val predictionColName = $(predictionCol)
    val predictionType = schema($(predictionCol)).dataType
    require(predictionType == FloatType || predictionType == DoubleType,
      s"Prediction column $predictionColName must be of type float or double, " +
        s" but not $predictionType")
    val labelColName = $(labelCol)
    val labelType = schema($(labelCol)).dataType
    require(labelType == FloatType || labelType == DoubleType,
      s"Label column $labelColName must be of type float or double, but not $labelType")

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .map { case Row(prediction: Double, label: Double) =>
        (prediction, label)
      }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
Example 42
Source File: MulticlassClassificationEvaluator.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{ParamMap, ParamValidators, Param}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, SchemaUtils, Identifiable}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("1.5.0")
  override def evaluate(dataset: DataFrame): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkColumnType(schema, $(labelCol), DoubleType)

    val predictionAndLabels = dataset.select($(predictionCol), $(labelCol))
      .map { case Row(prediction: Double, label: Double) =>
      (prediction, label)
    }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "precision" => metrics.precision
      case "recall" => metrics.recall
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "f1" => true
    case "precision" => true
    case "recall" => true
    case "weightedPrecision" => true
    case "weightedRecall" => true
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 43
Source File: SQLTransformer.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.SparkContext
import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.param.{ParamMap, Param}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{SQLContext, DataFrame, Row}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("1.6.0")
  override def transform(dataset: DataFrame): DataFrame = {
    val tableName = Identifiable.randomUID(uid)
    dataset.registerTempTable(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val outputDF = dataset.sqlContext.sql(realStatement)
    outputDF
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val sc = SparkContext.getOrCreate()
    val sqlContext = SQLContext.getOrCreate(sc)
    val dummyRDD = sc.parallelize(Seq(Row.empty))
    val dummyDF = sqlContext.createDataFrame(dummyRDD, schema)
    dummyDF.registerTempTable(tableIdentifier)
    val outputSchema = sqlContext.sql($(statement)).schema
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
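A minimal usage sketch: the __THIS__ placeholder in the statement is replaced by a temp view over the input dataset, so arbitrary SQL projections become pipeline stages:

import org.apache.spark.ml.feature.SQLTransformer
import org.apache.spark.sql.SparkSession

object SQLTransformerDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((0, 1.0, 3.0), (2, 2.0, 5.0)).toDF("id", "v1", "v2")

    val sqlTrans = new SQLTransformer()
      .setStatement("SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    sqlTrans.transform(df).show()
    spark.stop()
  }
}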
Example 44
Source File: HashingTF.scala    From BigDatalog   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.{Since, Experimental}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
}

@Since("1.6.0")
object HashingTF extends DefaultParamsReadable[HashingTF] {

  @Since("1.6.0")
  override def load(path: String): HashingTF = super.load(path)
} 
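A minimal usage sketch: terms are hashed into a fixed-size count vector, so the output dimensionality is set up front rather than learned (Tokenizer is used here only to produce the required array column):

import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
import org.apache.spark.sql.SparkSession

object HashingTFDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("demo").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((0, "spark spark streaming"), (1, "param maps everywhere")).toDF("id", "sentence")

    val tokenizer = new Tokenizer().setInputCol("sentence").setOutputCol("words")
    val hashingTF = new HashingTF()
      .setInputCol("words")
      .setOutputCol("features")
      .setNumFeatures(1 << 10) // 1024 hash buckets

    hashingTF.transform(tokenizer.transform(df)).select("features").show(truncate = false)
    spark.stop()
  }
}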
Example 45
Source File: PredictorSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasWeightCol
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and weights, and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, 1, Vectors.dense(0, 2, 3)),
      (1, 2, Vectors.dense(0, 3, 9)),
      (0, 3, Vectors.dense(0, 2, 6))
    )).toDF("label", "weight", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor().setWeightCol("weight")

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("weight").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("weight"), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label"), col("weight").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] with HasWeightCol {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    def setWeightCol(value: String): this.type = set(weightCol, value)

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      require(dataset.schema("weight").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 46
Source File: ParamGridBuilderSuite.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
Example 47
Source File: RegressionEvaluator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
Example 48
Source File: MulticlassClassificationEvaluator.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 49
Source File: SQLTransformer.scala    From Spark-2.3.1   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    // Call SessionCatalog.dropTempView to avoid unpersisting the possibly cached dataset.
    dataset.sparkSession.sessionState.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
Example 50
Source File: VectorExplode.scala    From pravda-ml   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.odkl


import odkl.analysis.spark.util.collection.OpenHashMap
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.odkl.SparkSqlUtils
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, Row, functions}


class VectorExplode(override val uid: String) extends
  Transformer with DefaultParamsWritable {

  val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.")

  def setValueCol(value: String) : this.type = set(valueCol, value)

  setDefault(valueCol -> "value")


  def this() = this(Identifiable.randomUID("vectorExplode"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT])

    val resultSchema = StructType(Seq(
      StructField($(valueCol), StringType, nullable = false)) ++
      vectors.map(f => StructField(f.name, DoubleType, nullable = true))
    )

    val arraySize = resultSchema.size - 1

    val names: Array[Map[Int, String]] = vectors.map(
      f => {
        AttributeGroup.fromStructField(f).attributes
          .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap)
          .getOrElse(Map())
      })

    val maxCapacity = names.map(_.size).max

    val explodeVectors : (Row => Array[Row]) = (r: Row ) => {
      val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity)

      for(i <- 0 until r.length) {
        val vector = r.getAs[Vector](i)

        vector.foreachActive((index, value) => {
          val name = names(i).getOrElse(index, s"${vectors(i).name}_$index")

          accumulator.changeValue(
            name,
            Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN},
            v => {v(i) = value; v})
        })
      }

      accumulator.map(x => new GenericRowWithSchema(
        (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray,
        resultSchema)).toArray
    }

    val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*)
    val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType)))
        
    val expression = functions.explode(explodeUDF(vectorsStruct))

    dataset
      .withColumn(uid, expression)
      .select(
        dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++
          resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(schema.fields.map(x =>
      x.dataType match {
        case vector: VectorUDT => StructField(x.name, typeFromVector(x))
        case _ => x
      }
    ))

  def typeFromVector(field: StructField): StructType = {
    val attributes = AttributeGroup.fromStructField(field)
    StructType(attributes.attributes
      .map(_.map(a => a.name.getOrElse(s"_${a.index.get}")))
      .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" })
      .map(name => StructField(name, DoubleType, nullable = false)))
  }
} 
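A minimal usage sketch (wideDF is a hypothetical DataFrame with one or more VectorUDT columns carrying ML attribute metadata): each input row becomes one output row per vector slot, named from the attributes or falling back to <column>_<index>:

val explode = new VectorExplode().setValueCol("metric")
val longDF = explode.transform(wideDF)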
Example 51
Source File: CrossValidation.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, Evaluator}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tuning.{CrossValidator, CrossValidatorModel}
import org.apache.spark.ml.{Model, Pipeline, PipelineStage}
import org.apache.spark.sql._



  @throws(classOf[IllegalArgumentException])
  protected def apply(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): CrossValidatorModel = {
    require(stages.nonEmpty, "Cannot cross-validate pipeline without stages")
    require(grid.nonEmpty, "Cannot cross-validate with undefined grid")

    val pipeline = new Pipeline().setStages(stages ++ Array[PipelineStage](estimator))
    new CrossValidator()
      .setEstimator(pipeline)
      .setEstimatorParamMaps(grid)
      .setEvaluator(new BinaryClassificationEvaluator)
      .setNumFolds(numFolds)
      .fit(trainDf)
  }

  protected def evaluate(
    trainDf: DataFrame,
    stages: Array[PipelineStage],
    grid: Array[ParamMap]
  ): Evaluator = this(trainDf, stages, grid).getEvaluator
} 
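A hedged sketch of driving this helper from a concrete subclass (estimator and numFolds belong to the elided class header; lr, indexer, assembler and trainDf are hypothetical):

import org.apache.spark.ml.tuning.ParamGridBuilder

val grid: Array[ParamMap] = new ParamGridBuilder()
  .addGrid(lr.regParam, Array(0.01, 0.1))
  .build()

val cvModel: CrossValidatorModel = this(trainDf, Array(indexer, assembler), grid)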
Example 52
Source File: ArimaBestModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType


class ArimaBestModel[L, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(L, M)],
  val validationMetrics: RDD[(L, Seq[ModelParamEvaluation[L]])]
) extends Model[ArimaBestModel[L, M]]
    with TimeSeriesBestModelFinderParam[L] {

  // TODO: evaluate whether this override is actually needed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)

    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): ArimaBestModel[L, M] = {
    val copied =
      new ArimaBestModel[L, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
Example 53
Source File: XGBoostBigModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml


import com.cloudera.sparkts.models.UberXGBoostModel
import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.feature.{LabeledPoint => SparkLabeledPoint}
import org.apache.spark.ml.param.shared.{HasIdCol, HasLabelCol}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModel[I](val uid: String, val models: Seq[(ParamMap, XGBoostModel)])
    extends ForecastBaseModel[XGBoostBigModel[I]]
    with HasLabelCol
    with HasIdCol {

  def setLabelcol(label: String): this.type = set(labelCol, label)

  def setIdcol(id: String): this.type = set(idCol, id)

  override def copy(extra: ParamMap): XGBoostBigModel[I] = new XGBoostBigModel[I](uid, models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME))
      }
      .join(prediction)
      .map {
        case (id, (features, predictValue)) =>
          Row(id, features, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }

  protected def predict(dataSet: Dataset[_]) = {
    val features = dataSet.rdd.map { case (row: Row) =>
      val features = row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME)
      val id = row.getAs[I]($(idCol))
      SparkLabeledPoint(DataTransformer.toFloat(id), features)
    }.cache
    val (_, model) = models.head
    UberXGBoostModel.labelPredict(features.map(_.features.toDense), booster = model)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(getPredictionSchema)

  protected def getPredictionSchema: Array[StructField] = {
    Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    )
  }
} 
Example 54
Source File: TimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}

import scala.reflect.ClassTag


  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val rdd = dataSet.rdd

    val sparkContext = dataSet.sqlContext.sparkContext
    val index = sparkContext.broadcast(dataSet.schema.fieldIndex($(timeCol).get))
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(groupByCol).get))
    val featuresColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(featuresCol)))
    val grouped = rdd.map { case (row: Row) =>
      val timeColRow =
        IUberdataForecastUtil.convertColumnToLong(row, index.value)
      convertColumnToDouble(timeColRow, featuresColIndex)
    }.groupBy { row =>
      row.getAs[L](labelColIndex.value)
    }.map {
      case (key, values) =>
        val toBeUsed =
          values.toArray.sortBy(row => row.getAs[Long](index.value))
        (key, toBeUsed)
    }

    val toBeTrained = grouped.map {
      case (key, values) =>
        org.apache.spark.sql.Row(
          key,
          Vectors.dense(values.map(_.getAs[Double](featuresColIndex.value)))
        )
    }

    val trainSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(toBeTrained, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    val labelIndex = schema.fieldIndex($(groupByCol).get)
    StructType(
      Seq(
        schema.fields(labelIndex),
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): TimeSeriesGenerator[L] =
    defaultCopy(extra)

}

object TimeSeriesGenerator extends DefaultParamsReadable[TimeSeriesGenerator[_]] {

  override def load(path: String): TimeSeriesGenerator[_] = super.load(path)
} 
Example 55
Source File: XGBoostBaseBestModel.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.{Booster, DMatrix}
import ml.dmlc.xgboost4j.LabeledPoint
import org.apache.spark.SparkContext
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{ArrayType, FloatType, StructField, StructType}


trait BaseXGBoostBestModelFinder[G, M <: org.apache.spark.ml.ForecastBaseModel[M]]
    extends BestModelFinder[G, M]
    with HasGroupByCol {

  protected def buildTrainSchema(sparkContext: SparkContext): Broadcast[StructType] = sparkContext.broadcast {
    StructType(
      Seq(
        StructField($(groupByCol).get, FloatType),
        StructField(IUberdataForecastUtil.FEATURES_COL_NAME, ArrayType(new VectorUDT))))
  }


  protected def xGBoostEvaluation(row: Row,
                                  model: Booster,
                                  broadcastEvaluator: Broadcast[TimeSeriesEvaluator[G]],
                                  id: G,
                                  parameters: ParamMap): ModelParamEvaluation[G] = {
    val featuresArray = row
      .getAs[Array[org.apache.spark.ml.linalg.Vector]](IUberdataForecastUtil.FEATURES_COL_NAME)
      .map { vec =>
        val values = vec.toArray.map(DataTransformer.toFloat)
        LabeledPoint(values.head, null, values.tail)
      }
    val features = new DMatrix(featuresArray.toIterator)
    log.warn(s"Evaluating forecast for id $id, with xgboost")
    val prediction = model.predict(features).flatten
    val (forecastToBeValidated, _) = prediction.splitAt(featuresArray.length)
    val toBeValidated = featuresArray.zip(forecastToBeValidated)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated.map(f =>
      (f._1.label.toDouble, f._2.toDouble)))
    val metricName = broadcastEvaluator.value.getMetricName
    new ModelParamEvaluation[G](
      id,
      metric,
      parameters,
      Some(metricName),
      SupportedAlgorithm.XGBoostAlgorithm)
  }
} 
Example 56
Source File: XGBoost.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import eleflow.uberdata.models.UberXGBOOSTModel
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{ArrayType, DoubleType, StructField, StructType}

import scala.reflect.ClassTag


class XGBoost[I](override val uid: String,
                 val models: RDD[(I, (UberXGBOOSTModel,
                   Seq[(ModelParamEvaluation[I])]))])(
  implicit kt: ClassTag[I],
  ord: Ordering[I] = null)
    extends ForecastBaseModel[XGBoostSmallModel[I]]
    with HasInputCol
    with HasOutputCol
    with DefaultParamsWritable
    with HasFeaturesCol
    with HasNFutures
    with HasGroupByCol {

  def this(
    models: RDD[(I, (UberXGBOOSTModel, Seq[(ModelParamEvaluation[I])]))]
  )(implicit kt: ClassTag[I], ord: Ordering[I] ) =
    this(Identifiable.randomUID("xgboost"), models)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val schema = dataSet.schema
    val predSchema = transformSchema(schema)
    val joined = models.join(dataSet.rdd.map{case (r: Row) => (r.getAs[I]($(groupByCol).get), r)})

    val predictions = joined.map {
      case (id, ((bestModel, metrics), row)) =>
        val features = row.getAs[Array[org.apache.spark.ml.linalg.Vector]](
          IUberdataForecastUtil.FEATURES_COL_NAME
        )
        val label = DataTransformer.toFloat(row.getAs($(featuresCol)))
        val labelPoint = features.map { vec =>
          val array = vec.toArray.map(_.toFloat)
          LabeledPoint(label, null, array)
        }
        val matrix = new DMatrix(labelPoint.toIterator)
        val (ownFeaturesPrediction, forecast) = bestModel.boosterInstance
          .predict(matrix)
          .flatMap(_.map(_.toDouble))
          .splitAt(features.length)
        Row(
          row.toSeq :+ Vectors
            .dense(forecast) :+ SupportedAlgorithm.XGBoostAlgorithm.toString :+ bestModel.params
            .map(f => f._1 -> f._2.toString) :+ Vectors.dense(ownFeaturesPrediction): _*
        )
    }
    dataSet.sqlContext.createDataFrame(predictions, predSchema)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): XGBoostSmallModel[I] = defaultCopy(extra)
} 
Example 57
Source File: PredictorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(0, 2, 3)),
      (1, Vectors.dense(0, 3, 9)),
      (0, Vectors.dense(0, 2, 6))
    )).toDF("label", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor()

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 58
Source File: ParamGridBuilderSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
Example 59
Source File: RegressionEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
Example 60
Source File: MulticlassClassificationEvaluator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 61
Source File: SQLTransformer.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
Example 62
Source File: GaussianProcessRegression.scala    From spark-gp   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.regression

import breeze.linalg.{DenseVector => BDV, _}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.commons._
import org.apache.spark.ml.commons.kernel.Kernel
import org.apache.spark.ml.commons.util._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, Instrumentation}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset


class GaussianProcessRegression(override val uid: String)
  extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel]
    with GaussianProcessParams
    with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging {

  def this() = this(Identifiable.randomUID("gaussProcessReg"))

  override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = {
    val instr = Instrumentation.create(this, dataset)

    val points: RDD[LabeledPoint] = getPoints(dataset).cache()

    val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache()

    val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient)

    expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters))

    produceModel(instr,
      points, expertLabelsAndKernels, optimalHyperparameters)
  }

  private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = {
    val (y: BDV[Double], kernel : Kernel) = yAndK
    kernel.setHyperparameters(x)
    val (k, derivative) = kernel.trainingKernelAndDerivative()
    val (_, logdet, kinv) = logDetAndInv(k)
    val alpha = kinv * y
    val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet

    val alphaAlphaTMinusKinv = alpha * alpha.t
    alphaAlphaTMinusKinv -= kinv

    val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv))
    (likelihood, BDV(gradient:_*))
  }

  override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra)

  override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor)
}

class GaussianProcessRegressionModel private[regression](override val uid: String,
          private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor)
  extends RegressionModel[Vector, GaussianProcessRegressionModel] {

  override protected def predict(features: Vector): Double = {
    gaussianProjectedProcessRawPredictor.predict(features)._1
  }

  override def copy(extra: ParamMap): GaussianProcessRegressionModel = {
    val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra)
    newModel.setParent(parent)
  }
} 
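A minimal fitting sketch for the estimator above, assuming train/test DataFrames with the conventional "features" and "label" columns; the setters come from the Predictor/Regressor base classes, and kernel configuration (from GaussianProcessParams) is left at its defaults.

val gp = new GaussianProcessRegression()
  .setFeaturesCol("features")
  .setLabelCol("label")
val model = gp.fit(train)          // returns a GaussianProcessRegressionModel
val predictions = model.transform(test)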
Example 63
Source File: MLPipelineTest.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License 5 votes vote down vote up
package org.scalaml.spark.mlpipeline

import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.param.ParamMap
import org.scalaml.Logging
import org.scalaml.spark.ResourcesLoader
import org.scalatest.{FlatSpec, Matchers}

final class MLPipelineTest extends FlatSpec with Matchers with Logging {
  protected val name = "Spark ML pipeline"

  final val trainFile = "/data/spark/mlpipeline_training.csv"
  final val testFile = "/data/spark/mlpipeline_test.csv"


  final val columns = Array[String]("date", "asset", "region", "agent")

  it should s"$name simple predictor" in {
    show(s"$name simple predictor")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val predictor = new SimplePredictor[LogisticRegressionModel](
        new LogisticRegression().setMaxIter(5).setRegParam(0.1),
        columns,
        trainPath
      )

      (predictor, predictor.classify(predictor(), testPath))
    }).map {
      case (predictor, output) => {
        output.printSchema
        val predictedValues = output.select("prediction").collect.map(_.getDouble(0))
        output.show

        predictor.stop
        predictedValues(0)
      } should be(0.0)
    }
  }

  
  it should s"$name cross validation" in {
    show(s"$name cross validation")

    (for {
      trainPath <- ResourcesLoader.getPath(trainFile)
      testPath <- ResourcesLoader.getPath(testFile)
    } yield {
      val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.1)
      val paramsMap = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1)
      val validator = new ValidatedPredictor[LogisticRegressionModel](lr, columns, trainPath)

      val (f1, auROC) = validator.trainingWithSummary.getOrElse((Double.NaN, Double.NaN))
      println(s"F1-measure = ${f1} auROC = ${auROC}")
      validator.stop
      f1 should be(0.025 +- 0.005)
      auROC should be(0.600 +- 0.005)
    })
  }
}

// --------------------------------  EOF --------------------------------------------- 
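The cross-validation test above constructs a ParamMap but never applies it; for reference, this is how such a map overrides an estimator's params at fit time in standard spark.ml (train is a hypothetical DataFrame):

val lr = new LogisticRegression().setMaxIter(5).setRegParam(0.1)
val overrides = new ParamMap().put(lr.maxIter -> 30).put(lr.regParam -> 0.1)
// values supplied in the ParamMap take precedence over the setters above
val model = lr.fit(train, overrides)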
Example 64
Source File: MovingAverage.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.{IntParam, ParamMap}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.{VectorUDT, Vectors}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types._


class MovingAverage[T](override val uid: String)
  extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("movingAverage"))

  val windowSize: IntParam = new IntParam(this, "windowSize", "window size of the moving average")

  def getWindowSize: Int = $(windowSize)

  def setWindowSize(value: Int): this.type = set(windowSize, value)

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  setDefault(windowSize -> 3)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataSet.schema)
    val sparkContext = dataSet.sqlContext.sparkContext
    val inputType = outputSchema($(inputCol)).dataType
    val inputTypeBr = sparkContext.broadcast(inputType)
    val dataSetRdd = dataSet.rdd
    val inputColName = sparkContext.broadcast($(inputCol))
    val inputColIndex = dataSet.columns.indexOf($(inputCol))
    val inputColIndexBr = sparkContext.broadcast(inputColIndex)
    val windowSizeBr = sparkContext.broadcast($(windowSize))
    val maRdd = dataSetRdd.map { case (row: Row) =>
      val (array, rawValue) = if (inputTypeBr.value.isInstanceOf[VectorUDT]) {
        val vector =
          row.getAs[org.apache.spark.ml.linalg.Vector](inputColName.value)
        (vector.toArray, Vectors.dense(vector.toArray.drop(windowSizeBr.value - 1)))
      } else {
        val iterable = row.getAs[Iterable[Double]](inputColName.value)
        (iterable.toArray, Vectors.dense(iterable.toArray.drop(windowSizeBr.value - 1)))
      }
      val (before, after) = row.toSeq.splitAt(inputColIndexBr.value)
      Row(
        (before :+ rawValue) ++ after.tail :+ MovingAverageCalc
          .simpleMovingAverageArray(array, windowSizeBr.value): _*
      )
    }
    dataSet.sqlContext.createDataFrame(maRdd, outputSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    schema.add(StructField($(outputCol), ArrayType(DoubleType)))
  }

  override def copy(extra: ParamMap): MovingAverage[T] = defaultCopy(extra)
}

object MovingAverageCalc {
  private[ml] def simpleMovingAverageArray(values: Array[Double], period: Int): Array[Double] = {
    (for (i <- 1 to values.length)
      yield
      //TODO rollback this comment with the right size of features to make the meanaverage return
      // the features values for the first values of the calc
      if (i < period) 0d //values(i)
      else values.slice(i - period, i).sum / period).toArray.dropWhile(_ == 0d)
  }
}

object MovingAverage extends DefaultParamsReadable[MovingAverage[_]] {

  override def load(path: String): MovingAverage[_] = super.load(path)
} 
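A usage sketch, assuming the input column holds the series of past values; since Transformer.transform accepts a ParamMap, the window size can also be overridden per call (column names are hypothetical).

val ma = new MovingAverage[Double]()
  .setInputCol("rawValues")
  .setOutputCol("movingAverage")
// override windowSize for this call only
val smoothed = ma.transform(df, ParamMap(ma.windowSize -> 5))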
Example 65
Source File: L9-15MLPipeline.scala    From prosparkstreaming   with Apache License 2.0 5 votes vote down vote up
package org.apress.prospark

import scala.reflect.runtime.universe
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.regression.RandomForestRegressor
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.ml.param.ParamMap

object MLPipelineApp {

  case class Activity(label: Double,
    accelXHand: Double, accelYHand: Double, accelZHand: Double,
    accelXChest: Double, accelYChest: Double, accelZChest: Double,
    accelXAnkle: Double, accelYAnkle: Double, accelZAnkle: Double)

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: MLPipelineApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val sqlC = new SQLContext(ssc.sparkContext)
    import sqlC.implicits._

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) == "4" || f(1) == "5")
      .map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38)))
      .map(f => f.map(v => v.toDouble))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty) {
          val accelerometer = rdd.map(x => Activity(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7), x(8), x(9))).toDF()
          val split = accelerometer.randomSplit(Array(0.3, 0.7))
          val test = split(0)
          val train = split(1)

          val assembler = new VectorAssembler()
            .setInputCols(Array(
              "accelXHand", "accelYHand", "accelZHand",
              "accelXChest", "accelYChest", "accelZChest",
              "accelXAnkle", "accelYAnkle", "accelZAnkle"))
            .setOutputCol("vectors")
          val normalizer = new Normalizer()
            .setInputCol(assembler.getOutputCol)
            .setOutputCol("features")
          val regressor = new RandomForestRegressor()

          val pipeline = new Pipeline()
            .setStages(Array(assembler, normalizer, regressor))
          val pMap = ParamMap(normalizer.p -> 1.0)
          val model = pipeline.fit(train, pMap)
          val prediction = model.transform(test)
          prediction.show()
        }
      })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 66
Source File: MultinomialLabeler.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.MultinomialLabelerModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasFeaturesCol
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{udf, col}
import ml.combust.mleap.core.util.VectorConverters._


class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"),
                         val model: MultinomialLabelerModel) extends Transformer
  with HasFeaturesCol
  with HasProbabilitiesCol
  with HasLabelsCol {

  def setFeaturesCol(value: String): this.type = set(featuresCol, value)
  def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value)
  def setLabelsCol(value: String): this.type = set(labelsCol, value)

  @org.apache.spark.annotation.Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    val probabilitiesUdf = udf {
      (vector: Vector) => model.top(vector).map(_._1).toArray
    }

    val labelsUdf = udf {
      (vector: Vector) => model.topLabels(vector).toArray
    }

    dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))).
      withColumn($(labelsCol), labelsUdf(col($(featuresCol))))
  }

  override def copy(extra: ParamMap): Transformer =
    copyValues(new MultinomialLabeler(uid, model), extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT],
      s"Features column must be of type VectorUDT but got ${schema($(featuresCol)).dataType}")
    val inputFields = schema.fields
    require(!inputFields.exists(_.name == $(probabilitiesCol)),
      s"Output column ${$(probabilitiesCol)} already exists.")
    require(!inputFields.exists(_.name == $(labelsCol)),
      s"Output column ${$(labelsCol)} already exists.")

    StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)),
      StructField($(labelsCol), ArrayType(StringType))))
  }
} 
Example 67
Source File: WordLengthFilter.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.WordLengthFilterModel
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators, Params}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset}



trait WordLengthFilterParams extends Params with HasInputCol with HasOutputCol {

  final val wordLength: IntParam = new IntParam(this, "wordLength",
    "minimum length of words to keep (>= 0)", ParamValidators.gtEq(0))

  setDefault(wordLength -> 3)

  final def getWordLength: Int = $(wordLength)
}

class WordLengthFilter(override val uid: String) extends Transformer
  with WordLengthFilterParams
  with DefaultParamsWritable {

  val defaultLength = 3
  var model: WordLengthFilterModel = new WordLengthFilterModel(defaultLength) //Initialize with default filter length 3

  // note: the supplied model is discarded; transform() rebuilds it from the wordLength param
  def this(model: WordLengthFilterModel) = this(uid = Identifiable.randomUID("filter_words"))
  def this() = this(new WordLengthFilterModel)

  def setInputCol(value: String): this.type = set(inputCol, value)
  def setOutputCol(value: String): this.type = set(outputCol, value)
  def setWordLength(value: Int = defaultLength): this.type = set(wordLength, value)

  override def transform(dataset: Dataset[_]): DataFrame = {
    if(defaultLength != getWordLength) model = new WordLengthFilterModel(getWordLength)
    val filterWordsUdf = udf {
      (words: Seq[String]) => model(words)
    }

    dataset.withColumn($(outputCol), filterWordsUdf(dataset($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    require(schema($(inputCol)).dataType.isInstanceOf[ArrayType],
      s"Input column must be of type ArrayType(StringType,true) but got ${schema($(inputCol)).dataType}")
    val inputFields = schema.fields

    require(!inputFields.exists(_.name == $(outputCol)),
      s"Output column ${$(outputCol)} already exists.")

    StructType(schema.fields :+ StructField($(outputCol), ArrayType(StringType, true)))

  }
}

object WordLengthFilter extends  DefaultParamsReadable[WordLengthFilter] {
  override def load(path: String): WordLengthFilter = super.load(path)
} 
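A short usage sketch; the input column is assumed to hold tokenized words (an array of strings), matching the ArrayType check in transformSchema.

val filter = new WordLengthFilter()
  .setInputCol("words")
  .setOutputCol("longWords")
  .setWordLength(4)   // threshold applied by WordLengthFilterModel (exact semantics assumed)
val filtered = filter.transform(tokenizedDf)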
Example 68
Source File: MathUnary.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{MathUnaryModel, UnaryOperation}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{DefaultParamsReader, DefaultParamsWriter, Identifiable, MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.{DoubleType, NumericType, StructField, StructType}
import org.apache.spark.sql.functions.udf


    private val className = classOf[MathUnary].getName

    override def load(path: String): MathUnary = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("operation").head()
      val operation = data.getAs[String](0)

      val model = MathUnaryModel(UnaryOperation.forName(operation))
      val transformer = new MathUnary(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
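The reader above expects a "data" parquet file holding a single "operation" column next to the params metadata. A sketch of the matching MLWriter under that layout; the model's string accessor (operation.name) is an assumption mirroring UnaryOperation.forName.

class MathUnaryWriter(instance: MathUnary) extends MLWriter {
  override protected def saveImpl(path: String): Unit = {
    DefaultParamsWriter.saveMetadata(instance, path, sc)
    val operation = instance.model.operation.name   // assumed accessor
    val dataPath = new Path(path, "data").toString
    // persist the single model field next to the params metadata
    sparkSession.createDataFrame(Seq(Tuple1(operation))).toDF("operation")
      .repartition(1).write.parquet(dataPath)
  }
}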
Example 69
Source File: StringMap.scala    From mleap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.mleap.feature

import ml.combust.mleap.core.feature.{HandleInvalid, StringMapModel}
import org.apache.hadoop.fs.Path
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types._


    private val className = classOf[StringMap].getName

    override def load(path: String): StringMap = {
      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)

      val dataPath = new Path(path, "data").toString

      val data = sparkSession.read.parquet(dataPath).select("labels", "handleInvalid", "defaultValue").head()
      val labels = data.getAs[Map[String, Double]](0)
      val handleInvalid = HandleInvalid.fromString(data.getAs[String](1))
      val defaultValue = data.getAs[Double](2)

      val model = new StringMapModel(labels, handleInvalid = handleInvalid, defaultValue = defaultValue)
      val transformer = new StringMap(metadata.uid, model)

      metadata.getAndSetParams(transformer)
      transformer
    }
  }

} 
Example 70
Source File: Sampler.scala    From automl   with Apache License 2.0 5 votes vote down vote up
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random


class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name")

  final def setInputCol(value: String): this.type = set(inputCol, value)

  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))

    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")}")
      }

    ss.stop()
  }
} 
Example 71
Source File: DLEstimatorBase.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasLabelCol, HasPredictionCol}
import org.apache.spark.mllib.linalg.{Vector, VectorUDT}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Row}


abstract class DLEstimatorBase[Learner <: DLEstimatorBase[Learner, M],
    M <: DLTransformerBase[M]]
  extends Estimator[M] with HasLabelCol {

  protected def internalFit(dataFrame: DataFrame): M

  override def fit(dataFrame: DataFrame): M = {
    transformSchema(dataFrame.schema, logging = true)
    internalFit(dataFrame)
  }

  override def copy(extra: ParamMap): Learner = defaultCopy(extra)
} 
Example 72
Source File: DLClassifier.scala    From BigDL   with Apache License 2.0 5 votes vote down vote up
package com.intel.analytics.bigdl.dlframes

import com.intel.analytics.bigdl.tensor.Tensor
import com.intel.analytics.bigdl.tensor.TensorNumericMath.TensorNumeric
import com.intel.analytics.bigdl.{Criterion, Module}
import org.apache.spark.ml.adapter.SchemaUtils
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types._

import scala.reflect.ClassTag


@deprecated("`DLClassifierModel` is deprecated. " +
  "com.intel.analytics.bigdl.dlframes is deprecated in BigDL 0.11, " +
  "and will be removed in future releases", "0.10.0")
class DLClassifierModel[T: ClassTag](
    @transient override val model: Module[T],
    featureSize : Array[Int],
    override val uid: String = "DLClassifierModel"
  )(implicit ev: TensorNumeric[T]) extends DLModel[T](model, featureSize) {

  protected override def outputToPrediction(output: Tensor[T]): Any = {
    if (output.size().deep == Array(1).deep) {
      val raw = ev.toType[Double](output.toArray().head)
      if (raw > 0.5) 1.0 else 0.0
    } else {
      ev.toType[Double](output.max(1)._2.valueAt(1))
    }
  }

  override def transformSchema(schema : StructType): StructType = {
    validateDataType(schema, $(featuresCol))
    SchemaUtils.appendColumn(schema, $(predictionCol), DoubleType)
  }
} 
Example 73
Source File: ParamGridBuilderSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.tuning

import scala.collection.mutable

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.{ParamMap, TestParams}

class ParamGridBuilderSuite extends SparkFunSuite {

  val solver = new TestParams()
  import solver.{inputCol, maxIter}

  test("param grid builder") {
    def validateGrid(maps: Array[ParamMap], expected: mutable.Set[(Int, String)]): Unit = {
      assert(maps.size === expected.size)
      maps.foreach { m =>
        val tuple = (m(maxIter), m(inputCol))
        assert(expected.contains(tuple))
        expected.remove(tuple)
      }
      assert(expected.isEmpty)
    }

    val maps0 = new ParamGridBuilder()
      .baseOn(maxIter -> 10)
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected0 = mutable.Set(
      (10, "input0"),
      (10, "input1"))
    validateGrid(maps0, expected0)

    val maps1 = new ParamGridBuilder()
      .baseOn(ParamMap(maxIter -> 5, inputCol -> "input")) // will be overwritten
      .addGrid(maxIter, Array(10, 20))
      .addGrid(inputCol, Array("input0", "input1"))
      .build()
    val expected1 = mutable.Set(
      (10, "input0"),
      (20, "input0"),
      (10, "input1"),
      (20, "input1"))
    validateGrid(maps1, expected1)
  }
} 
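Grids built this way are normally handed to a tuning estimator; a minimal sketch with CrossValidator, where pipeline, paramMaps, and trainingData are placeholders:

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.tuning.CrossValidator

val cv = new CrossValidator()
  .setEstimator(pipeline)              // any Estimator, e.g. a Pipeline
  .setEvaluator(new RegressionEvaluator)
  .setEstimatorParamMaps(paramMaps)    // the Array[ParamMap] from ParamGridBuilder
  .setNumFolds(3)
val cvModel = cv.fit(trainingData)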
Example 74
Source File: RegressionEvaluator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, FloatType}


  @Since("1.4.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "rmse")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnTypes(schema, $(predictionCol), Seq(DoubleType, FloatType))
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels = dataset
      .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType))
      .rdd
      .map { case Row(prediction: Double, label: Double) => (prediction, label) }
    val metrics = new RegressionMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "rmse" => metrics.rootMeanSquaredError
      case "mse" => metrics.meanSquaredError
      case "r2" => metrics.r2
      case "mae" => metrics.meanAbsoluteError
    }
    metric
  }

  @Since("1.4.0")
  override def isLargerBetter: Boolean = $(metricName) match {
    case "rmse" => false
    case "mse" => false
    case "r2" => true
    case "mae" => false
  }

  @Since("1.5.0")
  override def copy(extra: ParamMap): RegressionEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object RegressionEvaluator extends DefaultParamsReadable[RegressionEvaluator] {

  @Since("1.6.0")
  override def load(path: String): RegressionEvaluator = super.load(path)
} 
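A usage sketch; predictions stands for a DataFrame with "prediction" and "label" columns. Evaluator.evaluate also accepts a ParamMap, so the metric can be switched per call without mutating the evaluator.

val evaluator = new RegressionEvaluator()
  .setLabelCol("label")
  .setPredictionCol("prediction")
val rmse = evaluator.evaluate(predictions)   // "rmse" is the default metric
val mae  = evaluator.evaluate(predictions, ParamMap(evaluator.metricName -> "mae"))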
Example 75
Source File: MulticlassClassificationEvaluator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.evaluation

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils}
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.sql.{Dataset, Row}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.DoubleType


  @Since("1.5.0")
  def setLabelCol(value: String): this.type = set(labelCol, value)

  setDefault(metricName -> "f1")

  @Since("2.0.0")
  override def evaluate(dataset: Dataset[_]): Double = {
    val schema = dataset.schema
    SchemaUtils.checkColumnType(schema, $(predictionCol), DoubleType)
    SchemaUtils.checkNumericType(schema, $(labelCol))

    val predictionAndLabels =
      dataset.select(col($(predictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map {
        case Row(prediction: Double, label: Double) => (prediction, label)
      }
    val metrics = new MulticlassMetrics(predictionAndLabels)
    val metric = $(metricName) match {
      case "f1" => metrics.weightedFMeasure
      case "weightedPrecision" => metrics.weightedPrecision
      case "weightedRecall" => metrics.weightedRecall
      case "accuracy" => metrics.accuracy
    }
    metric
  }

  @Since("1.5.0")
  override def isLargerBetter: Boolean = true

  @Since("1.5.0")
  override def copy(extra: ParamMap): MulticlassClassificationEvaluator = defaultCopy(extra)
}

@Since("1.6.0")
object MulticlassClassificationEvaluator
  extends DefaultParamsReadable[MulticlassClassificationEvaluator] {

  @Since("1.6.0")
  override def load(path: String): MulticlassClassificationEvaluator = super.load(path)
} 
Example 76
Source File: UDFTransformer.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.{HasInputCol, HasInputCols, HasOutputCol, Wrappable}
import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.serialize.ComplexParam
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam, UDPyFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.execution.python.UserDefinedPythonFunction
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.types.{DataType, StructField, StructType}
import org.apache.spark.sql.{Column, DataFrame, Dataset}
import org.apache.spark.sql.functions.col

object UDFTransformer extends ComplexParamsReadable[UDFTransformer]


  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    if (isSet(inputCol)) {
      dataset.withColumn(getOutputCol, applyUDF(dataset.col(getInputCol)))
    } else {
      dataset.withColumn(getOutputCol, applyUDFOnCols(getInputCols.map(col): _*))
    }
  }

  def validateAndTransformSchema(schema: StructType): StructType = {
    if (isSet(inputCol)) schema(getInputCol) else schema(Set(getInputCols: _*))
    schema.add(StructField(getOutputCol, getDataType))
  }

  def transformSchema(schema: StructType): StructType = validateAndTransformSchema(schema)

  def copy(extra: ParamMap): UDFTransformer = defaultCopy(extra)

} 
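A single-column usage sketch; setUDF is assumed to populate the UDFParam that applyUDF reads, and the input/output setters follow the HasInputCol/HasOutputCol contracts.

import org.apache.spark.sql.functions.udf

val doubler = new UDFTransformer()
  .setInputCol("x")
  .setOutputCol("x2")
  .setUDF(udf((d: Double) => d * 2))
val out = doubler.transform(df)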
Example 77
Source File: SQLTransformer.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.util._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.types.StructType


  @Since("1.6.0")
  def getStatement: String = $(statement)

  private val tableIdentifier: String = "__THIS__"

  @Since("2.0.0")
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    val tableName = Identifiable.randomUID(uid)
    dataset.createOrReplaceTempView(tableName)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    val result = dataset.sparkSession.sql(realStatement)
    dataset.sparkSession.catalog.dropTempView(tableName)
    result
  }

  @Since("1.6.0")
  override def transformSchema(schema: StructType): StructType = {
    val spark = SparkSession.builder().getOrCreate()
    val dummyRDD = spark.sparkContext.parallelize(Seq(Row.empty))
    val dummyDF = spark.createDataFrame(dummyRDD, schema)
    val tableName = Identifiable.randomUID(uid)
    val realStatement = $(statement).replace(tableIdentifier, tableName)
    dummyDF.createOrReplaceTempView(tableName)
    val outputSchema = spark.sql(realStatement).schema
    spark.catalog.dropTempView(tableName)
    outputSchema
  }

  @Since("1.6.0")
  override def copy(extra: ParamMap): SQLTransformer = defaultCopy(extra)
}

@Since("1.6.0")
object SQLTransformer extends DefaultParamsReadable[SQLTransformer] {

  @Since("1.6.0")
  override def load(path: String): SQLTransformer = super.load(path)
} 
Example 78
Source File: HashingTF.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Experimental
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.{IntParam, ParamMap, ParamValidators}
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.mllib.feature
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{ArrayType, StructType}


@Experimental
class HashingTF(override val uid: String) extends Transformer with HasInputCol with HasOutputCol {

  def this() = this(Identifiable.randomUID("hashingTF"))

  val numFeatures = new IntParam(this, "numFeatures", "number of features (> 0)",
    ParamValidators.gt(0))

  setDefault(numFeatures -> (1 << 18))

  def getNumFeatures: Int = $(numFeatures)

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  def setNumFeatures(value: Int): this.type = set(numFeatures, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val hashingTF = new feature.HashingTF($(numFeatures))
    val t = udf { terms: Seq[_] => hashingTF.transform(terms) }
    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.isInstanceOf[ArrayType],
      s"The input column must be ArrayType, but got $inputType.")
    val attrGroup = new AttributeGroup($(outputCol), $(numFeatures))
    SchemaUtils.appendColumn(schema, attrGroup.toStructField())
  }

  override def copy(extra: ParamMap): HashingTF = defaultCopy(extra)
} 
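A usage sketch on a tokenized column (this 1.x-era API takes a DataFrame rather than a Dataset):

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("tf")
  .setNumFeatures(1 << 10)
val featurized = hashingTF.transform(tokenizedDf)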
Example 79
Source File: PredictorSuite.scala    From multi-tenancy-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

class PredictorSuite extends SparkFunSuite with MLlibTestSparkContext {

  import PredictorSuite._

  test("should support all NumericType labels and not support other types") {
    val df = spark.createDataFrame(Seq(
      (0, Vectors.dense(0, 2, 3)),
      (1, Vectors.dense(0, 3, 9)),
      (0, Vectors.dense(0, 2, 6))
    )).toDF("label", "features")

    val types =
      Seq(ShortType, LongType, IntegerType, FloatType, ByteType, DoubleType, DecimalType(10, 0))

    val predictor = new MockPredictor()

    types.foreach { t =>
      predictor.fit(df.select(col("label").cast(t), col("features")))
    }

    intercept[IllegalArgumentException] {
      predictor.fit(df.select(col("label").cast(StringType), col("features")))
    }
  }
}

object PredictorSuite {

  class MockPredictor(override val uid: String)
    extends Predictor[Vector, MockPredictor, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictor"))

    override def train(dataset: Dataset[_]): MockPredictionModel = {
      require(dataset.schema("label").dataType == DoubleType)
      new MockPredictionModel(uid)
    }

    override def copy(extra: ParamMap): MockPredictor =
      throw new NotImplementedError()
  }

  class MockPredictionModel(override val uid: String)
    extends PredictionModel[Vector, MockPredictionModel] {

    def this() = this(Identifiable.randomUID("mockpredictormodel"))

    override def predict(features: Vector): Double =
      throw new NotImplementedError()

    override def copy(extra: ParamMap): MockPredictionModel =
      throw new NotImplementedError()
  }
} 
Example 84
Source File: RepartitionSuite.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.MLReadable

class RepartitionSuite extends TestBase with TransformerFuzzing[Repartition] {

  import session.implicits._

  lazy val input = Seq(
    (0, "guitars", "drums"),
    (1, "piano", "trumpet"),
    (2, "bass", "cymbals"),
    (3, "guitars", "drums"),
    (4, "piano", "trumpet"),
    (5, "bass", "cymbals"),
    (6, "guitars", "drums"),
    (7, "piano", "trumpet"),
    (8, "bass", "cymbals"),
    (9, "guitars", "drums"),
    (10, "piano", "trumpet"),
    (11, "bass", "cymbals")
  ).toDF("numbers", "words", "more")

  test("Work for several values of n") {

    def test(n: Int): Unit = {
      val result = new Repartition()
        .setN(n)
        .transform(input)
      assert(result.rdd.getNumPartitions == n)
      ()
    }
    List(1, 2, 3, 10).foreach(test)

  }

  test("Should allow a user to set the partitions specifically in pipeline transform") {
    val r = new Repartition().setN(1)
    val pipe = new Pipeline().setStages(Array(r))
    val fitPipe = pipe.fit(input)
    assert(fitPipe.transform(input).rdd.getNumPartitions == 1)
    assert(fitPipe.transform(input, ParamMap(r.n -> 5)).rdd.getNumPartitions == 5)
  }

  def testObjects(): Seq[TestObject[Repartition]] = List(new TestObject(
    new Repartition().setN(1), input))

  def reader: MLReadable[_] = Repartition
} 
Example 85
Source File: VowpalWabbitInteractions.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]


class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable
{
  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {

      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value

          i += 1
        }
        else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
} 
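A wiring sketch, assuming the input columns already hold Vector features (for instance, output of VowpalWabbitFeaturizer) and that the HasInputCols/HasOutputCol contracts provide the usual setters:

val interactions = new VowpalWabbitInteractions()
  .setInputCols(Array("userFeatures", "itemFeatures"))
  .setOutputCol("interactionFeatures")
val out = interactions.transform(featurizedDf)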
Example 86
Source File: EvaluationUtils.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.automl

import com.microsoft.ml.spark.core.metrics.MetricConstants
import com.microsoft.ml.spark.core.schema.SchemaConstants
import com.microsoft.ml.spark.train.{TrainClassifier, TrainRegressor, TrainedClassifierModel, TrainedRegressorModel}
import org.apache.spark.injections.RegressionUtils
import org.apache.spark.ml.classification.{ClassificationModel, Classifier}
import org.apache.spark.ml.{PipelineStage, Transformer}
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.regression._

object EvaluationUtils {
  val ModelTypeUnsupportedErr = "Model type not supported for evaluation"
  // Find type of trained models
  def getModelType(model: PipelineStage): String = {
    model match {
      case _: TrainRegressor => SchemaConstants.RegressionKind
      case _: TrainClassifier => SchemaConstants.ClassificationKind
      case _: Classifier[_, _, _] => SchemaConstants.ClassificationKind
      case regressor: PipelineStage if RegressionUtils.isRegressor(regressor) => SchemaConstants.RegressionKind
      case _: DecisionTreeRegressor => SchemaConstants.RegressionKind
      case _: GBTRegressor => SchemaConstants.RegressionKind
      case _: RandomForestRegressor => SchemaConstants.RegressionKind
      case _: TrainedRegressorModel => SchemaConstants.RegressionKind
      case _: TrainedClassifierModel => SchemaConstants.ClassificationKind
      case evm: BestModel => getModelType(evm.getBestModel)
      case _: ClassificationModel[_, _] => SchemaConstants.ClassificationKind
      case _: RegressionModel[_, _] => SchemaConstants.RegressionKind
      case _ => throw new Exception(ModelTypeUnsupportedErr)
    }
  }

  def getMetricWithOperator(model: PipelineStage, evaluationMetric: String): (String, Ordering[Double]) = {
    val modelType = getModelType(model)
    getMetricWithOperator(modelType, evaluationMetric)
  }

  def getMetricWithOperator(modelType: String, evaluationMetric: String): (String, Ordering[Double]) = {
    val chooseHighest = Ordering.Double
    val chooseLowest = Ordering.Double.reverse
    val (evaluationMetricColumnName, operator): (String, Ordering[Double]) = modelType match {
      case SchemaConstants.RegressionKind => evaluationMetric match {
        case MetricConstants.MseSparkMetric  => (MetricConstants.MseColumnName,  chooseLowest)
        case MetricConstants.RmseSparkMetric => (MetricConstants.RmseColumnName, chooseLowest)
        case MetricConstants.R2SparkMetric   => (MetricConstants.R2ColumnName,   chooseHighest)
        case MetricConstants.MaeSparkMetric  => (MetricConstants.MaeColumnName,  chooseLowest)
        case _ => throw new Exception("Metric is not supported for regressors")
      }
      case SchemaConstants.ClassificationKind => evaluationMetric match {
        case MetricConstants.AucSparkMetric       => (MetricConstants.AucColumnName, chooseHighest)
        case MetricConstants.PrecisionSparkMetric => (MetricConstants.PrecisionColumnName, chooseHighest)
        case MetricConstants.RecallSparkMetric    => (MetricConstants.RecallColumnName, chooseHighest)
        case MetricConstants.AccuracySparkMetric  => (MetricConstants.AccuracyColumnName, chooseHighest)
        case _ => throw new Exception("Metric is not supported for classifiers")
      }
      case _ => throw new Exception("Model type not supported for evaluation")
    }
    (evaluationMetricColumnName, operator)
  }

  def getModelParams(model: Transformer): ParamMap = {
    model match {
      case reg: TrainedRegressorModel => reg.getParamMap
      case cls: TrainedClassifierModel => cls.getParamMap
      case evm: BestModel => getModelParams(evm.getBestModel)
      case _ => throw new Exception("Model type not supported for evaluation")
    }
  }

  
  def modelParamsToString(model: Transformer): String =
    getModelParams(model).toSeq.map(pv => s"${pv.param.name}: ${pv.value}").sorted.mkString(", ")

} 
Example 87
Source File: Lambda.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.SparkContext
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.{ParamMap, UDFParam}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

object Lambda extends ComplexParamsReadable[Lambda] {
  def apply(f: Dataset[_] => DataFrame): Lambda = {
    new Lambda().setTransform(f)
  }
}

class Lambda(val uid: String) extends Transformer with Wrappable with ComplexParamsWritable {
  def this() = this(Identifiable.randomUID("Lambda"))

  val transformFunc = new UDFParam(this, "transformFunc", "holder for dataframe function")

  def setTransform(f: Dataset[_] => DataFrame): this.type = {
    set(transformFunc, udf(f, StringType))
  }

  def getTransform: Dataset[_] => DataFrame = {
    $(transformFunc).f.asInstanceOf[Dataset[_] => DataFrame]
  }

  val transformSchemaFunc = new UDFParam(this, "transformSchemaFunc", "the output schema after the transformation")

  def setTransformSchema(f: StructType => StructType): this.type = {
    set(transformSchemaFunc, udf(f, StringType))
  }

  def getTransformSchema: StructType => StructType = {
    $(transformSchemaFunc).f.asInstanceOf[StructType => StructType]
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    getTransform(dataset)
  }

  def transformSchema(schema: StructType): StructType = {
    if (get(transformSchemaFunc).isEmpty) {
      val sc = SparkContext.getOrCreate()
      val df = SparkSession.builder().getOrCreate().createDataFrame(sc.emptyRDD[Row], schema)
      transform(df).schema
    } else {
      getTransformSchema(schema)
    }
  }

  def copy(extra: ParamMap): Lambda = defaultCopy(extra)

} 
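The companion apply makes ad-hoc stages concise; a small sketch that keeps only rows with a positive "score" column (hypothetical):

val positiveOnly = Lambda(ds => ds.toDF().filter("score > 0"))
val out = positiveOnly.transform(df)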
Example 88
Source File: PipelineSuite.scala    From iolap   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.mockito.Matchers.{any, eq => meq}
import org.mockito.Mockito.when
import org.scalatest.mock.MockitoSugar.mock

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.HashingTF
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.DataFrame

class PipelineSuite extends SparkFunSuite {

  abstract class MyModel extends Model[MyModel]

  test("pipeline") {
    val estimator0 = mock[Estimator[MyModel]]
    val model0 = mock[MyModel]
    val transformer1 = mock[Transformer]
    val estimator2 = mock[Estimator[MyModel]]
    val model2 = mock[MyModel]
    val transformer3 = mock[Transformer]
    val dataset0 = mock[DataFrame]
    val dataset1 = mock[DataFrame]
    val dataset2 = mock[DataFrame]
    val dataset3 = mock[DataFrame]
    val dataset4 = mock[DataFrame]

    when(estimator0.copy(any[ParamMap])).thenReturn(estimator0)
    when(model0.copy(any[ParamMap])).thenReturn(model0)
    when(transformer1.copy(any[ParamMap])).thenReturn(transformer1)
    when(estimator2.copy(any[ParamMap])).thenReturn(estimator2)
    when(model2.copy(any[ParamMap])).thenReturn(model2)
    when(transformer3.copy(any[ParamMap])).thenReturn(transformer3)

    when(estimator0.fit(meq(dataset0))).thenReturn(model0)
    when(model0.transform(meq(dataset0))).thenReturn(dataset1)
    when(model0.parent).thenReturn(estimator0)
    when(transformer1.transform(meq(dataset1))).thenReturn(dataset2)
    when(estimator2.fit(meq(dataset2))).thenReturn(model2)
    when(model2.transform(meq(dataset2))).thenReturn(dataset3)
    when(model2.parent).thenReturn(estimator2)
    when(transformer3.transform(meq(dataset3))).thenReturn(dataset4)

    val pipeline = new Pipeline()
      .setStages(Array(estimator0, transformer1, estimator2, transformer3))
    val pipelineModel = pipeline.fit(dataset0)

    assert(pipelineModel.stages.length === 4)
    assert(pipelineModel.stages(0).eq(model0))
    assert(pipelineModel.stages(1).eq(transformer1))
    assert(pipelineModel.stages(2).eq(model2))
    assert(pipelineModel.stages(3).eq(transformer3))

    val output = pipelineModel.transform(dataset0)
    assert(output.eq(dataset4))
  }

  test("pipeline with duplicate stages") {
    val estimator = mock[Estimator[MyModel]]
    val pipeline = new Pipeline()
      .setStages(Array(estimator, estimator))
    val dataset = mock[DataFrame]
    intercept[IllegalArgumentException] {
      pipeline.fit(dataset)
    }
  }

  test("PipelineModel.copy") {
    val hashingTF = new HashingTF()
      .setNumFeatures(100)
    val model = new PipelineModel("pipeline", Array[Transformer](hashingTF))
    val copied = model.copy(ParamMap(hashingTF.numFeatures -> 10))
    require(copied.stages(0).asInstanceOf[HashingTF].getNumFeatures === 10,
      "copy should handle extra stage params")
  }
} 
Example 89
Source File: IsolationForest.scala    From mmlspark   with MIT License 5 votes vote down vote up
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.isolationforest

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util._
import org.apache.spark.ml.{Estimator, Model}
import com.linkedin.relevance.isolationforest.{IsolationForestParams,
  IsolationForest => IsolationForestSource, IsolationForestModel => IsolationForestModelSource}
import com.microsoft.ml.spark.core.contracts.Wrappable
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.types.StructType

object IsolationForest extends DefaultParamsReadable[IsolationForest]

class IsolationForest(override val uid: String, val that: IsolationForestSource)
  extends Estimator[IsolationForestModel]
  with IsolationForestParams with DefaultParamsWritable with Wrappable {

  def this(uid: String) = this(uid, new IsolationForestSource(uid))

  def this() = this(Identifiable.randomUID("IsolationForest"))

  override def copy(extra: ParamMap): IsolationForest =
    new IsolationForest(uid, that.copy(extra))

  override def fit(data: Dataset[_]): IsolationForestModel =
    new IsolationForestModel(uid, that.fit(data))

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)
}

class IsolationForestModel(override val uid: String, val that: IsolationForestModelSource)
  extends Model[IsolationForestModel]
  with MLWritable {

  override def copy(extra: ParamMap): IsolationForestModel =
    new IsolationForestModel(uid, that.copy(extra))

  override def transform(data: Dataset[_]): DataFrame =
    that.transform(data)

  override def transformSchema(schema: StructType): StructType =
    that.transformSchema(schema)

  override def write: MLWriter = that.write
}

class IsolationForestModelReader extends MLReader[IsolationForestModel] with Serializable {
  override def load(path: String): IsolationForestModel = {
    val that = IsolationForestModelSource.load(path)

    new IsolationForestModel(that.uid, that)
  }
}
object IsolationForestModel extends MLReadable[IsolationForestModel] {
  override def read: MLReader[IsolationForestModel] = new IsolationForestModelReader
} 
Example 90
Source File: S2CellTransformer.scala    From spark-ext   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml.feature

import com.google.common.geometry.{S2LatLng, S2CellId}
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param.{IntParam, Param, ParamMap, ParamValidators}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types.{DoubleType, StructType}


class S2CellTransformer(override val uid: String) extends Transformer {

  def this() = this(Identifiable.randomUID("S2CellTransformer"))

  // Input/Output column names

  val latCol: Param[String] = new Param[String](this, "latCol", "latitude column")

  val lonCol: Param[String] = new Param[String](this, "lonCol", "longitude column")

  val cellCol: Param[String] = new Param[String](this, "cellCol", "S2 Cell Id column")

  val level: Param[Int] = new IntParam(this, "level", "S2 Level [0, 30]",
    (i: Int) => ParamValidators.gtEq(0)(i) && ParamValidators.ltEq(30)(i))

  // Default parameters

  setDefault(
    latCol  -> "lat",
    lonCol  -> "lon",
    cellCol -> "cell",
    level   -> 10
  )

  def getLatCol: String = $(latCol)

  def getLonCol: String = $(lonCol)

  def getCellCol: String = $(cellCol)

  def getLevel: Int = $(level)

  def setLatCol(value: String): this.type = set(latCol, value)

  def setLonCol(value: String): this.type = set(lonCol, value)

  def setCellCol(value: String): this.type = set(cellCol, value)

  def setLevel(value: Int): this.type = set(level, value)

  override def transform(dataset: DataFrame): DataFrame = {
    val outputSchema = transformSchema(dataset.schema)
    val currentLevel = $(level)
    val t = udf { (lat: Double, lon: Double) =>
      val cellId = S2CellId.fromLatLng(S2LatLng.fromDegrees(lat, lon))
      cellId.parent(currentLevel).toToken
    }
    val metadata = outputSchema($(cellCol)).metadata
    dataset.select(col("*"), t(col($(latCol)), col($(lonCol))).as($(cellCol), metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    val latColumnName = $(latCol)
    val latDataType = schema(latColumnName).dataType
    require(latDataType == DoubleType,
      s"The latitude column $latColumnName must be Double type, " +
        s"but got $latDataType.")

    val lonColumnName = $(lonCol)
    val lonDataType = schema(lonColumnName).dataType
    require(lonDataType == DoubleType,
      s"The longitude column $lonColumnName must be Double type, " +
        s"but got $lonDataType.")

    val inputFields = schema.fields
    val outputColName = $(cellCol)
    require(inputFields.forall(_.name != outputColName),
      s"Output column $outputColName already exists.")

    val attr = NominalAttribute.defaultAttr.withName($(cellCol))
    val outputFields = inputFields :+ attr.toStructField()
    StructType(outputFields)
  }

  override def copy(extra: ParamMap): S2CellTransformer = defaultCopy(extra)
} 
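A usage sketch for the transformer above; the coordinates are illustrative and `spark` is an active SparkSession:

import spark.implicits._

val points = Seq((40.7128, -74.0060), (51.5074, -0.1278)).toDF("lat", "lon")
// appends a nominal "cell" column holding each point's level-12 S2 cell token
val cells = new S2CellTransformer().setLevel(12).transform(points)
cells.show(false)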
Example 91
Source File: RecursivePipeline.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import org.apache.spark.internal.Logging
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, MLWritable, MLWriter}
import org.apache.spark.ml.{Estimator, Model, Pipeline, PipelineModel, PipelineStage, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset}

import scala.collection.mutable.ListBuffer

class RecursivePipeline(override val uid: String, baseStages: Array[PipelineStage]) extends Pipeline {

  def this() = this(Identifiable.randomUID("RECURSIVE_PIPELINE"), Array.empty)

  def this(uid: String) = this(uid, Array.empty)

  def this(pipeline: Pipeline) = this(pipeline.uid, pipeline.getStages)

  this.setStages(baseStages)

  
  override def fit(dataset: Dataset[_]): PipelineModel = {
    transformSchema(dataset.schema, logging = true)
    val theStages = $(stages)
    var indexOfLastEstimator = -1
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      stage match {
        case _: Estimator[_] =>
          indexOfLastEstimator = index
        case _ =>
      }
    }
    var curDataset = dataset
    val transformers = ListBuffer.empty[Transformer]
    theStages.view.zipWithIndex.foreach { case (stage, index) =>
      if (index <= indexOfLastEstimator) {
        val transformer = stage match {
          case estimator: HasRecursiveFit[_] =>
            estimator.recursiveFit(curDataset, new Pipeline(uid).setStages(transformers.toArray).fit(dataset))
          case estimator: Estimator[_] =>
            estimator.fit(curDataset)
          case t: Transformer =>
            t
          case _ =>
            throw new IllegalArgumentException(
              s"Does not support stage $stage of type ${stage.getClass}")
        }
        if (index < indexOfLastEstimator) {
          curDataset = transformer.transform(curDataset)
        }
        transformers += transformer
      } else {
        transformers += stage.asInstanceOf[Transformer]
      }
    }

    createPipeline(dataset, transformers.toArray)
  }

}

class RecursivePipelineModel(override val uid: String, innerPipeline: PipelineModel)
  extends Model[RecursivePipelineModel] with MLWritable with Logging {

  def this(pipeline: PipelineModel) = this(pipeline.uid, pipeline)

  // dropRight(1) because the current annotator is itself included in the stages
  private def createRecursiveAnnotators(dataset: Dataset[_]): PipelineModel =
    new Pipeline(uid).setStages(innerPipeline.stages.dropRight(1)).fit(dataset)

  override def copy(extra: ParamMap): RecursivePipelineModel = {
    new RecursivePipelineModel(uid, innerPipeline.copy(extra))
  }

  override def write: MLWriter = {
    innerPipeline.write
  }

  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    innerPipeline.stages.foldLeft(dataset.toDF)((cur, transformer) => transformer match {
      case t: HasRecursiveTransform[_] => t.recursiveTransform(cur, createRecursiveAnnotators(dataset))
      case t: AnnotatorModel[_] if t.getLazyAnnotator => cur
      case t: Transformer => t.transform(cur)
    })
  }

  override def transformSchema(schema: StructType): StructType = {
    innerPipeline.transformSchema(schema)
  }
} 
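RecursivePipeline is meant as a drop-in replacement for Pipeline; a sketch in which the annotator stages and `trainingDF` are assumptions:

val recursive = new RecursivePipeline()
  .setStages(Array(documentAssembler, sentenceDetector, tokenizer))
// stages implementing HasRecursiveFit additionally receive the PipelineModel
// built from the stages fitted before them
val model = recursive.fit(trainingDF)
val annotated = model.transform(trainingDF)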
Example 92
Source File: AnnotatorApproach.scala    From spark-nlp   with Apache License 2.0 5 votes vote down vote up
package com.johnsnowlabs.nlp

import com.johnsnowlabs.storage.HasStorage
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.{Estimator, Model, PipelineModel, Transformer}
import org.apache.spark.sql.{Dataset, SparkSession}
import org.apache.spark.sql.types.{ArrayType, MetadataBuilder, StructField, StructType}
import org.apache.spark.ml.util.DefaultParamsWritable
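// NOTE: the enclosing AnnotatorApproach class declaration is omitted from this
// excerpt; only its final transformSchema override is shown.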


  override final def transformSchema(schema: StructType): StructType = {
    require(validate(schema), s"Wrong or missing inputCols annotators in $uid.\n" +
      msgHelper(schema) +
      s"\nMake sure such annotators exist in your pipeline, " +
      s"with the right output names and that they have following annotator types: " +
      s"${inputAnnotatorTypes.mkString(", ")}")
    val metadataBuilder: MetadataBuilder = new MetadataBuilder()
    metadataBuilder.putString("annotatorType", outputAnnotatorType)
    val outputFields = schema.fields :+
      StructField(getOutputCol, ArrayType(Annotation.dataType), nullable = false, metadataBuilder.build)
    StructType(outputFields)
  }
} 
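The append-with-metadata pattern in that transformSchema can be shown standalone; a minimal sketch where the annotator type and column names are illustrative and ArrayType(StringType) stands in for Annotation.dataType:

import org.apache.spark.sql.types._

val metadata = new MetadataBuilder().putString("annotatorType", "token").build()
val input = StructType(Seq(StructField("text", StringType)))
val output = StructType(input.fields :+
  StructField("token", ArrayType(StringType), nullable = false, metadata))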
Example 93
Source File: Cleaner.scala    From CkoocNLP   with Apache License 2.0 5 votes vote down vote up
package functions.clean

import com.hankcs.hanlp.HanLP
import config.paramconf.{HasOutputCol, HasInputCol}
import functions.MySchemaUtils
import functions.clean.chinese.BCConvert
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.param.{IntParam, Param, ParamMap}
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.sql.functions.{col, udf}
import org.apache.spark.sql.types.{StringType, StructType}
import org.apache.spark.sql.{DataFrame, Dataset}
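// NOTE: this excerpt omits the Cleaner class declaration and the fanjan,
// quanban and minLineLen Param definitions (with their getters) used below.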



  setDefault(fanjan -> "f2j", quanban -> "q2b", minLineLen -> 1)

  override def transform(dataset: Dataset[_]): DataFrame = {
    val outputSchema = transformSchema(dataset.schema, logging = true)

    val cleanFunc = udf {line: String =>
      var cleaned = ""
      getFanJian match {
        case "f2j" => cleaned = HanLP.convertToSimplifiedChinese(line)
        case "j2f" => cleaned = HanLP.convertToTraditionalChinese(line)
        case _ => cleaned = line
      }

      getQuanBan match {
        case "q2b" => cleaned = BCConvert.qj2bj(cleaned)
        case "b2q" => cleaned = BCConvert.bj2qj(cleaned)
        case _ => cleaned = cleaned
      }

      cleaned
    }

    val metadata = outputSchema($(outputCol)).metadata
    dataset.select(col("*"), cleanFunc(col($(inputCol))).as($(outputCol), metadata)).filter{record =>
      val outputIndex = record.fieldIndex($(outputCol))
      record.getString(outputIndex).length >= getMinLineLen
    }
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  override def transformSchema(schema: StructType): StructType = {
    val inputType = schema($(inputCol)).dataType
    require(inputType.typeName.equals(StringType.typeName),
      s"Input type must be StringType but got $inputType.")
    MySchemaUtils.appendColumn(schema, $(outputCol), inputType, schema($(inputCol)).nullable)
  }
}


object Cleaner extends DefaultParamsReadable[Cleaner] {
  override def load(path: String): Cleaner = super.load(path)
} 
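A usage sketch; the sample text is illustrative and the setter names are assumed from the HasInputCol/HasOutputCol mixins:

import spark.implicits._

val raw = Seq("繁體字ＡＢＣ").toDF("text")
val cleaner = new Cleaner()            // defaults: fanjan = "f2j", quanban = "q2b", minLineLen = 1
  .setInputCol("text")
  .setOutputCol("cleaned")
cleaner.transform(raw).show(false)     // simplified Chinese, half-width characters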
Example 94
Source File: HoltWintersBestModelFinder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberHoltWintersModel
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasGroupByCol
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag


class HoltWintersBestModelFinder[G](
  override val uid: String
)(implicit kt: ClassTag[G])
    extends HoltWintersBestModelEvaluation[G, HoltWintersModel[G]]
    with DefaultParamsWritable
    with HasGroupByCol
    with TimeSeriesBestModelFinder {

  def setTimeSeriesEvaluator(eval: TimeSeriesEvaluator[G]): this.type =
    set(timeSeriesEvaluator, eval)

  def setEstimatorParamMaps(value: Array[ParamMap]): this.type =
    set(estimatorParamMaps, value)

  def setNFutures(value: Int): this.type = set(nFutures, value)

  override def setValidationCol(value: String): this.type = set(validationCol, value)

  def setLabelCol(label: String): this.type = set(labelCol, label)

  def setGroupByCol(groupBy: String): this.type = set(groupByCol, Some(groupBy))

  def this()(implicit kt: ClassTag[G]) = this(Identifiable.randomUID("arima"))

  def modelEvaluation(
    idModels: RDD[(G, Row, Option[UberHoltWintersModel])]
  ): RDD[(G, (UberHoltWintersModel, ModelParamEvaluation[G]))] = {
    val eval = $(timeSeriesEvaluator)
    val broadcastEvaluator = idModels.context.broadcast(eval)
    idModels.filter(_._3.isDefined).map {
      case (id, row, models) =>
        val evaluatedModels = models.map { model =>
          holtWintersEvaluation(row, model, broadcastEvaluator, id)
        }.head
        log.warn(s"best model reach ${evaluatedModels._2.metricResult}")
        (id, evaluatedModels)
    }
  }

  override protected def train(dataSet: Dataset[_]): HoltWintersModel[G] = {
    val splitDs = split(dataSet, $(nFutures))
    val idModels = splitDs.rdd.map(train)
    new HoltWintersModel[G](uid, modelEvaluation(idModels))
      .setValidationCol($(validationCol))
      .asInstanceOf[HoltWintersModel[G]]
  }

  def train(row: Row): (G, Row, Option[UberHoltWintersModel]) = {
    val id = row.getAs[G]($(groupByCol).get)

    val result = try {
      val dense = row.getAs[org.apache.spark.ml.linalg.DenseVector]($(featuresCol))
      val ts: org.apache.spark.mllib.linalg.Vector = org.apache.spark.mllib.linalg.Vectors.dense(dense.toArray)
      Some(
        UberHoltWintersModel.fitModelWithBOBYQA(ts, $(nFutures))
      )
    } catch {
      case e: Exception =>
        log.error(
          s"Got the following Exception ${e.getLocalizedMessage} in id $id"
        )
        None
    }
    (id, row, result)
  }
}

object HoltWintersBestModelFinder extends DefaultParamsReadable[HoltWintersBestModelFinder[_]] {

  override def load(path: String): HoltWintersBestModelFinder[_] =
    super.load(path)
} 
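A configuration sketch using only the setters defined above; the column names are illustrative, and a TimeSeriesEvaluator[String] would still need to be supplied through setTimeSeriesEvaluator before fitting:

val finder = new HoltWintersBestModelFinder[String]()
  .setGroupByCol("storeId")
  .setLabelCol("sales")
  .setNFutures(26)
  .setValidationCol("validation")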
Example 95
Source File: XGBoostBigModelTimeSeries.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import java.sql.Timestamp

import eleflow.uberdata.IUberdataForecastUtil
import eleflow.uberdata.core.data.DataTransformer
import eleflow.uberdata.enums.SupportedAlgorithm
import ml.dmlc.xgboost4j.scala.spark.XGBoostModel
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.linalg.{VectorUDT, Vector => SparkVector}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared.HasTimeCol
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, _}


class XGBoostBigModelTimeSeries[I](override val uid: String,
                                   override val models: Seq[(ParamMap, XGBoostModel)])
                                  extends XGBoostBigModel[I](uid, models) with HasTimeCol{

  def setTimecol(time: String): this.type = set(timeCol, Some(time))

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val prediction = predict(dataSet)
    val rows = dataSet.rdd
      .map {
        case (row: Row) =>
          (DataTransformer.toFloat(row.getAs($(idCol))),
            (row.getAs[SparkVector](IUberdataForecastUtil.FEATURES_COL_NAME),
              row.getAs[java.sql.Timestamp]($(timeCol).get)))
      }
      .join(prediction)
      .map {
        case (id, ((features, time), predictValue)) =>
          Row(id, features, time, SupportedAlgorithm.XGBoostAlgorithm.toString, predictValue)
      }
    dataSet.sqlContext.createDataFrame(rows, transformSchema(dataSet.schema))
  }


  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(Array(
      StructField($(idCol), FloatType),
      StructField(IUberdataForecastUtil.FEATURES_COL_NAME, new VectorUDT),
      StructField($(timeCol).get, TimestampType),
      StructField(IUberdataForecastUtil.ALGORITHM, StringType),
      StructField("prediction", FloatType)
    ))
} 
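Reading the transform's output; a sketch assuming a fitted `model` and an input dataset `ds` carrying the expected id, features and time columns:

val out = model.transform(ds)
out.printSchema()
// idCol (float), features (vector), timeCol (timestamp), algorithm (string), prediction (float)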
Example 96
Source File: HoltWintersEstimator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.TimeSeriesModel

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.Dataset


class HoltWintersBestModel[T, M <: TimeSeriesModel](
  override val uid: String,
  val bestPrediction: RDD[(T, M)],
  val validationMetrics: RDD[(T, ModelParamEvaluation[T])]
) extends Model[HoltWintersBestModel[T, M]]
    with TimeSeriesBestModelFinderParam[T] {

  //TODO look for this method usage to see if it can be removed
  override def transform(dataset: Dataset[_]): DataFrame = {
    transformSchema(dataset.schema, logging = true)
    dataset.toDF()
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): HoltWintersBestModel[T, M] = {
    val copied =
      new HoltWintersBestModel[T, M](uid, bestPrediction, validationMetrics)
    copyValues(copied, extra)
  }
} 
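A minimal sketch of the copy contract above, assuming a fitted `bestModel`; copy wraps the same RDD-backed state in a fresh instance while copyValues merges the params:

import org.apache.spark.ml.param.ParamMap

val cloned = bestModel.copy(ParamMap.empty)
assert(cloned.bestPrediction eq bestModel.bestPrediction)   // same underlying RDDs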
Example 97
Source File: AllColumnsTimeSeriesGenerator.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{DefaultParamsReadable, Identifiable}

import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset

import scala.reflect.ClassTag
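// NOTE: the enclosing AllColumnsTimeSeriesGenerator[T, U] class declaration
// is omitted from this excerpt.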


  def setOutputCol(value: String): this.type = set(outputCol, value)

//  override def transform(dataSet: DataFrame): DataFrame = {
  override def transform(dataSet: Dataset[_] ): DataFrame = {
    val rdd = dataSet.rdd
    val sparkContext = dataSet.sqlContext.sparkContext
    val labelColIndex =
      sparkContext.broadcast(dataSet.schema.fieldIndex($(labelCol)))
    val keyValueDataSet = rdd.map { case (row: Row) =>
      Row(
        row.getAs[T](labelColIndex.value),
        row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
      )
    }
    val trainSchema = transformSchema(dataSet.schema)

    dataSet.sqlContext.createDataFrame(keyValueDataSet, trainSchema)
  }

  override def transformSchema(schema: StructType): StructType = {
    StructType(
      schema.filter(_.name == $(labelCol)).head +: Seq(
        StructField($(outputCol), new org.apache.spark.ml.linalg.VectorUDT)
      )
    )
  }

  override def copy(extra: ParamMap): AllColumnsTimeSeriesGenerator[T, U] =
    defaultCopy(extra)
}

object AllColumnsTimeSeriesGenerator
    extends DefaultParamsReadable[AllColumnsTimeSeriesGenerator[_, _]] {

  override def load(path: String): AllColumnsTimeSeriesGenerator[_, _] =
    super.load(path)
} 
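A usage sketch; the type arguments, the input DataFrame and the setLabelCol setter (assumed from the shared label-column param) are all assumptions:

val generator = new AllColumnsTimeSeriesGenerator[Double, Double]()
  .setLabelCol("store")
  .setOutputCol("series")
// keeps only the label column and appends a vector column named by outputCol
val tsDF = generator.transform(inputDF)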
Example 98
Source File: HoltWintersBestModelEvaluation.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import com.cloudera.sparkts.models.UberHoltWintersModel
import eleflow.uberdata.enums.SupportedAlgorithm
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.evaluation.TimeSeriesEvaluator
import org.apache.spark.ml.param.{ParamMap, ParamPair}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row

import scala.reflect.ClassTag


abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]](
  implicit kt: ClassTag[L],
  ord: Ordering[L] = null
) extends BestModelFinder[L, M]
    with HoltWintersParams {

  protected def holtWintersEvaluation(
    row: Row,
    model: UberHoltWintersModel,
    broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]],
    id: L
  ): (UberHoltWintersModel, ModelParamEvaluation[L]) = {
    val features =
      row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol))
    log.warn(
      s"Evaluating forecast for id $id, with parameters " +
        s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}"
    )
    val expectedResult =
      row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol)
    val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures)))
    model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray
    val toBeValidated =
      expectedResult.toArray.zip(forecastToBeValidated.toArray)
    val metric = broadcastEvaluator.value.evaluate(toBeValidated)
    val metricName = broadcastEvaluator.value.getMetricName
    val params = ParamMap().put(
      ParamPair(gamma, model.gamma),
      ParamPair(beta, model.beta),
      ParamPair(alpha, model.alpha)
    )
    (model,
     new ModelParamEvaluation[L](
       id,
       metric,
       params,
       Some(metricName),
       SupportedAlgorithm.HoltWinters
     ))
  }
} 
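The ParamMap-building pattern above works standalone; a minimal sketch with hypothetical smoothing params:

import org.apache.spark.ml.param.{DoubleParam, ParamMap, ParamPair}
import org.apache.spark.ml.util.Identifiable

val parent = Identifiable.randomUID("hw")
val alpha = new DoubleParam(parent, "alpha", "level smoothing factor")
val beta  = new DoubleParam(parent, "beta", "trend smoothing factor")
val params = ParamMap().put(ParamPair(alpha, 0.3), ParamPair(beta, 0.1))
assert(params.get(alpha) == Some(0.3))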
Example 99
Source File: VectorizeEncoder.scala    From uberdata   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.ml

import eleflow.uberdata.core.data.DataTransformer
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable}
import org.apache.spark.ml.linalg.VectorUDT
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.{StructField, StructType}


class VectorizeEncoder(override val uid: String)
    extends Transformer
    with HasIdCol
    with HasTimeCol
    with HasInputCols
    with HasLabelCol
    with HasGroupByCol
    with HasOutputCol
    with DefaultParamsWritable {

  def this() = this(Identifiable.randomUID("vectorizer"))

  def setIdCol(input: String) = set(idCol, input)

  def setLabelCol(input: String) = set(labelCol, input)

  def setGroupByCol(toGroupBy: String) = set(groupByCol, Some(toGroupBy))

  def setInputCol(input: Array[String]) = set(inputCols, input)

  def setTimeCol(time: String) = set(timeCol, Some(time))

  def setOutputCol(output: String) = set(outputCol, output)

  override def transform(dataSet: Dataset[_]): DataFrame = {
    val context = dataSet.sqlContext.sparkContext
    val input = context.broadcast($(inputCols))
    val allColumnNames = dataSet.schema.map(_.name)

    val nonInputColumnIndexes = context.broadcast(
      allColumnNames.zipWithIndex.filter(
        f => !$(inputCols).contains(f._1) || f._1 == $(groupByCol).get || f._1 == $(idCol)
          || f._1 == $(timeCol).getOrElse("")))
    val result = dataSet.rdd.map { case (row: Row) =>
      val rowSeq = row.toSeq
      val nonInputColumns = nonInputColumnIndexes.value.map {
        case (_, index) => rowSeq(index)
      }
      val size = input.value.length
      val (values, indices) = input.value
        .filter(col => row.getAs(col) != null)
        .map { column =>
          DataTransformer.toDouble(row.getAs(column))
        }
        .zipWithIndex
        .filter(f => f._1 != 0d)
        .unzip
      Row(
        nonInputColumns :+ org.apache.spark.ml.linalg.Vectors
          .sparse(size, indices.toArray, values.toArray): _*
      )
    }
    val newSchema = transformSchema(dataSet.schema)
    dataSet.sqlContext.createDataFrame(result, newSchema)
  }

  override def copy(extra: ParamMap): Transformer = defaultCopy(extra)

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType =
    StructType(
      schema.filter(
        col =>
          !$(inputCols).contains(col.name) || col.name == $(groupByCol).getOrElse("") || col.name == $(idCol)
            || col.name == $(labelCol) || col.name == $(timeCol).getOrElse("")
      )
    ).add(StructField($(outputCol), new VectorUDT))
}
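The sparse-assembly step in transform can be shown standalone; a sketch with illustrative values, where zeros are dropped and the surviving (value, index) pairs feed Vectors.sparse:

import org.apache.spark.ml.linalg.Vectors

val raw = Seq(1.0, 0.0, 3.5)
val (values, indices) = raw.zipWithIndex.filter(_._1 != 0d).unzip
val sparse = Vectors.sparse(raw.length, indices.toArray, values.toArray)   // (3, [0, 2], [1.0, 3.5])

Note that in the transform above, zipWithIndex runs after the null filter, so the resulting indices refer to positions among the surviving input columns rather than the original column order.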