org.apache.spark.ml.regression.LinearRegression Scala Examples
The following examples show how to use org.apache.spark.ml.regression.LinearRegression.
Each example is taken from an open-source project; the source file name, originating project, and license are listed above it.
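Most of the examples below follow the same basic pattern: configure a LinearRegression estimator, fit it on a DataFrame with "label" and "features" columns, and then inspect the fitted model and its training summary. The following minimal sketch is distilled from Example 1; the object name LinearRegressionQuickStart is illustrative, and the libsvm path assumes the sample data shipped with Spark.

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession

object LinearRegressionQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("LinearRegressionQuickStart").getOrCreate()

    // A DataFrame with "label" and "features" columns, as expected by Spark ML regressors
    val training = spark.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    // Elastic-net regularized linear regression (elasticNetParam = 0.8 mixes L1 and L2 penalties)
    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model and inspect the fitted coefficients and training metrics
    val lrModel = lr.fit(training)
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
    println(s"RMSE: ${lrModel.summary.rootMeanSquaredError}  r2: ${lrModel.summary.r2}")

    spark.stop()
  }
}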
Example 1
Source File: LinearRegressionWithElasticNetExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.sql.SparkSession

object LinearRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("LinearRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: OpLinearRegressionTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.regression

import com.salesforce.op.features.types._
import com.salesforce.op.stages.base.binary.{BinaryEstimator, BinaryModel}
import com.salesforce.op.stages.impl.PredictionEquality
import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpPredictorWrapperModel}
import com.salesforce.op.test._
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class OpLinearRegressionTest extends OpEstimatorSpec[Prediction,
  OpPredictorWrapperModel[LinearRegressionModel],
  OpPredictorWrapper[LinearRegression, LinearRegressionModel]] with PredictionEquality {

  override def specName: String = Spec[OpLinearRegression]

  val (inputData, rawLabel, features) = TestFeatureBuilder(
    Seq[(RealNN, OPVector)](
      (10.0.toRealNN, Vectors.dense(1.0, 4.3, 1.3).toOPVector),
      (20.0.toRealNN, Vectors.dense(2.0, 0.3, 0.1).toOPVector),
      (30.0.toRealNN, Vectors.dense(3.0, 3.9, 4.3).toOPVector),
      (40.0.toRealNN, Vectors.dense(4.0, 1.3, 0.9).toOPVector),
      (50.0.toRealNN, Vectors.dense(5.0, 4.7, 1.3).toOPVector)
    )
  )
  val label = rawLabel.copy(isResponse = true)
  val estimator = new OpLinearRegression().setInput(label, features)

  val expectedResult = Seq(
    Prediction(10.0),
    Prediction(20.0),
    Prediction(30.0),
    Prediction(40.0),
    Prediction(50.0)
  )

  it should "allow the user to set the desired spark parameters" in {
    estimator
      .setMaxIter(10)
      .setRegParam(0.1)
      .setFitIntercept(true)
      .setElasticNetParam(0.1)
      .setSolver("normal")
    estimator.fit(inputData)

    estimator.predictor.getMaxIter shouldBe 10
    estimator.predictor.getRegParam shouldBe 0.1
    estimator.predictor.getFitIntercept shouldBe true
    estimator.predictor.getElasticNetParam shouldBe 0.1
    estimator.predictor.getSolver shouldBe "normal"
  }
}
Example 3
Source File: LinearRegressionSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.regression

import com.ibm.aardpfark.pfa.PredictorResult
import org.apache.spark.ml.regression.LinearRegression

class LinearRegressionSuite extends SparkRegressorPFASuiteBase[PredictorResult] {

  val dataset = spark.read.format("libsvm").load(inputPath)
  val lr = new LinearRegression()

  override val sparkTransformer = lr.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, lr.getFeaturesCol).toJSON.collect()
  override val expectedOutput = result.select(lr.getPredictionCol).toJSON.collect()

  // Additional tests
  test("LinearRegression w/o fitIntercept") {
    val sparkTransformer = lr.setFitIntercept(false).fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = result.select(lr.getPredictionCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }
}
Example 4
Source File: RegressionEvaluatorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }
}
Example 5
Source File: SparkRWrappers.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {

  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double,
      standardize: Boolean,
      solver: String): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
        .setSolver(solver)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
        .setStandardization(standardize)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelCoefficients(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val coefficientStandardErrorsR = Array(m.summary.coefficientStandardErrors.last) ++
          m.summary.coefficientStandardErrors.dropRight(1)
        val tValuesR = Array(m.summary.tValues.last) ++ m.summary.tValues.dropRight(1)
        val pValuesR = Array(m.summary.pValues.last) ++ m.summary.pValues.dropRight(1)
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray ++ coefficientStandardErrorsR ++
            tValuesR ++ pValuesR
        } else {
          m.coefficients.toArray ++ coefficientStandardErrorsR ++ tValuesR ++ pValuesR
        }
      case m: LogisticRegressionModel =>
        if (m.getFitIntercept) {
          Array(m.intercept) ++ m.coefficients.toArray
        } else {
          m.coefficients.toArray
        }
    }
  }

  def getModelDevianceResiduals(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        m.summary.devianceResiduals
      case m: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No deviance residuals available for LogisticRegressionModel")
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
      case m: LogisticRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        if (m.getFitIntercept) {
          Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
        } else {
          attrs.attributes.get.map(_.name.get)
        }
    }
  }

  def getModelName(model: PipelineModel): String = {
    model.stages.last match {
      case m: LinearRegressionModel => "LinearRegressionModel"
      case m: LogisticRegressionModel => "LogisticRegressionModel"
    }
  }
}
Example 6
Source File: TrainValidationSplitExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml

import org.apache.spark.ml.evaluation.RegressionEvaluator
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object TrainValidationSplitExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("TrainValidationSplitExample")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Prepare training and test data.
    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
    val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)

    val lr = new LinearRegression()

    // We use a ParamGridBuilder to construct a grid of parameters to search over.
    // TrainValidationSplit will try all combinations of values and determine best model using
    // the evaluator.
    val paramGrid = new ParamGridBuilder()
      .addGrid(lr.regParam, Array(0.1, 0.01))
      .addGrid(lr.fitIntercept, Array(true, false))
      .addGrid(lr.elasticNetParam, Array(0.0, 0.5, 1.0))
      .build()

    // In this case the estimator is simply the linear regression.
    // A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
    val trainValidationSplit = new TrainValidationSplit()
      .setEstimator(lr)
      .setEvaluator(new RegressionEvaluator)
      .setEstimatorParamMaps(paramGrid)

    // 80% of the data will be used for training and the remaining 20% for validation.
    trainValidationSplit.setTrainRatio(0.8)

    // Run train validation split, and choose the best set of parameters.
    val model = trainValidationSplit.fit(training)

    // Make predictions on test data. model is the model with combination of parameters
    // that performed best.
    model.transform(test)
      .select("features", "label", "prediction")
      .show()

    sc.stop()
  }
}
Example 7
Source File: LinearRegressionWithElasticNetExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object LinearRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithElasticNetExample")
    val sc = new SparkContext(conf)
    val sqlCtx = new SQLContext(sc)

    // $example on$
    // Load training data
    val training = sqlCtx.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: RegressionEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
}
Example 9
Source File: LinearRegressionWithElasticNetExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.sql.SparkSession

object LinearRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("LinearRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 10
Source File: RegressionEvaluatorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    // fit() is the algorithm that turns a DataFrame into a Transformer (the fitted model)
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    // transform() maps one DataFrame into another, appending the prediction column
    val predictions = model.transform(dataset)
    predictions.collect()

    // default = rmse; root mean squared error measures the dispersion of the residuals
    val evaluator = new RegressionEvaluator()
    println("==MetricName=" + evaluator.getMetricName +
      "=LabelCol=" + evaluator.getLabelCol +
      "=PredictionCol=" + evaluator.getPredictionCol)
    // prints: ==MetricName=rmse=LabelCol=label=PredictionCol=prediction
    assert(evaluator.evaluate(predictions) ~== 0.1019382 absTol 0.001)

    // r2 score: the coefficient of determination, which measures how well the model fits the data
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // MAE: the mean of the absolute deviations of the observations from their predicted values
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08036075 absTol 0.001)
  }
}
Example 11
Source File: SparkRWrappers.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.api.r

import org.apache.spark.ml.attribute._
import org.apache.spark.ml.feature.RFormula
import org.apache.spark.ml.classification.{LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.sql.DataFrame

private[r] object SparkRWrappers {

  def fitRModelFormula(
      value: String,
      df: DataFrame,
      family: String,
      lambda: Double,
      alpha: Double): PipelineModel = {
    val formula = new RFormula().setFormula(value)
    val estimator = family match {
      case "gaussian" => new LinearRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
      case "binomial" => new LogisticRegression()
        .setRegParam(lambda)
        .setElasticNetParam(alpha)
        .setFitIntercept(formula.hasIntercept)
    }
    val pipeline = new Pipeline().setStages(Array(formula, estimator))
    pipeline.fit(df)
  }

  def getModelWeights(model: PipelineModel): Array[Double] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        Array(m.intercept) ++ m.weights.toArray
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No weights available for LogisticRegressionModel")  // SPARK-9492
    }
  }

  def getModelFeatures(model: PipelineModel): Array[String] = {
    model.stages.last match {
      case m: LinearRegressionModel =>
        val attrs = AttributeGroup.fromStructField(
          m.summary.predictions.schema(m.summary.featuresCol))
        Array("(Intercept)") ++ attrs.attributes.get.map(_.name.get)
      case _: LogisticRegressionModel =>
        throw new UnsupportedOperationException(
          "No features names available for LogisticRegressionModel")  // SPARK-9492
    }
  }
}
Example 12
Source File: LinearRegressionWithElasticNetExample.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.{SQLContext, DataFrame}

// Note: the body of this example (creating the SparkContext, loading the training data,
// fitting the LinearRegression model and obtaining trainingSummary) is not included in
// this excerpt; only the summary output below survives.

    trainingSummary.residuals.show()
    // RMSE (root mean squared error) measures the dispersion of the residuals
    // RMSE: 10.189126225286143
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    // r2, the coefficient of determination, measures how well the model fits the data
    // r2: 0.02285205756871944
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 13
Source File: OpPredictorWrapperTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific

import com.salesforce.op.features.types._
import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams
import com.salesforce.op.test.{PrestigeData, TestFeatureBuilder, TestSparkContext}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner
import org.slf4j.LoggerFactory

@RunWith(classOf[JUnitRunner])
class OpPredictorWrapperTest extends FlatSpec with TestSparkContext with PrestigeData {

  val log = LoggerFactory.getLogger(this.getClass)

  val (ds, targetLabel, featureVector) = TestFeatureBuilder[RealNN, OPVector](
    prestigeSeq.map(p =>
      p.prestige.toRealNN -> Vectors.dense(p.education, p.income, p.women).toOPVector
    )
  )

  Spec[OpPredictorWrapper[_, _]] should
    "be able to run a simple linear regression model (fitIntercept=true)" in {
    val lrModel: LinearRegressionModel = fitLinRegModel(fitIntercept = true)
    lrModel.intercept.abs should be > 1E-6
  }

  it should "be able to run a simple linear regression model (fitIntercept=false)" in {
    val lrModel: LinearRegressionModel = fitLinRegModel(fitIntercept = false)
    lrModel.intercept.abs should be < Double.MinPositiveValue
  }

  private def fitLinRegModel(fitIntercept: Boolean): LinearRegressionModel = {
    val lrBase = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)
      .setFitIntercept(fitIntercept)

    val lr = new OpPredictorWrapper[LinearRegression, LinearRegressionModel](lrBase)
      .setInput(targetLabel, featureVector)

    // Fit the model
    val model = lr.fit(ds).asInstanceOf[SparkWrapperParams[LinearRegressionModel]]
    val lrModel = model.getSparkMlStage().get

    // Print the coefficients and intercept for linear regression
    log.info(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    log.info(s"numIterations: ${trainingSummary.totalIterations}")
    log.info(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    if (log.isInfoEnabled) trainingSummary.residuals.show()
    log.info(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    log.info(s"r2: ${trainingSummary.r2}")

    // checking r2 as a cheap way to make sure things are running as intended.
    assert(trainingSummary.r2 > 0.9)

    if (log.isInfoEnabled) {
      val output = lrModel.transform(ds)
      output.show(false)
    }

    lrModel
  }
}
Example 14
Source File: TypedLinearRegression.scala From frameless with Apache License 2.0 | 5 votes |
package frameless
package ml
package regression

import frameless.ml.internals.LinearInputsChecker
import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.{AppendTransformer, TypedEstimator}
import org.apache.spark.ml.regression.{LinearRegression, LinearRegressionModel}

final class TypedLinearRegression[Inputs] private[ml] (
  lr: LinearRegression,
  labelCol: String,
  featuresCol: String,
  weightCol: Option[String]
) extends TypedEstimator[Inputs, TypedLinearRegression.Outputs, LinearRegressionModel] {

  val estimatorWithoutWeight: LinearRegression = lr
    .setLabelCol(labelCol)
    .setFeaturesCol(featuresCol)
    .setPredictionCol(AppendTransformer.tempColumnName)

  val estimator =
    if (weightCol.isDefined) estimatorWithoutWeight.setWeightCol(weightCol.get)
    else estimatorWithoutWeight

  def setRegParam(value: Double): TypedLinearRegression[Inputs] = copy(lr.setRegParam(value))

  def setFitIntercept(value: Boolean): TypedLinearRegression[Inputs] = copy(lr.setFitIntercept(value))

  def setStandardization(value: Boolean): TypedLinearRegression[Inputs] = copy(lr.setStandardization(value))

  def setElasticNetParam(value: Double): TypedLinearRegression[Inputs] = copy(lr.setElasticNetParam(value))

  def setMaxIter(value: Int): TypedLinearRegression[Inputs] = copy(lr.setMaxIter(value))

  def setTol(value: Double): TypedLinearRegression[Inputs] = copy(lr.setTol(value))

  def setSolver(value: Solver): TypedLinearRegression[Inputs] = copy(lr.setSolver(value.sparkValue))

  def setAggregationDepth(value: Int): TypedLinearRegression[Inputs] = copy(lr.setAggregationDepth(value))

  def setLoss(value: LossStrategy): TypedLinearRegression[Inputs] = copy(lr.setLoss(value.sparkValue))

  def setEpsilon(value: Double): TypedLinearRegression[Inputs] = copy(lr.setEpsilon(value))

  private def copy(newLr: LinearRegression): TypedLinearRegression[Inputs] =
    new TypedLinearRegression[Inputs](newLr, labelCol, featuresCol, weightCol)
}

object TypedLinearRegression {

  case class Outputs(prediction: Double)
  case class Weight(weight: Double)

  def apply[Inputs](implicit inputsChecker: LinearInputsChecker[Inputs]): TypedLinearRegression[Inputs] = {
    new TypedLinearRegression(new LinearRegression(), inputsChecker.labelCol,
      inputsChecker.featuresCol, inputsChecker.weightCol)
  }
}
Example 15
Source File: RegressionEvaluatorSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext {

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse (negated in this Spark version, since evaluators treat larger values as better)
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== -0.1019382 absTol 0.001)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998196 absTol 0.001)

    // mae (also negated)
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== -0.08036075 absTol 0.001)
  }
}
Example 16
Source File: RegressionEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
}
Example 17
Source File: LinearRegressionWithElasticNetExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.sql.SparkSession

object LinearRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("LinearRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 18
Source File: RegressionEvaluatorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
}
Example 19
Source File: LinearRegressionWithElasticNetExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.regression.LinearRegression
// $example off$
import org.apache.spark.sql.SparkSession

object LinearRegressionWithElasticNetExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("LinearRegressionWithElasticNetExample")
      .getOrCreate()

    // $example on$
    // Load training data
    val training = spark.read.format("libsvm")
      .load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 20
Source File: LinearRegressionPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.regression.bikesharing

import org.apache.log4j.Logger
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.feature.{VectorAssembler, VectorIndexer}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.mllib.evaluation.RegressionMetrics
import org.apache.spark.sql.{DataFrame, SparkSession}

object LinearRegressionPipeline {

  @transient lazy val logger = Logger.getLogger(getClass.getName)

  def linearRegressionWithVectorFormat(vectorAssembler: VectorAssembler,
                                       vectorIndexer: VectorIndexer,
                                       dataFrame: DataFrame) = {
    val lr = new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setRegParam(0.1)
      .setElasticNetParam(1.0)
      .setMaxIter(10)

    val pipeline = new Pipeline().setStages(Array(vectorAssembler, vectorIndexer, lr))

    val Array(training, test) = dataFrame.randomSplit(Array(0.8, 0.2), seed = 12345)

    val model = pipeline.fit(training)
    val fullPredictions = model.transform(test).cache()
    val predictions = fullPredictions.select("prediction").rdd.map(_.getDouble(0))
    val labels = fullPredictions.select("label").rdd.map(_.getDouble(0))
    val RMSE = new RegressionMetrics(predictions.zip(labels)).rootMeanSquaredError
    println(s" Root mean squared error (RMSE): $RMSE")
  }

  def linearRegressionWithSVMFormat(spark: SparkSession) = {
    // Load training data
    val training = spark.read.format("libsvm")
      .load("./src/main/scala/org/sparksamples/regression/dataset/BikeSharing/lsvmHours.txt")

    val lr = new LinearRegression()
      .setMaxIter(10)
      .setRegParam(0.3)
      .setElasticNetParam(0.8)

    // Fit the model
    val lrModel = lr.fit(training)

    // Print the coefficients and intercept for linear regression
    println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")

    // Summarize the model over the training set and print out some metrics
    val trainingSummary = lrModel.summary
    println(s"numIterations: ${trainingSummary.totalIterations}")
    println(s"objectiveHistory: ${trainingSummary.objectiveHistory.toList}")
    trainingSummary.residuals.show()
    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
    println(s"r2: ${trainingSummary.r2}")
  }
}
Example 21
Source File: LinearRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.regression

import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.feature.{OneHotEncoderEstimator, StringIndexer, VectorAssembler}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.DataFrame

class LinearRegressionParitySpec extends SparkParityBase {

  override val dataset: DataFrame =
    baseDataset.select("fico_score_group_fnl", "dti", "loan_amount")

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new StringIndexer()
      .setInputCol("fico_score_group_fnl")
      .setOutputCol("fico_index"),
    new OneHotEncoderEstimator()
      .setInputCols(Array("fico_index"))
      .setOutputCols(Array("fico")),
    new VectorAssembler()
      .setInputCols(Array("fico", "dti"))
      .setOutputCol("features"),
    new LinearRegression()
      .setFeaturesCol("features")
      .setLabelCol("loan_amount")
      .setPredictionCol("prediction"))).fit(dataset)

  override val unserializedParams = Set("stringOrderType", "elasticNetParam", "maxIter",
    "tol", "epsilon", "labelCol", "loss", "regParam", "solver")
}
Example 22
Source File: RegressionEvaluatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.util.{LinearDataGenerator, MLlibTestSparkContext}
import org.apache.spark.mllib.util.TestingUtils._

class RegressionEvaluatorSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("params") {
    ParamsSuite.checkParams(new RegressionEvaluator)
  }

  test("Regression Evaluator: default params") {
    val trainer = new LinearRegression
    val model = trainer.fit(dataset)
    val predictions = model.transform(dataset)

    // default = rmse
    val evaluator = new RegressionEvaluator()
    assert(evaluator.evaluate(predictions) ~== 0.1013829 absTol 0.01)

    // r2 score
    evaluator.setMetricName("r2")
    assert(evaluator.evaluate(predictions) ~== 0.9998387 absTol 0.01)

    // mae
    evaluator.setMetricName("mae")
    assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01)
  }

  test("read/write") {
    val evaluator = new RegressionEvaluator()
      .setPredictionCol("myPrediction")
      .setLabelCol("myLabel")
      .setMetricName("r2")
    testDefaultReadWrite(evaluator)
  }

  test("should support all NumericType labels and not support other types") {
    MLTestingUtils.checkNumericTypes(new RegressionEvaluator, spark)
  }
}