org.apache.spark.mllib.regression.LinearRegressionWithSGD Scala Examples
The following examples show how to use org.apache.spark.mllib.regression.LinearRegressionWithSGD.
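All of the examples follow the same basic pattern: parse input into an RDD[LabeledPoint], call LinearRegressionWithSGD.train to fit a model, then call predict on the result. Note that LinearRegressionWithSGD has been deprecated since Spark 2.0 in favor of ml.regression.LinearRegression. A minimal, self-contained sketch of the pattern, with toy data standing in for a real dataset:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

object MinimalLinearRegressionWithSGD {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("MinimalLRWithSGD").setMaster("local[*]"))

    // Toy data; every example below parses a file into LabeledPoint instead.
    val points = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(2.0, Vectors.dense(2.0, 0.0)),
      LabeledPoint(3.0, Vectors.dense(3.0, 0.0)))).cache()

    // train(input, numIterations, stepSize) optimizes squared error with SGD.
    val model = LinearRegressionWithSGD.train(points, 100, 0.1)
    println(model.predict(Vectors.dense(1.5, 0.0)))
    sc.stop()
  }
}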
Example 1
Source File: SparkIntroduction.scala From reactive-machine-learning-systems with MIT License
package com.reactivemachinelearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.evaluation.MulticlassMetrics
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors

object SparkIntroduction {

  def main(args: Array[String]) {
    // handle args
    // setup
    val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate()
    import session.implicits._

    // Load and parse the train and test data
    val inputBasePath = "example_data"
    val outputBasePath = "."
    val trainingDataPath = inputBasePath + "/training.txt"
    val testingDataPath = inputBasePath + "/testing.txt"
    val currentOutputPath = outputBasePath + System.currentTimeMillis()

    val trainingData = session.read.textFile(trainingDataPath)
    val trainingParsed = trainingData.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val testingData = session.read.textFile(testingDataPath)
    val testingParsed = testingData.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations)

    // Evaluate model on testing examples
    val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }

    // Report performance statistics
    val metrics = new MulticlassMetrics(predictionsAndLabels.rdd)
    val precision = metrics.precision
    val recall = metrics.recall
    println(s"Precision: $precision Recall: $recall")

    // Save model
    model.save(session.sparkContext, currentOutputPath)
  }
}
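One caveat on the evaluation step in this example: MulticlassMetrics treats every distinct prediction as a class label, which is only meaningful for discrete outputs, while a linear regression model emits continuous values. For regression, RegressionMetrics is the more appropriate evaluator. A short sketch, assuming the predictionsAndLabels pairs built above:

import org.apache.spark.mllib.evaluation.RegressionMetrics

// RegressionMetrics expects (prediction, observation) pairs.
val regressionMetrics = new RegressionMetrics(predictionsAndLabels.rdd)
println(s"MSE:  ${regressionMetrics.meanSquaredError}")
println(s"RMSE: ${regressionMetrics.rootMeanSquaredError}")
println(s"R^2:  ${regressionMetrics.r2}")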
Example 2
Source File: LinearRegExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License
package handson.example

import org.apache.spark.sql.SparkSession

object LinearRegExample {
  val homeDir = System.getProperty("user.home")

  def main(args: Array[String]): Unit = {
    // 1. Set Spark session
    val spark = SparkSession.builder().master("local").getOrCreate()

    // 2. Set logging level to WARNING
    spark.sparkContext.setLogLevel("WARN")

    // 3. Import necessary classes from Spark MLlib package that are needed for linear regression
    import org.apache.spark.mllib.linalg.Vectors
    import org.apache.spark.mllib.regression.LabeledPoint
    import org.apache.spark.mllib.regression.LinearRegressionModel
    import org.apache.spark.mllib.regression.LinearRegressionWithSGD

    // 4. Load the data
    val data = spark.sparkContext.textFile(s"${homeDir}/lpsa.data")

    // 5. Parse the data into LabeledPoint and cache
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // 6. Build the model by setting number of iterations, step size
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // 7. Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println(s"training Mean Squared Error $MSE")

    // 8. Save the model
    model.save(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel")

    // 9. Load the saved model
    val sameModel = LinearRegressionModel.load(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel")

    // 10. Output the model
    println(sameModel)
  }
}
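The very small step size (1e-8) in step 6 compensates for unscaled features: with larger steps, SGD diverges on this data. An alternative is to standardize the features first, after which a conventional step size is usually stable. A sketch, assuming the parsedData RDD from step 5:

import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}

// Fit a scaler (zero mean, unit variance) on the raw feature vectors.
val scaler = new StandardScaler(withMean = true, withStd = true)
  .fit(parsedData.map(_.features))

// Rebuild the labeled points with standardized features.
val scaledData = parsedData
  .map(p => LabeledPoint(p.label, scaler.transform(p.features)))
  .cache()

// With standardized features, a step size near 1.0 typically behaves well.
val scaledModel = LinearRegressionWithSGD.train(scaledData, 100, 1.0)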
Example 3
Source File: PCAExample.scala From Swallow with Apache License 2.0
// scalastyle:off println
package com.intel.hibench.sparkbench.ml

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.apache.spark.rdd.RDD

object PCAExample {

  def main(args: Array[String]): Unit = {
    var inputPath = ""
    var maxResultSize = "1g"

    if (args.length == 2) {
      inputPath = args(0)
      maxResultSize = args(1)
    }

    val conf = new SparkConf()
      .setAppName("PCAExample")
      .set("spark.driver.maxResultSize", maxResultSize)
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val data: RDD[LabeledPoint] = sc.objectFile(inputPath)

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    sc.stop()
  }
}
// scalastyle:on println
Example 4
Source File: LinearRegression.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.rdd.RDD
import scopt.OptionParser

object LinearRegression {

  case class Params(
      dataPath: String = null,
      numIterations: Int = 100,
      stepSize: Double = 0.00000001)

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("Linear") {
      head("Linear Regression: an example of linear regression with SGD optimizer")
      opt[Int]("numIterations")
        .text(s"numIterations, default: ${defaultParams.numIterations}")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"stepSize, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      arg[String]("<dataPath>")
        .required()
        .text("Input path for data")
        .action((x, c) => c.copy(dataPath = x))
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegressionWithSGD with $params")
      .set("spark.shuffle.compress", "false")
      .set("spark.io.compression.codec", "org.apache.spark.io.LZFCompressionCodec")
      .set("spark.smartCompress", "false")
    val sc = new SparkContext(conf)

    val dataPath = params.dataPath
    val numIterations = params.numIterations
    val stepSize = params.stepSize

    // Load training data in LabeledPoint format.
    val data: RDD[LabeledPoint] = sc.objectFile(dataPath)

    // Building the model
    val model = LinearRegressionWithSGD.train(data, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = data.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("Training Mean Squared Error = " + MSE)

    sc.stop()
  }
}
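For illustration, here is how the scopt parser above maps a hypothetical command line onto Params; the paths and values are made up:

// Arguments as spark-submit would pass them after the application jar.
val demoArgs = Array("--numIterations", "200", "--stepSize", "0.0001", "hdfs:///tmp/lr-data")

parser.parse(demoArgs, defaultParams) match {
  case Some(p) => run(p) // Params(hdfs:///tmp/lr-data, 200, 1.0E-4)
  case None    => sys.exit(1)
}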
Example 5
Source File: PCA_LinearRegression_Demo.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License
package com.chapter11.SparkMachineLearning

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.regression.LinearRegressionWithSGD

object PCAExample2 {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName(s"OneVsRestExample")
      .getOrCreate()

    val data = MLUtils.loadLibSVMFile(spark.sparkContext, "data/mnist.bz2")

    val df = spark.read.format("libsvm").load("C:/Exp/mnist.bz2")
    df.show(20)

    val featureSize = data.first().features.size
    println("Feature Size: " + featureSize)

    val splits = data.randomSplit(Array(0.75, 0.25), seed = 12345L)
    val (training, test) = (splits(0), splits(1))

    val pca = new PCA(featureSize / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 20
    val stepSize = 0.0001
    val model = LinearRegressionWithSGD.train(training, numIterations, stepSize)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations, stepSize)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow(v - p, 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow(v - p, 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)

    println("Model coefficients:" + model.toString())
    println("Model with PCA coefficients:" + model_pca.toString())

    spark.stop()
  }
}
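Halving the feature dimension (featureSize / 2) is an arbitrary choice. Under Spark 2.x, where mllib's PCAModel exposes explainedVariance, the retained variance can guide the choice of k; a sketch, assuming the data RDD and featureSize value above (the explainedVariance field is an assumption worth checking against your Spark version):

// Fit PCA once, then inspect how much variance each added component retains.
val pcaModel = new PCA(featureSize / 2).fit(data.map(_.features))
val cumulative = pcaModel.explainedVariance.toArray.scanLeft(0.0)(_ + _).tail
cumulative.zipWithIndex.take(10).foreach { case (v, i) =>
  println(f"components: ${i + 1}%3d  cumulative explained variance: $v%.4f")
}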
Example 6
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0
package org.hogzilla.hbase

import scala.math.random
import java.lang.Math
import org.apache.spark._
import org.apache.hadoop.hbase.client.HBaseAdmin
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionModel, LinearRegressionWithSGD}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.hadoop.hbase.client.HTable
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter
import org.apache.hadoop.hbase.filter.BinaryComparator
import org.apache.hadoop.hbase.filter.FilterList
import org.apache.hadoop.hbase.filter.CompareFilter
import java.util.ArrayList
import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.filter.Filter
import scala.collection.mutable.HashSet
import org.apache.hadoop.hbase.client.Put

object HogHBaseReputation {

  // Ex: MX, whitelist
  def getReputationList(listName: String, listType: String): Set[String] = {
    val list = new HashSet[String]
    val filters: ArrayList[Filter] = new ArrayList()

    val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType)))
    colValFilter1.setFilterIfMissing(false)

    val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"),
      CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName)))
    colValFilter2.setFilterIfMissing(false)

    filters.add(colValFilter1)
    filters.add(colValFilter2)

    val filterList = new FilterList(FilterList.Operator.MUST_PASS_ALL, filters)
    val scan = new Scan()
    scan.setFilter(filterList)

    val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator()
    while (it.hasNext()) {
      list.add(Bytes.toString(it.next().getValue(Bytes.toBytes("rep"), Bytes.toBytes("ip"))))
    }

    list.toSet
  }

  def saveReputationList(listName: String, listType: String, ip: String) = {
    val put = new Put(Bytes.toBytes(ip + "-" + listName + "-" + listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName))
    put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip))
    HogHBaseRDD.hogzilla_reputation.put(put)
  }
}
Example 7
Source File: LinearRegression.scala From BigDatalog with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

object LinearRegression {

  object RegType extends Enumeration {
    type RegType = Value
    val NONE, L1, L2 = Value
  }

  import RegType._

  // NOTE: the Params case class and most of the option parser were lost when this
  // listing was extracted; they are reconstructed here from the fields used in run().
  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      regType: RegType = L2,
      regParam: Double = 0.01)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegression") {
      head("LinearRegression: an example app for linear regression.")
      opt[Int]("numIterations")
        .text("number of iterations")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
          |For example, the following command runs this app on a synthetic dataset:
          |
          | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
          |  examples/target/scala-*/spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: LinearRegressionWithSGDExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println(s"training Mean Squared Error $MSE")

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
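As the @deprecated annotation says, new code should use the DataFrame-based ml.regression.LinearRegression instead. A rough equivalent of the example above, as a sketch rather than a drop-in replacement (it loads the libsvm-format sample file that ships with Spark rather than lpsa.data):

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("MLLinearRegression").getOrCreate()

// The libsvm source yields the "label"/"features" schema that ml expects.
val df = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

val lr = new LinearRegression()
  .setMaxIter(100)
  .setRegParam(0.01)
  .setElasticNetParam(0.0) // 0.0 = pure L2, 1.0 = pure L1

val lrModel = lr.fit(df)
println(s"Coefficients: ${lrModel.coefficients} Intercept: ${lrModel.intercept}")
println(s"Training RMSE: ${lrModel.summary.rootMeanSquaredError}")

spark.stop()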
Example 9
Source File: PCAExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

@deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println(s"Mean Squared Error = $MSE")
    println(s"PCA Mean Squared Error = $MSE_pca")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 10
Source File: LinearRegression.scala From iolap with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.mllib.optimization.{SimpleUpdater, SquaredL2Updater, L1Updater}

object LinearRegression {

  object RegType extends Enumeration {
    type RegType = Value
    val NONE, L1, L2 = Value
  }

  import RegType._

  // NOTE: the Params case class and most of the option parser were lost when this
  // listing was extracted; they are reconstructed here from the fields used in run().
  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      regType: RegType = L2,
      regParam: Double = 0.01)

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegression") {
      head("LinearRegression: an example app for linear regression.")
      opt[Int]("numIterations")
        .text("number of iterations")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
          |For example, the following command runs this app on a synthetic dataset:
          |
          | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
          |  examples/target/scala-*/spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
Example 11
Source File: LinearRegressionWithSGDExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: PCAExample.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

@deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 13
Source File: LinearRegression.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.log4j.{Level, Logger}
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.optimization.{L1Updater, SimpleUpdater, SquaredL2Updater}
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
import org.apache.spark.mllib.util.MLUtils

object LinearRegression {

  object RegType extends Enumeration {
    type RegType = Value
    val NONE, L1, L2 = Value
  }

  import RegType._

  // NOTE: the Params case class and most of the option parser were lost when this
  // listing was extracted; they are reconstructed here from the fields used in run().
  case class Params(
      input: String = null,
      numIterations: Int = 100,
      stepSize: Double = 1.0,
      regType: RegType = L2,
      regParam: Double = 0.01)

  def main(args: Array[String]): Unit = {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("LinearRegression") {
      head("LinearRegression: an example app for linear regression.")
      opt[Int]("numIterations")
        .text("number of iterations")
        .action((x, c) => c.copy(numIterations = x))
      opt[Double]("stepSize")
        .text(s"initial step size, default: ${defaultParams.stepSize}")
        .action((x, c) => c.copy(stepSize = x))
      opt[String]("regType")
        .text(s"regularization type (${RegType.values.mkString(",")}), default: ${defaultParams.regType}")
        .action((x, c) => c.copy(regType = RegType.withName(x)))
      opt[Double]("regParam")
        .text(s"regularization parameter, default: ${defaultParams.regParam}")
        .action((x, c) => c.copy(regParam = x))
      arg[String]("<input>")
        .required()
        .text("input paths to labeled examples in LIBSVM format")
        .action((x, c) => c.copy(input = x))
      note(
        """
          |For example, the following command runs this app on a synthetic dataset:
          |
          | bin/spark-submit --class org.apache.spark.examples.mllib.LinearRegression \
          |  examples/target/scala-*/spark-examples-*.jar \
          |  data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"LinearRegression with $params")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    val splits = examples.randomSplit(Array(0.8, 0.2))
    val training = splits(0).cache()
    val test = splits(1).cache()

    val numTraining = training.count()
    val numTest = test.count()
    println(s"Training: $numTraining, test: $numTest.")

    examples.unpersist(blocking = false)

    val updater = params.regType match {
      case NONE => new SimpleUpdater()
      case L1 => new L1Updater()
      case L2 => new SquaredL2Updater()
    }

    val algorithm = new LinearRegressionWithSGD()
    algorithm.optimizer
      .setNumIterations(params.numIterations)
      .setStepSize(params.stepSize)
      .setUpdater(updater)
      .setRegParam(params.regParam)

    val model = algorithm.run(training)

    val prediction = model.predict(test.map(_.features))
    val predictionAndLabel = prediction.zip(test.map(_.label))

    val loss = predictionAndLabel.map { case (p, l) =>
      val err = p - l
      err * err
    }.reduce(_ + _)
    val rmse = math.sqrt(loss / numTest)

    println(s"Test RMSE = $rmse.")

    sc.stop()
  }
}
// scalastyle:on println
Example 14
Source File: LinearRegressionWithSGDExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 15
Source File: PCAExample.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

@deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 16
Source File: LinearRegressionWithLog.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object LinearRegressionWithLog {

  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2
    println(numData.toString())
    records.cache()

    print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList

    var catLen = 0
    mappings.foreach(m => catLen += m.size)
    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:" + catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    // Train on log-transformed labels; predictions are mapped back with exp below.
    val data = records.map(r =>
      LabeledPoint(Math.log(Util.extractLabel(r)), Util.extractFeatures(r, catLen, mappings)))

    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    // val step = 0.2
    val step = 0.025
    val intercept = true

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)

    val true_vs_predicted = data.map(p => (Math.exp(p.label), Math.exp(linear_model.predict(p.features))))
    val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = false
    if (save) {
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }

    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD Log")
  }
}
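Training on Math.log of the label and exponentiating the predictions back, as this example does, is a standard fix for right-skewed targets such as counts; the metric that pairs naturally with it is the root mean squared log error (RMSLE), which Util.calculatePrintMetrics presumably reports. For reference, a standalone sketch of RMSLE over the true_vs_predicted pairs built above:

// Root mean squared log error; log1p keeps zero-valued targets safe.
val rmsle = math.sqrt(
  true_vs_predicted.map { case (t, p) =>
    math.pow(math.log1p(t) - math.log1p(p), 2)
  }.mean())
println(s"RMSLE: $rmsle")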
Example 17
Source File: LinearRegression.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object LinearRegression {

  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2
    println(numData.toString())
    records.cache()

    print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList

    var catLen = 0
    mappings.foreach(m => catLen += m.size)
    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:" + catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = records.map(r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))

    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept = true

    val linear_model = LinearRegressionWithSGD.train(data, iterations, step)
    val x = linear_model.predict(data.first().features)

    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save) {
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }

    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    Util.calculatePrintMetrics(true_vs_predicted, "LinearRegressionWithSGD")
  }
}
Example 18
Source File: LinearRegressionWithIntercept.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package org.sparksamples.linearregression

import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
import org.sparksamples.Util

import scala.collection.Map
import scala.collection.mutable.ListBuffer

object LinearRegressionWithIntercept {

  def main(args: Array[String]) {
    val recordsArray = Util.getRecords()
    val records = recordsArray._1
    val first = records.first()
    val numData = recordsArray._2
    println(numData.toString())
    records.cache()

    print("Mapping of first categorical feature column: " + Util.get_mapping(records, 2))

    var list = new ListBuffer[Map[String, Long]]()
    for (i <- 2 to 9) {
      val m = Util.get_mapping(records, i)
      list += m
    }
    val mappings = list.toList

    var catLen = 0
    mappings.foreach(m => catLen += m.size)
    val numLen = records.first().slice(11, 15).size
    val totalLen = catLen + numLen

    print("Feature vector length for categorical features:" + catLen)
    print("Feature vector length for numerical features:" + numLen)
    print("Total feature vector length: " + totalLen)

    val data = records.map(r =>
      LabeledPoint(Util.extractLabel(r), Util.extractFeatures(r, catLen, mappings)))
    val data1 = records.map(r => Util.extractFeatures(r, catLen, mappings))

    val first_point = data.first()
    println("Linear Model feature vector:" + first_point.features.toString)
    println("Linear Model feature vector length: " + first_point.features.size)

    val iterations = 10
    val step = 0.025
    val intercept = true

    // Configure the optimizer directly so that an intercept term can be enabled.
    val linReg = new LinearRegressionWithSGD().setIntercept(intercept)
    linReg.optimizer.setNumIterations(iterations).setStepSize(step)
    val linear_model = linReg.run(data)

    print(data.first())
    val x = linear_model.predict(data.first().features)

    val true_vs_predicted = data.map(p => (p.label, linear_model.predict(p.features)))
    val true_vs_predicted_csv = data.map(p => p.label + " ," + linear_model.predict(p.features))

    val format = new java.text.SimpleDateFormat("dd-MM-yyyy-hh-mm-ss")
    val date = format.format(new java.util.Date())
    val save = true
    if (save) {
      true_vs_predicted_csv.saveAsTextFile("./output/linear_model_" + date + ".csv")
    }

    val true_vs_predicted_take5 = true_vs_predicted.take(5)
    for (i <- 0 until 5) {
      println("True vs Predicted: " + "i :" + true_vs_predicted_take5(i))
    }

    val mse = true_vs_predicted.map { case (t, p) => Util.squaredError(t, p) }.mean()
    val mae = true_vs_predicted.map { case (t, p) => Util.absError(t, p) }.mean()
    val rmsle = Math.sqrt(true_vs_predicted.map { case (t, p) => Util.squaredLogError(t, p) }.mean())

    println("Linear Model - Mean Squared Error: " + mse)
    println("Linear Model - Mean Absolute Error: " + mae)
    println("Linear Model - Root Mean Squared Log Error:" + rmsle)
  }
}
Example 19
Source File: LinearRegressionWithSGDExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.regression.LinearRegressionModel
import org.apache.spark.mllib.regression.LinearRegressionWithSGD
// $example off$

@deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0")
object LinearRegressionWithSGDExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample")
    val sc = new SparkContext(conf)

    // $example on$
    // Load and parse the data
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data")
    val parsedData = data.map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    // Building the model
    val numIterations = 100
    val stepSize = 0.00000001
    val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize)

    // Evaluate model on training examples and compute training error
    val valuesAndPreds = parsedData.map { point =>
      val prediction = model.predict(point.features)
      (point.label, prediction)
    }
    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    println("training Mean Squared Error = " + MSE)

    // Save and load model
    model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel")
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 20
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
// $example on$
import org.apache.spark.mllib.feature.PCA
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD}
// $example off$

@deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0")
object PCAExample {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("PCAExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line =>
      val parts = line.split(',')
      LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
    }.cache()

    val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L)
    val training = splits(0).cache()
    val test = splits(1)

    val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features))
    val training_pca = training.map(p => p.copy(features = pca.transform(p.features)))
    val test_pca = test.map(p => p.copy(features = pca.transform(p.features)))

    val numIterations = 100
    val model = LinearRegressionWithSGD.train(training, numIterations)
    val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations)

    val valuesAndPreds = test.map { point =>
      val score = model.predict(point.features)
      (score, point.label)
    }
    val valuesAndPreds_pca = test_pca.map { point =>
      val score = model_pca.predict(point.features)
      (score, point.label)
    }

    val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean()
    val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean()

    println("Mean Squared Error = " + MSE)
    println("PCA Mean Squared Error = " + MSE_pca)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println