org.apache.spark.mllib.stat.MultivariateOnlineSummarizer Scala Examples
The following examples show how to use org.apache.spark.mllib.stat.MultivariateOnlineSummarizer.
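Before the project-specific examples, here is a minimal, self-contained sketch of the core MultivariateOnlineSummarizer workflow: add samples to partial summarizers, merge the partial results, then read the per-column statistics. This sketch is not taken from any of the projects below, and the object name SummarizerSketch is only illustrative.

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

object SummarizerSketch {
  def main(args: Array[String]): Unit = {
    // Two partial summaries, playing the roles of the (seqOp, combOp) sides of an RDD aggregate.
    val part1 = new MultivariateOnlineSummarizer()
    part1.add(Vectors.dense(1.0, 10.0))
    part1.add(Vectors.dense(2.0, 20.0))

    val part2 = new MultivariateOnlineSummarizer()
    part2.add(Vectors.dense(3.0, 30.0))

    // Merging partial summaries yields the same statistics as summarizing all rows at once.
    val total = part1.merge(part2)
    println(s"count = ${total.count}")        // 3
    println(s"mean  = ${total.mean}")         // [2.0, 20.0]
    println(s"var   = ${total.variance}")     // per-column sample variance
    println(s"nnz   = ${total.numNonzeros}")  // per-column non-zero counts
  }
}

The examples below drive exactly this add/merge pair through RDD.aggregate or treeAggregate, so the summary statistics are computed in a single distributed pass.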
Example 1
Source File: DataFrameExample.scala From drizzle-spark with Apache License 2.0 | 7 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println
Example 2
Source File: MultivariateSummarizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: MultivariateOnlineSummarizerAccumulator.scala From sparkpipe-core with Apache License 2.0 | 5 votes |
package software.uncharted.sparkpipe.ops.core.dataframe.numeric.util

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.sql.types.StructType
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

private object MultivariateOnlineSummarizerAccumulator {
  def init(cols: Seq[_]): Seq[MultivariateOnlineSummarizer] = {
    cols.map(col => {
      new MultivariateOnlineSummarizer
    }).toSeq
  }
}

private[numeric] class MultivariateOnlineSummarizerAccumulator(
  private var result: Seq[MultivariateOnlineSummarizer],
  private var touched: Boolean = false
) extends AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] {

  def this(cols: StructType) {
    this(MultivariateOnlineSummarizerAccumulator.init(cols))
  }

  override def add(r: Row): Unit = {
    for (i <- 0 to r.length - 1) {
      if (!r.isNullAt(i)) {
        result(i).add(Vectors.dense(Array[Double](r.getDouble(i))))
        touched = true
      } else {
        // don't add a sample to the summarizer for this column
      }
    }
  }

  override def copy(): AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] = {
    new MultivariateOnlineSummarizerAccumulator(result.map(s => {
      // clone by making a new, empty summarizer and merging our data into it
      val newSummarizer = new MultivariateOnlineSummarizer()
      newSummarizer.merge(s)
      newSummarizer
    }), false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]]): Unit = {
    for (i <- 0 to other.value.length - 1) {
      result(i).merge(other.value(i))
    }
  }

  override def reset(): Unit = {
    result = MultivariateOnlineSummarizerAccumulator.init(result)
    touched = false
  }

  override def value: Seq[MultivariateOnlineSummarizer] = {
    result
  }
}
Example 4
Source File: MultivariateSummarizer.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 5
Source File: DataFrameExample.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SQLContext}

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DataFrameExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = sqlContext.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = sqlContext.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: MultivariateSummarizer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: DataFrameExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text("input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println("Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: MultivariateSummarizer.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: MultivariateSummarizer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
Example 10
Source File: DatasetExample.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import java.io.File

import com.google.common.io.Files
import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, DataFrame}

object DatasetExample {

  case class Params(
      input: String = "data/mllib/sample_libsvm_data.txt",
      dataFormat: String = "libsvm") extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DatasetExample") {
      head("Dataset: an example app using DataFrame as a Dataset for ML.")
      opt[String]("input")
        .text(s"input path to dataset")
        .action((x, c) => c.copy(input = x))
      opt[String]("dataFormat")
        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
        .action((x, c) => c.copy(dataFormat = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    }.getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DatasetExample with $params")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._ // for implicit conversions

    // Load input data
    val origData: RDD[LabeledPoint] = params.dataFormat match {
      case "dense" => MLUtils.loadLabeledPoints(sc, params.input)
      case "libsvm" => MLUtils.loadLibSVMFile(sc, params.input)
    }
    println(s"Loaded ${origData.count()} instances from file: ${params.input}")

    // Convert input data to DataFrame explicitly.
    val df: DataFrame = origData.toDF()
    println(s"Inferred schema:\n${df.schema.prettyJson}")
    println(s"Converted to DataFrame with ${df.count()} records")

    // Select columns
    val labelsDf: DataFrame = df.select("label")
    val labels: RDD[Double] = labelsDf.map { case Row(v: Double) => v }
    val numLabels = labels.count()
    val meanLabel = labels.fold(0.0)(_ + _) / numLabels
    println(s"Selected label column with average value $meanLabel")

    val featuresDf: DataFrame = df.select("features")
    val features: RDD[Vector] = featuresDf.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    val tmpDir = Files.createTempDir()
    tmpDir.deleteOnExit()
    val outputDir = new File(tmpDir, "dataset").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDataset = sqlContext.read.parquet(outputDir)
    println(s"Schema from Parquet: ${newDataset.schema.prettyJson}")

    val newFeatures = newDataset.select("features").map { case Row(v: Vector) => v }
    val newFeaturesSummary = newFeatures.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(feat),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${newFeaturesSummary.mean.toString}")

    sc.stop()
  }
}
Example 11
Source File: DataFrameExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println
Example 12
Source File: TimeSeriesSmallModelRegressionMetrics.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary}

class TimeSeriesSmallModelRegressionMetrics(
  idPredictionsAndObservations: Array[(Double, Double)]
) {

  private lazy val summary: MultivariateStatisticalSummary =
    idPredictionsAndObservations.map {
      case (observation, prediction) =>
        Vectors.dense(observation, observation - prediction)
    }.aggregate(new MultivariateOnlineSummarizer())(
      (summary, current) => summary.add(org.apache.spark.mllib.linalg.Vectors.fromML(current)),
      (sum1, sum2) => sum1.merge(sum2)
    )

  private lazy val SSerr = math.pow(summary.normL2(1), 2)
  private lazy val SStot = summary.variance(0) * (summary.count - 1)
  private lazy val SSreg = {
    val yMean = summary.mean(0)
    idPredictionsAndObservations.map {
      case (prediction, observation) => math.pow(prediction - yMean, 2)
    }.sum
  }

  def explainedVariance = SSreg / summary.count

  def meanAbsoluteError = summary.normL1(1) / summary.count

  def meanSquaredError = SSerr / summary.count

  def rootMeanSquaredPercentageError = math.sqrt(idPredictionsAndObservations.map {
    case (observation, prediction) =>
      if (observation == 0) {
        0
      } else {
        Math.pow((observation - prediction) / observation, 2)
      }
  }.sum / summary.count)

  def rootMeanSquaredError = math.sqrt(meanSquaredError)

  def r2 = 1 - (SSerr / SStot)
}
Example 13
Source File: TimeSeriesRegressionMetrics.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateOnlineSummarizer, MultivariateStatisticalSummary}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

class TimeSeriesRegressionMetrics[T](
  idPredictionsAndObservations: RDD[(T, Int, Array[(Double, Double)])],
  isLargerBetter: Boolean
)(implicit kt: ClassTag[T], ord: Ordering[T] = null) {

  private lazy val summaryRDD: RDD[
    (T, Int, Array[(Double, Double)], MultivariateStatisticalSummary)
  ] = idPredictionsAndObservations.map {
    case (id, modelIndex, array) =>
      (id, modelIndex, array, array.map {
        case (observation, prediction) =>
          Vectors.dense(observation, observation - prediction)
      }.aggregate(new MultivariateOnlineSummarizer())(
        (summary, current) => summary.add(org.apache.spark.mllib.linalg.Vectors.fromML(current)),
        (sum1, sum2) => sum1.merge(sum2)))
  }

  private lazy val SSerr = summaryRDD.map {
    case (id, modelIndex, values, summary) =>
      ((id, modelIndex), (math.pow(summary.normL2(1), 2), summary))
  }

  private lazy val SStot = summaryRDD.map {
    case (id, modelIndex, values, summary) =>
      ((id, modelIndex), summary.variance(0) * (summary.count - 1))
  }

  private lazy val SSreg = {
    summaryRDD.map {
      case (id, modelIndex, values, summary) =>
        val yMean = summary.mean(0)
        (id, modelIndex, values.map {
          case (prediction, observation) => math.pow(prediction - yMean, 2)
        }.sum, summary)
    }
  }

  def explainedVariance: RDD[(T, (Int, Double))] = SSreg.map {
    case (id, modelIndex, regValue, summary) =>
      (id, (modelIndex, regValue / summary.count))
  }

  def meanAbsoluteError: RDD[(T, (Int, Double))] = summaryRDD.map {
    case (id, modelIndex, _, summary) =>
      (id, (modelIndex, summary.normL1(1) / summary.count))
  }

  def meanSquaredError: RDD[(T, (Int, Double))] = SSerr.map {
    case ((id, modelIndex), (err, summary)) =>
      (id, (modelIndex, err / summary.count))
  }

  def rootMeanSquaredError: RDD[(T, (Int, Double))] = meanSquaredError.map {
    case (id, (modelIndex, err)) =>
      (id, (modelIndex, math.sqrt(err)))
  }

  def r2: RDD[(T, (Int, Double))] = SSerr.join(SStot).map {
    case ((id, modelIndex), ((sSerr, _), sStot)) =>
      (id, (modelIndex, 1 - calc(() => sSerr / sStot)))
  }

  // TODO redo
  private def calc(f: () => Double) = try {
    f()
  } catch {
    case e: Exception =>
      e.printStackTrace()
      if (isLargerBetter) 0d else Double.MaxValue
  }
}
Example 14
Source File: MultivariateSummarizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 15
Source File: DataFrameExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.ml

import java.io.File

import scopt.OptionParser

import org.apache.spark.examples.mllib.AbstractParams
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.apache.spark.util.Utils

object DataFrameExample {

  case class Params(input: String = "data/mllib/sample_libsvm_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()

    val parser = new OptionParser[Params]("DataFrameExample") {
      head("DataFrameExample: an example app using DataFrame for ML.")
      opt[String]("input")
        .text(s"input path to dataframe")
        .action((x, c) => c.copy(input = x))
      checkConfig { params =>
        success
      }
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val spark = SparkSession
      .builder
      .appName(s"DataFrameExample with $params")
      .getOrCreate()

    // Load input data
    println(s"Loading LIBSVM file with UDT from ${params.input}.")
    val df: DataFrame = spark.read.format("libsvm").load(params.input).cache()
    println("Schema from LIBSVM:")
    df.printSchema()
    println(s"Loaded training data as a DataFrame with ${df.count()} records.")

    // Show statistical summary of labels.
    val labelSummary = df.describe("label")
    labelSummary.show()

    // Convert features column to an RDD of vectors.
    val features = df.select("features").rdd.map { case Row(v: Vector) => v }
    val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())(
      (summary, feat) => summary.add(Vectors.fromML(feat)),
      (sum1, sum2) => sum1.merge(sum2))
    println(s"Selected features column with average values:\n ${featureSummary.mean.toString}")

    // Save the records in a parquet file.
    val tmpDir = Utils.createTempDir()
    val outputDir = new File(tmpDir, "dataframe").toString
    println(s"Saving to $outputDir as Parquet file.")
    df.write.parquet(outputDir)

    // Load the records back.
    println(s"Loading Parquet file with UDT from $outputDir.")
    val newDF = spark.read.parquet(outputDir)
    println(s"Schema from Parquet:")
    newDF.printSchema()

    spark.stop()
  }
}
// scalastyle:on println
Example 16
Source File: StandardScaler.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD
import keystoneml.utils.MLlibUtils
import keystoneml.workflow.{Transformer, Estimator}

  override def fit(data: RDD[DenseVector[Double]]): StandardScalerModel = {
    val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
      (aggregator, data) => aggregator.add(MLlibUtils.breezeVectorToMLlib(data)),
      (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
    if (normalizeStdDev) {
      new StandardScalerModel(
        MLlibUtils.mllibVectorToDenseBreeze(summary.mean),
        Some(sqrt(MLlibUtils.mllibVectorToDenseBreeze(summary.variance))
          .map(r => if (r.isNaN | r.isInfinite | math.abs(r) < eps) 1.0 else r)))
    } else {
      new StandardScalerModel(
        MLlibUtils.mllibVectorToDenseBreeze(summary.mean),
        None)
    }
  }
}
Example 17
Source File: MultivariateSummarizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.mllib.util.MLUtils

        spark-examples-*.jar \
        | --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Summarize labels
    val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(Vectors.dense(lp.label)),
      (sum1, sum2) => sum1.merge(sum2))

    // Summarize features
    val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())(
      (summary, lp) => summary.add(lp.features),
      (sum1, sum2) => sum1.merge(sum2))

    println()
    println(s"Summary statistics")
    println(s"\tLabel\tFeatures")
    println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}")
    println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}")
    println(
      s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}")
    println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}")
    println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}")
    println()

    sc.stop()
  }
}
// scalastyle:on println