org.apache.spark.mllib.stat.Statistics Scala Examples
The following examples show how to use org.apache.spark.mllib.stat.Statistics.
Each example notes the project it comes from and the original source file.
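As a quick orientation before the individual examples, the sketch below exercises the three Statistics entry points that appear most often on this page: colStats, corr, and chiSqTest. It is a minimal, self-contained program written for this page rather than taken from any of the projects listed below; the app name and the tiny in-memory dataset are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object StatisticsQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("StatisticsQuickStart").setMaster("local[*]"))

    val observations = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(3.0, 30.0, 300.0)
    ))

    // Column-wise summary statistics (mean, variance, non-zero counts, ...).
    val summary = Statistics.colStats(observations)
    println(summary.mean)

    // Pairwise Pearson correlation matrix of the columns.
    println(Statistics.corr(observations, "pearson"))

    // Chi-squared goodness-of-fit test on a single frequency vector.
    println(Statistics.chiSqTest(Vectors.dense(1.0, 2.0, 3.0, 4.0)))

    sc.stop()
  }
}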
Example 1
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: SummaryStatisticsExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 3
Source File: MySummaryStats.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License | 5 votes |
package spark.ml.cookbook.chapter4

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Logger
import org.apache.log4j.Level

object MySummaryStats {
  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // set up a SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Summary Statistics")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    val sc = spark.sparkContext

    // handcrafted data set for understanding the summary statistics
    val rdd = sc.parallelize(
      Seq(
        Vectors.dense(0, 1, 0),
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(3.0, 30.0, 300.0),
        Vectors.dense(5.0, 50.0, 500.0),
        Vectors.dense(7.0, 70.0, 700.0),
        Vectors.dense(9.0, 90.0, 900.0),
        Vectors.dense(11.0, 110.0, 1100.0)
      )
    )

    // Compute column summary statistics.
    val summary = Statistics.colStats(rdd)

    println("mean:" + summary.mean)
    println("variance:" + summary.variance)
    println("non-zero count:" + summary.numNonzeros)
    println("min:" + summary.min)
    println("max:" + summary.max)
    println("count:" + summary.count)

    spark.stop()
  }
}
Example 4
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.cloudera.sa.taxi360.etl.machinelearning.kudu

import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
Example 5
Source File: Correlations.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 6
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis if our p-value
    // indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 7
Source File: Correlations.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: SummaryStatisticsExample.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: ChiSqSelector.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

@Since("1.3.0")
class ChiSqSelector @Since("1.3.0") (
    @Since("1.3.0") val numTopFeatures: Int) extends Serializable {

  // Returns a chi-squared feature selector model: the numTopFeatures features with the largest
  // chi-squared statistic against the label are kept, with their indices in ascending order.
  @Since("1.3.0")
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      .take(numTopFeatures)
      .map { case (_, indices) => indices }
      .sorted
    new ChiSqSelectorModel(indices)
  }
}
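The snippet above is the fit method of Spark's own ChiSqSelector. Below is a minimal sketch of how the selector is typically driven; it assumes an existing SparkContext named sc, and the labeled, categorical-valued vectors are illustrative data rather than part of the original source.

import org.apache.spark.mllib.feature.ChiSqSelector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

// Labeled points with categorical feature values (illustrative data).
val labeled = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.0, 1.0, 2.0)),
  LabeledPoint(1.0, Vectors.dense(1.0, 1.0, 0.0)),
  LabeledPoint(1.0, Vectors.dense(2.0, 0.0, 0.0))
))

// Keep the two features most associated with the label, then project each vector onto them.
val model = new ChiSqSelector(2).fit(labeled)
val reduced = labeled.map(lp => model.transform(lp.features))
reduced.collect().foreach(println)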
Example 10
Source File: CorrelationDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

    // Use the Spearman correlation coefficient.
    // Expected result:
    // correlation5: Double = 0.9428571428571412
    val correlation5: Double = Statistics.corr(rdd4, rdd5, "spearman")
    println("spearman:" + correlation5)
    // Compared with the Pearson coefficient computed earlier (0.6915716600436548), the correlation
    // rises to 0.9428571428571412 because Spearman works on ranks, which is why it is also called
    // Spearman rank (rank-difference) correlation. Note, however, that the ranking step involves a
    // sort; in a distributed setting this can incur heavy network I/O, so the method is not
    // especially efficient.
  }
}
Example 11
Source File: ChiSqLearning.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.linalg.{Matrix, Matrices, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object ChiSqLearning {
  def main(args: Array[String]) {
    val vd = Vectors.dense(1, 2, 3, 4, 5)
    val vdResult = Statistics.chiSqTest(vd)
    println(vd)
    println(vdResult)
    println("-------------------------------")
    val mtx = Matrices.dense(3, 2, Array(1, 3, 5, 2, 4, 6))
    val mtxResult = Statistics.chiSqTest(mtx)
    println(mtx)
    println(mtxResult)
    // prints the method, degrees of freedom, test statistic and p-value
    // (the probability of incorrectly rejecting the null hypothesis)
    println("-------------------------------")
    val mtx2 = Matrices.dense(2, 2, Array(19.0, 34, 24, 10.0))
    printChiSqTest(mtx2)
    printChiSqTest(Matrices.dense(2, 2, Array(26.0, 36, 7, 2.0)))
    // val mtxResult2 = Statistics.chiSqTest(mtx2)
    // println(mtx2)
    // println(mtxResult2)
  }

  def printChiSqTest(matrix: Matrix): Unit = {
    println("-------------------------------")
    val mtxResult2 = Statistics.chiSqTest(matrix)
    println(matrix)
    println(mtxResult2)
  }
}
Example 12
Source File: StatisticsDemo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object StatisticsDemo {
  def main(args: Array[String]) {
    // val sparkConf = new SparkConf().setMaster("local[2]").setAppName("SparkHdfsLR")
    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)

    val rdd1 = sc.parallelize(
      Array(
        Array(1.0, 2.0, 3.0),
        Array(2.0, 3.0, 4.0))).map(f => Vectors.dense(f))

    // e.g. the mean of the five numbers 1, 2, 3, 4, 5 is 3
    val mss = Statistics.colStats(rdd1)
    // variance is the sum of squared deviations from the mean divided by the count;
    // a smaller variance means the values are more stable, i.e. differ less from each other
    println("mean: " + mss.mean)
    println("sample variance: " + mss.variance)  // sample variance divides by (count - 1)
    println("number of non-zeros: " + mss.numNonzeros)
    println("count: " + mss.count)
    println("max: " + mss.max)
    println("min: " + mss.min)
    // other statistics such as normL1 and normL2 are also available

    val land1 = Vectors.dense(1000.0, 1856.0)
    val land2 = Vectors.dense(400, 560)
    val c1 = Statistics.chiSqTest(land1, land2)
  }
}
Example 13
Source File: BasicStatistics.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

object BasicStatistics {
  def main(args: Array[String]) {

    val sc: SparkContext = null

    val seriesX: RDD[Double] = null  // a series
    // must have the same number of partitions and cardinality as seriesX
    val seriesY: RDD[Double] = null

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println("pearson:" + correlation)

    // note that each Vector is a row and not a column
    val data: RDD[Vector] = null

    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("correlMatrix:" + correlMatrix.toString())
  }
}
Example 14
Source File: Correlations.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
      sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
Example 15
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis if our p-value
    // indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 16
Source File: Correlations.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 17
Source File: PositiveCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object PositiveCorrelationExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }

  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()
    val data = spark.sparkContext.parallelize(
      Seq(
        Vectors.dense(0.0, 1.0, 100.0),
        Vectors.dense(10.0, 10.0, 200.0),
        Vectors.dense(20.0, 100.0, 300.0),
        Vectors.dense(30.0, 1000.0, 400.0),
        Vectors.dense(40.0, 10000.0, 500.0),
        Vectors.dense(50.0, 100000.0, 600.0),
        Vectors.dense(60.0, 1000000.0, 700.0),
        Vectors.dense(70.0, 10000000.0, 800.0),
        Vectors.dense(80.0, 100000000.0, 900.0),
        Vectors.dense(90.0, 1000000000.0, 1000.0)
      )
    )

    val summary = Statistics.colStats(data)  // Compute column summary statistics
    println(
      s"""Summary:
      ${summary.count} // number of records
      ${summary.mean} // mean value for each column
      ${summary.min} // column-wise min
      ${summary.max} // column-wise max
      ${summary.normL1} // column-wise norm L1
      ${summary.normL2} // column-wise Euclidean magnitude
      ${summary.variance} // column-wise variance
      ${summary.numNonzeros} // column-wise count of non-zero values
      """.stripMargin)

    val corr = Statistics.corr(data)
    println(s"Correlation:\n${corr}")

    spark.stop()
  }
}
Example 18
Source File: NegativeCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SparkSession

object NegativeCorrelationExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }

  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()
    val data = spark.sparkContext.parallelize(
      Seq(
        Vectors.dense(0.0, 1.0, 100.0),
        Vectors.dense(-10.0, 10.0, 200.0),
        Vectors.dense(-20.0, 100.0, 300.0),
        Vectors.dense(-30.0, 1000.0, 400.0),
        Vectors.dense(-40.0, 10000.0, 500.0),
        Vectors.dense(-50.0, 100000.0, 600.0),
        Vectors.dense(-60.0, 1000000.0, 700.0),
        Vectors.dense(-70.0, 10000000.0, 800.0),
        Vectors.dense(-80.0, 100000000.0, 900.0),
        Vectors.dense(-90.0, 1000000000.0, 1000.0)
      )
    )

    val corr = Statistics.corr(data)
    println(s"Correlation:\n${corr}")

    spark.stop()
  }
}
Example 19
Source File: HypothesisTestExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example

object HypothesisTestExample {
  def main(args: Array[String]): Unit = {
    // 1a. Set up the Spark session
    import org.apache.spark.sql.SparkSession
    val spark = SparkSession.builder().master("local").getOrCreate()
    import spark.implicits._

    // 1b. Set the logging level to WARN
    spark.sparkContext.setLogLevel("WARN")

    // 2. Import the necessary classes from Spark's MLlib package
    import org.apache.spark.mllib.linalg._
    import org.apache.spark.mllib.stat.Statistics

    // 3. Create a sample vector of observations
    val observations = Vectors.dense(0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)

    // 4. Run the chi-square test on the data
    val results = Statistics.chiSqTest(observations)
    println(results)

    // Stop the Spark session before exiting
    spark.stop()
  }
}
Example 20
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis if our p-value
    // indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 21
Source File: Correlations.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 22
Source File: SummaryStatisticsExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 23
Source File: loyalty_model.scala From Spark_Personas with MIT License | 5 votes |
// Step 1: compute the min/max of each input metric for normalization
val result = hiveContext.sql("select max(login_times) from model_input_loyal_t")   // maximum login count
val max_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(login_times) from model_input_loyal_t")   // minimum login count
val min_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_login_times = max_login_times - min_login_times

val result = hiveContext.sql("select max(stay_time) from model_input_loyal_t")     // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = hiveContext.sql("select min(stay_time) from model_input_loyal_t")     // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = max_stay_time - min_stay_time

val result = hiveContext.sql("select max(view_days) from model_input_loyal_t")     // maximum number of view days
val max_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(view_days) from model_input_loyal_t")     // minimum number of view days
val min_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_view_days = max_view_days - min_view_days

val result = hiveContext.sql("select max(pv) from model_input_loyal_t")            // maximum page views
val max_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(pv) from model_input_loyal_t")            // minimum page views
val min_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_pv = max_pv - min_pv

val result = hiveContext.sql("select max(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val max_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble     // latest visit time
val result = hiveContext.sql("select min(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val min_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble     // earliest visit time
val region_last_viewtime = max_last_viewtime - min_last_viewtime

// Weights: login_times 0.2, stay_time 0.3, view_days 0.3, pv 0.15, last_viewtime 0.05
val normalization = hiveContext.sql("select t1.cookie , (((t1.login_times - "+min_login_times+") * 0.2/"+region_login_times+") + ((t1.stay_time- "+min_stay_time+") * 0.3/"+region_stay_time+") +((t1.view_days - "+min_view_days+")* 0.3/"+region_view_days+") +((t1.pv - "+min_pv+")* 0.15/"+region_pv+") +((unix_timestamp(t1.last_viewtime,'yyyy-MM-dd')- "+min_last_viewtime+")*0.05 / " + region_last_viewtime + "))*100 as loyalty_score from model_input_loyal_t t1")
normalization.registerTempTable("temporary_points")   // temporary table of normalized scores

// Step 2: mean and variance of the loyalty score
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vectors
val rdd = normalization.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[Double].toDouble))
val summary = Statistics.colStats(rdd)
println(summary.mean)
val means = summary.mean(0)
println(summary.variance)
val standard_deviation = summary.variance(0)  // note: this holds the column variance, not its square root

// Step 3: bucket the scores into low / medium / high loyalty levels. Because the mean is small,
// the lower cutoff can go below zero, so it is clamped at 0; the upper cutoff is left unchanged.
val r = means - standard_deviation * 5
val low_bound = if (r > 0) r else 0
val up_bound = means + standard_deviation * 5
val loyalty_temporary = hiveContext.sql("(select t1.lenovo_id,t1.loyalty_score,t1.loyalty_level from model_output_loyal_t t1 where 1=0) union all (select t2.cookie, t2.loyalty_score,(case when t2.loyalty_score <= "+low_bound+" then 'low' when t2.loyalty_score < "+up_bound+" then 'medium' else 'high' end)as loyalty_level from temporary_points t2)")
loyalty_temporary.registerTempTable("temporary_loyalty")

hiveContext.sql("insert overwrite table data.model_output_loyal_t partition (l_day='2016-10-01') select * from temporary_loyalty")
Example 24
Source File: L9-4Correlation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CorrelationApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    val datastream = substream.map(f =>
      Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))

    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f =>
      LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))

    walkingOrRunning.map(f => f.features).foreachRDD(rdd => {
      val corrSpearman = Statistics.corr(rdd, "spearman")
      val corrPearson = Statistics.corr(rdd, "pearson")
      println("Correlation Spearman: \n" + corrSpearman)
      println("Correlation Pearson: \n" + corrPearson)
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 25
Source File: L9-5ChiSq.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object ChiSqApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
      .filter(f => f(0) == 4.0 || f(0) == 5.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
      .foreachRDD(rdd => {
        Statistics.chiSqTest(rdd).zipWithIndex.foreach(v =>
          println("%s, column no. %d".format(v._1, v._2)))
      })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 26
Source File: L9-3Statistics.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StatisticsApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => {
      val stats = Statistics.colStats(rdd)
      println("Count: " + stats.count)
      println("Max: " + stats.max.toArray.mkString(" "))
      println("Min: " + stats.min.toArray.mkString(" "))
      println("Mean: " + stats.mean.toArray.mkString(" "))
      println("L1-Norm: " + stats.normL1.toArray.mkString(" "))
      println("L2-Norm: " + stats.normL2.toArray.mkString(" "))
      println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" "))
      println("Variance: " + stats.variance.toArray.mkString(" "))
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 27
Source File: BugDemonstrationTest.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}

class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll {
  private var sparkSession: SparkSession = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate()
  }

  override def afterAll(): Unit = {
    super.afterAll()
    sparkSession.stop()
  }

  test("This demonstrates a bug was fixed in tsne-spark 2.1") {
    val sc = sparkSession.sparkContext

    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    val expectedMean = Vectors.dense(2.0, 20.0, 200.0)
    val resultMean = summary.mean
    assertEqualEnough(resultMean, expectedMean)
    val expectedVariance = Vectors.dense(1.0, 100.0, 10000.0)
    assertEqualEnough(summary.variance, expectedVariance)
    val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0)
    assertEqualEnough(summary.numNonzeros, expectedNumNonZeros)
  }

  private def assertEqualEnough(sample: Vector, expected: Vector): Unit = {
    expected.toArray.zipWithIndex.foreach { case (d: Double, i: Int) =>
      sample(i) should be (d +- 1E-12)
    }
  }
}
Example 28
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD: RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
  }
Example 29
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis if our p-value
    // indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println
Example 30
Source File: Correlations.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println