org.apache.spark.mllib.stat.Statistics Scala Examples

The following examples show how to use org.apache.spark.mllib.stat.Statistics. Each example lists the project and source file it was taken from.
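Before the project examples, here is a minimal, self-contained sketch of the three calls that appear most often below: Statistics.colStats for column summaries, Statistics.corr for correlation, and Statistics.chiSqTest for hypothesis testing. The object name and the sample vectors are illustrative assumptions, not taken from any of the projects listed on this page.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object StatisticsQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("StatisticsQuickStart").setMaster("local[*]"))

    // A small RDD[Vector] of arbitrary sample rows.
    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0),
      Vectors.dense(2.0, 20.0),
      Vectors.dense(3.0, 30.0)))

    // Column-wise summary statistics (mean, variance, numNonzeros, ...).
    val summary = Statistics.colStats(rows)
    println(summary.mean)

    // Pairwise Pearson correlation matrix over the columns.
    println(Statistics.corr(rows, "pearson"))

    // Chi-squared goodness-of-fit test on a vector of observed counts
    // (against a uniform expected distribution by default).
    println(Statistics.chiSqTest(Vectors.dense(1.0, 2.0, 3.0, 4.0)))

    sc.stop()
  }
}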
Example 1
Source File: SummaryStatisticsExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 2
Source File: SummaryStatisticsExample.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 3
Source File: MySummaryStats.scala    From Apache-Spark-2x-Machine-Learning-Cookbook   with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SparkSession
import org.apache.log4j.Logger
import org.apache.log4j.Level


object MySummaryStats  {

  def main(args: Array[String]): Unit = {

    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("Summary Statistics")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()
    val sc = spark.sparkContext

    // handcrafted data set for understanding of Stats Summary
    val rdd = sc.parallelize(
      Seq(
        Vectors.dense(0, 1, 0),
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(3.0, 30.0, 300.0),
        Vectors.dense(5.0, 50.0, 500.0),
        Vectors.dense(7.0, 70.0, 700.0),
        Vectors.dense(9.0, 90.0, 900.0),
        Vectors.dense(11.0, 110.0, 1100.0)
      )
    )

    // Compute column summary statistics.
    val summary = Statistics.colStats(rdd)
    println("mean:" + summary.mean)
    println("variance:" +summary.variance)
    println("none zero" + summary.numNonzeros)
    println("min:" + summary.min)
    println("max:" + summary.max)
    println("count:" + summary.count)

    spark.stop()
  }
} 
Example 4
Source File: MlLibOnKudu.scala    From Taxi360   with Apache License 2.0
package com.cloudera.sa.taxi360.etl.machinelearning.kudu

import com.cloudera.sa.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
    // (The remainder of the original example, which builds LabeledPoints from the trips, is
    //  omitted here. generateVectorOnly, defined elsewhere in the original source, maps a
    //  NyTaxiYellowTrip to a feature Vector.)
  }
}
Example 5
Source File: Correlations.scala    From BigDatalog   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}


object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
        sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println 
Example 6
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis.
    // If our p-value indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 7
Source File: Correlations.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println 
Example 8
Source File: SummaryStatisticsExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 9
Source File: ChiSqSelector.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.feature

import scala.collection.mutable.ArrayBuilder

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD


  @Since("1.3.0")
  def fit(data: RDD[LabeledPoint]): ChiSqSelectorModel = {
    val indices = Statistics.chiSqTest(data)
      .zipWithIndex.sortBy { case (res, _) => -res.statistic }
      .take(numTopFeatures)
      .map { case (_, indices) => indices }
      .sorted
    new ChiSqSelectorModel(indices)
  }
} 
Example 10
Source File: CorrelationDemo.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object CorrelationDemo {
  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("CorrelationDemo").setMaster("local"))

    // Two sample series. (The original sample data was lost in extraction; these values are
    // placeholders, so the figures quoted in the comments below will not reproduce exactly.)
    val rdd4: RDD[Double] = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0))
    val rdd5: RDD[Double] = sc.parallelize(Seq(2.0, 4.0, 5.0, 7.0, 6.0))

    // Pearson correlation coefficient
    val correlation4: Double = Statistics.corr(rdd4, rdd5, "pearson")
    println("pearson: " + correlation4)

    // Spearman correlation coefficient
    // Result from the original run: correlation5: Double = 0.9428571428571412
    val correlation5: Double = Statistics.corr(rdd4, rdd5, "spearman")
    println("spearman: " + correlation5)
    // In the original run the correlation rose from Pearson's 0.6915716600436548 to
    // 0.9428571428571412. Because it works on ranks, Spearman correlation is also called
    // Spearman rank correlation (or the rank-difference method). Note that ranking requires
    // sorting, which in a distributed setting can incur heavy network I/O, so the method
    // is not especially efficient.
  }
} 
Example 11
Source File: ChiSqLearning.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.mllib
import org.apache.spark.mllib.linalg.{ Matrix, Matrices, Vectors }
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.{SparkConf, SparkContext}

object ChiSqLearning {
  def main(args: Array[String]) {
    val vd = Vectors.dense(1, 2, 3, 4, 5)
    val vdResult = Statistics.chiSqTest(vd)
    println(vd)
    println(vdResult)
    println("-------------------------------")
    val mtx = Matrices.dense(3, 2, Array(1, 3, 5, 2, 4, 6))
    val mtxResult = Statistics.chiSqTest(mtx)
    println(mtx)
    println(mtxResult)
    
    // prints: the test method, degrees of freedom, test statistic, and p-value
    // (the probability of wrongly rejecting the null hypothesis)
    println("-------------------------------")
    val mtx2 = Matrices.dense(2, 2, Array(19.0, 34, 24, 10.0))
    printChiSqTest(mtx2)
    printChiSqTest(Matrices.dense(2, 2, Array(26.0, 36, 7, 2.0)))
    //    val mtxResult2 = Statistics.chiSqTest(mtx2)
    //    println(mtx2)
    //    println(mtxResult2)
  }

  def printChiSqTest(matrix: Matrix): Unit = {
    println("-------------------------------")
    val mtxResult2 = Statistics.chiSqTest(matrix)
    println(matrix)
    println(mtxResult2)
  }

} 
Example 12
Source File: StatisticsDemo.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.stat.MultivariateStatisticalSummary
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

object StatisticsDemo {
  def main(args: Array[String]) {
    //val sparkConf = new SparkConf().setMast("local[2]").setAppName("SparkHdfsLR")

    val conf = new SparkConf().setAppName("test").setMaster("local")
    val sc = new SparkContext(conf)

    val rdd1 = sc.parallelize(
      Array(
        Array(1.0, 2.0, 3.0),
        Array(2.0, 3.0, 4.0))).map(f =>Vectors.dense(f))
    // e.g. the mean of the five numbers 1, 2, 3, 4, 5 is 3
    val mss = Statistics.colStats(rdd1)
    // variance is the sum of squared deviations from the mean divided by the count;
    // the smaller the variance, the less the data spreads around the mean
    println("mean: " + mss.mean)
    println("sample variance: " + mss.variance) // sample variance divides the sum of squared deviations by (count - 1)
    println("numNonzeros: " + mss.numNonzeros)
    println("count: " + mss.count)
    println("max: " + mss.max)
    println("min: " + mss.min)
    // other statistics such as normL1 and normL2 are also available

    val land1 = Vectors.dense(1000.0, 1856.0)
    val land2 = Vectors.dense(400.0, 560.0)
    // Pearson's chi-squared goodness-of-fit test of the observed counts against the expected counts
    val c1 = Statistics.chiSqTest(land1, land2)
    println(c1)

  }
} 
Example 13
Source File: BasicStatistics.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

object BasicStatistics {

  def main(args: Array[String]) {
    val sc = new SparkContext(new SparkConf().setAppName("BasicStatistics").setMaster("local"))

    // Two series with the same number of partitions and cardinality.
    // (The original snippet left these as unassigned placeholders; the values below are sample data.)
    val seriesX: RDD[Double] = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0))
    val seriesY: RDD[Double] = sc.parallelize(Seq(11.0, 22.0, 33.0, 33.0, 555.0))

    // compute the correlation using Pearson's method. Enter "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlation: Double = Statistics.corr(seriesX, seriesY, "pearson")
    println("pearson: " + correlation)

    // note that each Vector is a row and not a column
    val data: RDD[Vector] = sc.parallelize(Seq(
      Vectors.dense(1.0, 10.0, 100.0),
      Vectors.dense(2.0, 20.0, 200.0),
      Vectors.dense(5.0, 33.0, 366.0)))

    // calculate the correlation matrix using Pearson's method. Use "spearman" for Spearman's method.
    // If a method is not specified, Pearson's method will be used by default.
    val correlMatrix: Matrix = Statistics.corr(data, "pearson")
    println("correlMatrix: " + correlMatrix.toString())


  }
} 
Example 14
Source File: Correlations.scala    From iolap   with Apache License 2.0
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.{SparkConf, SparkContext}


object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]) {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams).map { params =>
      run(params)
    } getOrElse {
        sys.exit(1)
    }
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
} 
Example 15
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis.
    // If our p-value indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 16
Source File: Correlations.scala    From multi-tenancy-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println 
Example 17
package handson.example

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics


object PositiveCorrelationExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }

  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()

    val data = spark.sparkContext.parallelize(
      Seq(
        Vectors.dense(0.0, 1.0, 100.0),
        Vectors.dense(10.0, 10.0, 200.0),
        Vectors.dense(20.0, 100.0, 300.0),
        Vectors.dense(30.0, 1000.0, 400.0),
        Vectors.dense(40.0, 10000.0, 500.0),
        Vectors.dense(50.0, 100000.0, 600.0),
        Vectors.dense(60.0, 1000000.0, 700.0),
        Vectors.dense(70.0, 10000000.0, 800.0),
        Vectors.dense(80.0, 100000000.0, 900.0),
        Vectors.dense(90.0, 1000000000.0, 1000.0)
      )
    )
    val summary = Statistics.colStats(data) // Compute column summary statistics
    println(
      s"""Summary:
    ${summary.count} // number of records
    ${summary.mean}  // mean value for each column
    ${summary.min} // column-wise min
    ${summary.max} // column-wise max
    ${summary.normL1} // column-wise norm L1
    ${summary.normL2} // column-wise Euclidean magnitude
    ${summary.variance}  // column-wise variance
    ${summary.numNonzeros}  // column-wise count of non-zero values
        """.stripMargin)

    val corr = Statistics.corr(data)

    println(s"Correlation:\n${corr}")

    spark.stop()
  }

} 
Example 18
package handson.example

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.sql.SparkSession


object NegativeCorrelationExample {
  def getSparkSession(): SparkSession = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    spark
  }

  def main(args: Array[String]): Unit = {
    val spark = getSparkSession()

    val data = spark.sparkContext.parallelize(
      Seq(
        Vectors.dense(0.0, 1.0, 100.0),
        Vectors.dense(-10.0, 10.0, 200.0),
        Vectors.dense(-20.0, 100.0, 300.0),
        Vectors.dense(-30.0, 1000.0, 400.0),
        Vectors.dense(-40.0, 10000.0, 500.0),
        Vectors.dense(-50.0, 100000.0, 600.0),
        Vectors.dense(-60.0, 1000000.0, 700.0),
        Vectors.dense(-70.0, 10000000.0, 800.0),
        Vectors.dense(-80.0, 100000000.0, 900.0),
        Vectors.dense(-90.0, 1000000000.0, 1000.0)
      )
    )

    val corr = Statistics.corr(data)

    println(s"Correlation:\n${corr}")

    spark.stop()
  }

} 
Example 19
Source File: HypothesisTestExample.scala    From Hands-On-Data-Analysis-with-Scala   with MIT License
package handson.example


object HypothesisTestExample {
  def main(args: Array[String]): Unit = {
    // 1a. Set Spark session
    import org.apache.spark.sql.SparkSession
    val spark = SparkSession.builder().master("local").getOrCreate()
    import spark.implicits._

    // 1b. Set logging level to WARNING
    spark.sparkContext.setLogLevel("WARN")

    // 2. Import the necessary classes from Spark's MLlib package:
    import org.apache.spark.mllib.linalg._
    import org.apache.spark.mllib.stat.Statistics

    // 3. Create a sample vector of observations:
    val observations = Vectors.dense(0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1)

    // 4. Run the chi-square test on the data:
    val results = Statistics.chiSqTest(observations)
    println(results)

    // Stop Spark Session before exiting
    spark.stop()
  }

} 
Example 20
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis.
    // If our p-value indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 21
Source File: Correlations.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println 
Example 22
Source File: SummaryStatisticsExample.scala    From sparkoscope   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
// $example off$

object SummaryStatisticsExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("SummaryStatisticsExample")
    val sc = new SparkContext(conf)

    // $example on$
    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    println(summary.mean)  // a dense vector containing the mean value for each column
    println(summary.variance)  // column-wise variance
    println(summary.numNonzeros)  // number of nonzeros in each column
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 23
Source File: loyalty_model.scala    From Spark_Personas   with MIT License
// Loyalty model (spark-shell style script; assumes an existing HiveContext bound to hiveContext).
// Step 1: compute the min and max of each input metric for normalization.
val result = hiveContext.sql("select max(login_times) from model_input_loyal_t")   // maximum login count
val max_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(login_times) from model_input_loyal_t")   // minimum login count
val min_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_login_times = max_login_times - min_login_times

val result = hiveContext.sql("select max(stay_time) from model_input_loyal_t")  //���ͣ��ʱ��
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = hiveContext.sql("select min(stay_time) from model_input_loyal_t")  //��Сͣ��ʱ��
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = max_stay_time - min_stay_time 

val result = hiveContext.sql("select max(view_days) from model_input_loyal_t")  //���ͣ������
val max_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(view_days) from model_input_loyal_t")  //��Сͣ������
val min_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_view_days = max_view_days - min_view_days 


val result = hiveContext.sql("select max(pv) from model_input_loyal_t")  //������ҳ����
val max_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(pv) from model_input_loyal_t")  //��С����ҳ����
val min_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_pv = max_pv - min_pv 

val result =hiveContext.sql("select max(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from  model_input_loyal_t t2")
val max_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble         // latest view time (unix timestamp)
val result = hiveContext.sql("select min(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from  model_input_loyal_t t2")
val min_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble     // earliest view time (unix timestamp)

val region_last_viewtime = max_last_viewtime - min_last_viewtime  


// Weights: login_times 0.2, stay_time 0.3, view_days 0.3, pv 0.15, last_viewtime 0.05
val normalization= hiveContext.sql("select t1.cookie , (((t1.login_times - "+min_login_times+") * 0.2/"+region_login_times+") + ((t1.stay_time- "+min_stay_time+") * 0.3/"+region_stay_time+") +((t1.view_days - "+min_view_days+")* 0.3/"+region_view_days+") +((t1.pv - "+min_pv+")* 0.15/"+region_pv+") +((unix_timestamp(t1.last_viewtime,'yyyy-MM-dd')- "+min_last_viewtime+")*0.05 / " + region_last_viewtime + "))*100 as loyalty_score from model_input_loyal_t t1") 

normalization.registerTempTable("temporary_points")     // temporary table of normalized loyalty scores

import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vectors

val rdd =  normalization.rdd.map( s => Vectors.dense(s.get(1).asInstanceOf[Double].toDouble))
val summary = Statistics.colStats(rdd)
println(summary.mean)
val means = summary.mean(0)
println(summary.variance)
val standard_deviation = math.sqrt(summary.variance(0))  // summary.variance holds the variance; take the square root for the standard deviation

// Bound the scores at five standard deviations from the mean; because the mean is small,
// a negative lower bound is clamped to 0 and the upper bound is left unchanged.
val r = means - standard_deviation*5
val low_bound =  if (r > 0)  r else 0
val up_bound = means + standard_deviation*5

val loyalty_temporary = hiveContext.sql("(select t1.lenovo_id, t1.loyalty_score, t1.loyalty_level from model_output_loyal_t t1 where 1=0) union all (select t2.cookie, t2.loyalty_score, (case when t2.loyalty_score <= "+low_bound+" then 'low' when t2.loyalty_score < "+up_bound+" then 'medium' else 'high' end) as loyalty_level from temporary_points t2)")

loyalty_temporary.registerTempTable("temporary_loyalty")

hiveContext.sql("insert overwrite table data.model_output_loyal_t  partition (l_day='2016-10-01') select * from temporary_loyalty") 
Example 24
Source File: L9-4Correlation.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object CorrelationApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))

    val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
    walkingOrRunning.map(f => f.features).foreachRDD(rdd => {
      val corrSpearman = Statistics.corr(rdd, "spearman")
      val corrPearson = Statistics.corr(rdd, "pearson")
      println("Correlation Spearman: \n" + corrSpearman)
      println("Correlation Pearson: \n" + corrPearson)
    })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 25
Source File: L9-5ChiSq.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object ChiSqApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble))
      .filter(f => f(0) == 4.0 || f(0) == 5.0)
      .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5))))
      .foreachRDD(rdd => {
        Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2)))
      })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 26
Source File: L9-3Statistics.scala    From prosparkstreaming   with Apache License 2.0
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext

object StatisticsApp {

  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println(
        "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>")
      System.exit(1)
    }
    val Seq(appName, batchInterval, hostname, port) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt))

    val substream = ssc.socketTextStream(hostname, port.toInt)
      .filter(!_.contains("NaN"))
      .map(_.split(" "))
      .filter(f => f(1) != "0")
      .map(f => f.map(f => f.toDouble))

    substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => {
      val stats = Statistics.colStats(rdd)
      println("Count: " + stats.count)
      println("Max: " + stats.max.toArray.mkString(" "))
      println("Min: " + stats.min.toArray.mkString(" "))
      println("Mean: " + stats.mean.toArray.mkString(" "))
      println("L1-Norm: " + stats.normL1.toArray.mkString(" "))
      println("L2-Norm: " + stats.normL2.toArray.mkString(" "))
      println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" "))
      println("Varience: " + stats.variance.toArray.mkString(" "))
    })

    ssc.start()
    ssc.awaitTermination()
  }

} 
Example 27
Source File: BugDemonstrationTest.scala    From spark-tsne   with Apache License 2.0
package com.github.saurfang.spark.tsne

import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics}
import org.apache.spark.sql.SparkSession
import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers}


class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll {
  private var sparkSession : SparkSession = _
  override def beforeAll(): Unit = {
    super.beforeAll()
    sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate()
  }

  override def afterAll(): Unit = {
    super.afterAll()
    sparkSession.stop()
  }

  test("This demonstrates a bug was fixed in tsne-spark 2.1") {
    val sc = sparkSession.sparkContext

    val observations = sc.parallelize(
      Seq(
        Vectors.dense(1.0, 10.0, 100.0),
        Vectors.dense(2.0, 20.0, 200.0),
        Vectors.dense(3.0, 30.0, 300.0)
      )
    )

    // Compute column summary statistics.
    val summary: MultivariateStatisticalSummary = Statistics.colStats(observations)
    val expectedMean = Vectors.dense(2.0,20.0,200.0)
    val resultMean = summary.mean
    assertEqualEnough(resultMean, expectedMean)
    val expectedVariance = Vectors.dense(1.0,100.0,10000.0)
    assertEqualEnough(summary.variance, expectedVariance)
    val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0)
    assertEqualEnough(summary.numNonzeros, expectedNumNonZeros)
  }

  private def assertEqualEnough(sample: Vector, expected: Vector): Unit = {
    expected.toArray.zipWithIndex.foreach{ case(d: Double, i: Int) =>
      sample(i) should be (d +- 1E-12)
    }
  }
} 
Example 28
Source File: MlLibOnKudu.scala    From Taxi360   with Apache License 2.0
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu

import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

object MlLibOnKudu {
  def main(args: Array[String]): Unit = {

    if (args.length == 0) {
      println("Args: <runLocal> " +
        "<kuduMaster> " +
        "<taxiTable> " +
        "<numOfCenters> " +
        "<numOfIterations> ")
      return
    }

    val runLocal = args(0).equalsIgnoreCase("l")
    val kuduMaster = args(1)
    val taxiTable = args(2)
    val numOfCenters = args(3).toInt
    val numOfIterations = args(4).toInt

    val sc: SparkContext = if (runLocal) {
      val sparkConfig = new SparkConf()
      sparkConfig.set("spark.broadcast.compress", "false")
      sparkConfig.set("spark.shuffle.compress", "false")
      sparkConfig.set("spark.shuffle.spill.compress", "false")
      new SparkContext("local", "TableStatsSinglePathMain", sparkConfig)
    } else {
      val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain")
      new SparkContext(sparkConfig)
    }

    val sqlContext = new SQLContext(sc)

    val kuduOptions = Map(
      "kudu.table" -> taxiTable,
      "kudu.master" -> kuduMaster)

    sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load.
      registerTempTable("ny_taxi_trip_tmp")

    //Vector
    val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => {
      val taxiTrip = NyTaxiYellowTripBuilder.build(r)
      generateVectorOnly(taxiTrip)
    })

    println("--Running KMeans")
    val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations)
    println(" > vector centers:")
    clusters.clusterCenters.foreach(v => println(" >> " + v))

    println("--Running corr")
    val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson")
    println(" > corr: " + correlMatrix.toString)

    println("--Running colStats")
    val colStats = Statistics.colStats(vectorRDD)
    println(" > max: " + colStats.max)
    println(" > count: " + colStats.count)
    println(" > mean: " + colStats.mean)
    println(" > min: " + colStats.min)
    println(" > normL1: " + colStats.normL1)
    println(" > normL2: " + colStats.normL2)
    println(" > numNonZeros: " + colStats.numNonzeros)
    println(" > variance: " + colStats.variance)

    //Labeled Points
    // (The remainder of the original example, which builds LabeledPoints from the trips, is
    //  omitted here. generateVectorOnly, defined elsewhere in the original source, maps a
    //  NyTaxiYellowTrip to a feature Vector.)
  }
}
Example 29
Source File: HypothesisTestingKolmogorovSmirnovTestExample.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
// $example on$
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD
// $example off$

object HypothesisTestingKolmogorovSmirnovTestExample {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("HypothesisTestingKolmogorovSmirnovTestExample")
    val sc = new SparkContext(conf)

    // $example on$
    val data: RDD[Double] = sc.parallelize(Seq(0.1, 0.15, 0.2, 0.3, 0.25))  // an RDD of sample data

    // run a KS test for the sample versus a standard normal distribution
    val testResult = Statistics.kolmogorovSmirnovTest(data, "norm", 0, 1)
    // summary of the test including the p-value, test statistic, and null hypothesis.
    // If our p-value indicates significance, we can reject the null hypothesis.
    println(testResult)
    println()

    // perform a KS test using a cumulative distribution function of our making
    val myCDF = Map(0.1 -> 0.2, 0.15 -> 0.6, 0.2 -> 0.05, 0.3 -> 0.05, 0.25 -> 0.1)
    val testResult2 = Statistics.kolmogorovSmirnovTest(data, myCDF)
    println(testResult2)
    // $example off$

    sc.stop()
  }
}
// scalastyle:on println 
Example 30
Source File: Correlations.scala    From drizzle-spark   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import scopt.OptionParser

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.util.MLUtils

object Correlations {

  case class Params(input: String = "data/mllib/sample_linear_regression_data.txt")
    extends AbstractParams[Params]

  def main(args: Array[String]): Unit = {

    val defaultParams = Params()

    val parser = new OptionParser[Params]("Correlations") {
      head("Correlations: an example app for computing correlations")
      opt[String]("input")
        .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
        .action((x, c) => c.copy(input = x))
      note(
        """
        |For example, the following command runs this app on a synthetic dataset:
        |
        | bin/spark-submit --class org.apache.spark.examples.mllib.Correlations \
        |  examples/target/scala-*/spark-examples-*.jar \
        |  --input data/mllib/sample_linear_regression_data.txt
        """.stripMargin)
    }

    parser.parse(args, defaultParams) match {
      case Some(params) => run(params)
      case _ => sys.exit(1)
    }
  }

  def run(params: Params): Unit = {
    val conf = new SparkConf().setAppName(s"Correlations with $params")
    val sc = new SparkContext(conf)

    val examples = MLUtils.loadLibSVMFile(sc, params.input).cache()

    println(s"Summary of data file: ${params.input}")
    println(s"${examples.count()} data points")

    // Calculate label -- feature correlations
    val labelRDD = examples.map(_.label)
    val numFeatures = examples.take(1)(0).features.size
    val corrType = "pearson"
    println()
    println(s"Correlation ($corrType) between label and each feature")
    println(s"Feature\tCorrelation")
    var feature = 0
    while (feature < numFeatures) {
      val featureRDD = examples.map(_.features(feature))
      val corr = Statistics.corr(labelRDD, featureRDD)
      println(s"$feature\t$corr")
      feature += 1
    }
    println()

    sc.stop()
  }
}
// scalastyle:on println