org.apache.spark.mllib.linalg.Vectors Scala Examples
The following examples show how to use org.apache.spark.mllib.linalg.Vectors.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: DataFrameExample.scala From drizzle-spark with Apache License 2.0 | 7 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 2
Source File: SummaryStatisticsExample.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 3
Source File: DenseKMeans.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 4
Source File: SparkIntroduction.scala From reactive-machine-learning-systems with MIT License | 6 votes |
package com.reactivemachinelearning import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.evaluation.MulticlassMetrics import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors object SparkIntroduction { def main(args: Array[String]) { // handle args // setup val session = SparkSession.builder.appName("Simple ModelExample").getOrCreate() import session.implicits._ // Load and parse the train and test data val inputBasePath = "example_data" val outputBasePath = "." val trainingDataPath = inputBasePath + "/training.txt" val testingDataPath = inputBasePath + "/testing.txt" val currentOutputPath = outputBasePath + System.currentTimeMillis() val trainingData = session.read.textFile(trainingDataPath) val trainingParsed = trainingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val testingData = session.read.textFile(testingDataPath) val testingParsed = testingData.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val model = LinearRegressionWithSGD.train(trainingParsed.rdd, numIterations) // Evaluate model on testing examples val predictionsAndLabels = testingParsed.map { case LabeledPoint(label, features) => val prediction = model.predict(features) (prediction, label) } // Report performance statistics val metrics = new MulticlassMetrics(predictionsAndLabels.rdd) val precision = metrics.precision val recall = metrics.recall println(s"Precision: $precision Recall: $recall") // Save model model.save(session.sparkContext, currentOutputPath) } }
Example 5
import java.io.{File, PrintWriter} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.GaussianMixture import org.apache.spark.sql.functions._ def computeGaussianMixtureModel( pathToTextFile: String, quantity: Int) { case class Point(x: Double, y: Double) def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-GMM-k${quantity}.tsv" val points = sc .textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .map { row => Point(row(0).toDouble, row(1).toDouble) } val features = points .map { p => Vectors.dense(p.x, p.y) } features.cache() val gmm = new GaussianMixture() .setK(quantity) .run(features) val predictions = features .map { f => (f(0), f(1), gmm.predict(f) + 1) } .collect save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => predictions.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }
Example 6
Source File: PCAOnSourceVectorExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 7
Source File: PCAOnRowMatrixExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 8
Source File: TallSkinnyPCA.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 9
Source File: GaussianMixtureExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
Example 10
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 11
Source File: CosineSimilarity.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 12
Source File: ElementwiseProductExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 13
Source File: SVDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
Example 14
Source File: TallSkinnySVD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 15
Source File: StandardScalerExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 16
Source File: KMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 17
Source File: MultivariateSummarizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 18
Source File: LinearRegressionWithSGDExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 19
Source File: StreamingLinearRegressionExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 20
Source File: BisectingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
Example 21
Source File: StreamingKMeansExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
Example 22
Source File: HashingTF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import java.lang.{Iterable => JavaIterable} import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.SparkException import org.apache.spark.annotation.Since import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.unsafe.hash.Murmur3_x86_32._ import org.apache.spark.unsafe.types.UTF8String import org.apache.spark.util.Utils private[spark] def murmur3Hash(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = UTF8String.fromString(s) hashUnsafeBytes(utf8.getBaseObject, utf8.getBaseOffset, utf8.numBytes(), seed) case _ => throw new SparkException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } }
Example 23
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
Example 24
Source File: GaussianMixtureModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 25
Source File: Word2VecModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 26
Source File: MatrixFactorizationModelWrapper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating} import org.apache.spark.rdd.RDD private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel) extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) { def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = predict(SerDe.asTupleRDD(userAndProducts.rdd)) def getUserFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(userFeatures.map { case (user, feature) => (user, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def getProductFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(productFeatures.map { case (product, feature) => (product, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]]) } }
Example 27
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import breeze.linalg.{diag, eigSym, max, DenseMatrix => DBM, DenseVector => DBV, Vector => BV} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.mllib.util.MLUtils private def calculateCovarianceConstants: (DBM[Double], Double) = { val eigSym.EigSym(d, u) = eigSym(sigma.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t // For numerical stability, values are considered to be non-zero only if they exceed tol. // This prevents any inverted value from exceeding (eps * n * max(d))^-1 val tol = MLUtils.EPSILON * max(d) * d.length try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new DBV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) (pinvS * u.t, -0.5 * (mu.size * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") } } }
Example 28
Source File: SpearmanCorrelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
Example 29
Source File: Updater.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization import scala.math._ import breeze.linalg.{axpy => brzAxpy, norm => brzNorm, Vector => BV} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.mllib.linalg.{Vector, Vectors} @DeveloperApi class SquaredL2Updater extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { // add up both updates from the gradient of the loss (= step) as well as // the gradient of the regularizer (= regParam * weightsOld) // w' = w - thisIterStepSize * (gradient + regParam * w) // w' = (1 - thisIterStepSize * regParam) * w - thisIterStepSize * gradient val thisIterStepSize = stepSize / math.sqrt(iter) val brzWeights: BV[Double] = weightsOld.asBreeze.toDenseVector brzWeights :*= (1.0 - thisIterStepSize * regParam) brzAxpy(-thisIterStepSize, gradient.asBreeze, brzWeights) val norm = brzNorm(brzWeights, 2.0) (Vectors.fromBreeze(brzWeights), 0.5 * regParam * norm * norm) } }
Example 30
Source File: LogisticRegressionDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 31
Source File: SVMDataGenerator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 32
Source File: LabeledPoint.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
Example 33
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("ChiSqSelector by FPR transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Set(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model = new ChiSqSelector().setSelectorType("fpr").setAlpha(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSet assert(filteredData == preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 34
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 35
Source File: IDFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 36
Source File: PCASuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 37
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
Example 38
Source File: EnsembleTestHelper.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 39
Source File: PythonMLLibAPISuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 40
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
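MultivariateGaussian implements the multivariate normal density $f(x) = (2\pi)^{-d/2} \lvert\Sigma\rvert^{-1/2} \exp\big(-\tfrac{1}{2}(x-\mu)^\top \Sigma^{-1} (x-\mu)\big)$. The first univariate assertion is simply $1/\sqrt{2\pi} \approx 0.39894$, and the degenerate test relies on the implementation handling a singular $\Sigma$ through a pseudo-inverse and pseudo-determinant.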
Example 41
Source File: KMeansPMMLModelExportSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.dmg.pmml.ClusteringModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors class KMeansPMMLModelExportSuite extends SparkFunSuite { test("KMeansPMMLModelExport generate PMML format") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) // assert that the PMML format is as expected assert(modelExport.isInstanceOf[PMMLModelExport]) val pmml = modelExport.asInstanceOf[PMMLModelExport].getPmml assert(pmml.getHeader.getDescription === "k-means clustering") // check that the number of fields match the single vector size assert(pmml.getDataDictionary.getNumberOfFields === clusterCenters(0).size) // This verify that there is a model attached to the pmml object and the model is a clustering // one. It also verifies that the pmml model has the same number of clusters of the spark model. val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel] assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length) } }
Example 42
Source File: PMMLModelExportFactorySuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel} import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} import org.apache.spark.mllib.util.LinearDataGenerator class PMMLModelExportFactorySuite extends SparkFunSuite { test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) assert(modelExport.isInstanceOf[KMeansPMMLModelExport]) } test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " + "LinearRegressionModel, RidgeRegressionModel or LassoModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val linearRegressionModel = new LinearRegressionModel(linearInput(0).features, linearInput(0).label) val linearModelExport = PMMLModelExportFactory.createPMMLModelExport(linearRegressionModel) assert(linearModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val ridgeRegressionModel = new RidgeRegressionModel(linearInput(0).features, linearInput(0).label) val ridgeModelExport = PMMLModelExportFactory.createPMMLModelExport(ridgeRegressionModel) assert(ridgeModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val lassoModel = new LassoModel(linearInput(0).features, linearInput(0).label) val lassoModelExport = PMMLModelExportFactory.createPMMLModelExport(lassoModel) assert(lassoModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) } test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " + "when passing a LogisticRegressionModel or SVMModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val logisticRegressionModel = new LogisticRegressionModel(linearInput(0).features, linearInput(0).label) val logisticRegressionModelExport = PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel) assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label) val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel) assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) } test("PMMLModelExportFactory throw IllegalArgumentException " + "when passing a Multinomial Logistic Regression") { val multiclassLogisticRegressionModel = new LogisticRegressionModel( weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, numFeatures = 2, numClasses = 3) intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel) } } test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") { val invalidModel = new Object intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(invalidModel) } } }
Example 43
Source File: CoordinateMatrixSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 44
Source File: LabeledPointSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors class LabeledPointSuite extends SparkFunSuite { test("parse labeled points") { val points = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) points.foreach { p => assert(p === LabeledPoint.parse(p.toString)) } } test("parse labeled points with whitespaces") { val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) } test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) } test("conversions between new ml LabeledPoint and mllib LabeledPoint") { val points: Seq[LabeledPoint] = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) val newPoints: Seq[NewLabeledPoint] = points.map(_.asML) points.zip(newPoints).foreach { case (p1, p2) => assert(p1 === LabeledPoint.fromML(p2)) } } }
Example 45
Source File: RidgeRegressionSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.{LinearDataGenerator, LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.util.Utils private object RidgeRegressionSuite { val model = new RidgeRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5) } class RidgeRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { def predictionError(predictions: Seq[Double], input: Seq[LabeledPoint]): Double = { predictions.zip(input).map { case (prediction, expected) => (prediction - expected.label) * (prediction - expected.label) }.sum / predictions.size } test("ridge regression can help avoid overfitting") { // For small number of examples and large variance of error distribution, // ridge regression should give smaller generalization error that linear regression. val numExamples = 50 val numFeatures = 20 // Pick weights as random values distributed uniformly in [-0.5, 0.5] val random = new Random(42) val w = Array.fill(numFeatures)(random.nextDouble() - 0.5) // Use half of data for training and other half for validation val data = LinearDataGenerator.generateLinearInput(3.0, w, 2 * numExamples, 42, 10.0) val testData = data.take(numExamples) val validationData = data.takeRight(numExamples) val testRDD = sc.parallelize(testData, 2).cache() val validationRDD = sc.parallelize(validationData, 2).cache() // First run without regularization. val linearReg = new LinearRegressionWithSGD() linearReg.optimizer.setNumIterations(200) .setStepSize(1.0) val linearModel = linearReg.run(testRDD) val linearErr = predictionError( linearModel.predict(validationRDD.map(_.features)).collect(), validationData) val ridgeReg = new RidgeRegressionWithSGD() ridgeReg.optimizer.setNumIterations(200) .setRegParam(0.1) .setStepSize(1.0) val ridgeModel = ridgeReg.run(testRDD) val ridgeErr = predictionError( ridgeModel.predict(validationRDD.map(_.features)).collect(), validationData) // Ridge validation error should be lower than linear regression. assert(ridgeErr < linearErr, "ridgeError (" + ridgeErr + ") was not less than linearError(" + linearErr + ")") } test("model save/load") { val model = RidgeRegressionSuite.model val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = RidgeRegressionModel.load(sc, path) assert(model.weights == sameModel.weights) assert(model.intercept == sameModel.intercept) } finally { Utils.deleteRecursively(tempDir) } } } class RidgeRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext { test("task size should be small in both training and prediction") { val m = 4 val n = 200000 val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => LabeledPoint(1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() // If we serialize data directly in the task closure, the size of the serialized task would be // greater than 1MB and hence Spark would throw an error. val model = RidgeRegressionWithSGD.train(points, 2) val predictions = model.predict(points.map(_.features)) } }
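Why the test expects ridgeErr to be lower than linearErr: the L2 penalty shrinks the weights and reduces variance on this small, noisy training set. Up to MLlib's exact scaling constants, the objective being minimized is roughly $\min_w \frac{1}{n}\sum_i \tfrac{1}{2}(w^\top x_i - y_i)^2 + \tfrac{\lambda}{2}\lVert w \rVert_2^2$, with $\lambda$ supplied through setRegParam.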
Example 46
Source File: TestMPSLinearProgramSolver.scala From spark-lp with Apache License 2.0 | 5 votes |
// Note: this excerpt omitted the original package and import block; the imports below are
// reconstructed from the other spark-lp examples in this collection and may differ slightly
// from the original file.
import java.io.File

import com.joptimizer.optimizers.LPStandardConverter
import com.joptimizer.util.MPSParser

import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.optimization.lp.LP
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.{SparkConf, SparkContext}

object TestMPSLinearProgramSolver {

  def main(args: Array[String]) {

    val conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("TestMPSLinearProgramSolver")
    val sc = new SparkContext(conf)

    // Parse the provided MPS file.
    val parser = new MPSParser()
    val mpsFile = new File(args(0))
    parser.parse(mpsFile)

    // Convert the parsed linear program to standard form.
    val converter = new LPStandardConverter(true)
    converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB,
      parser.getLb, parser.getUb)

    // Convert the parameters of the linear program to spark lp compatible formats.
    val numPartitions = 2
    val c: DVector = sc.parallelize(converter.getStandardC.toArray, numPartitions)
      .glom.map(new DenseVector(_))
    val B: DMatrix = sc.parallelize(converter.getStandardA.toArray.transpose.map(
      Vectors.dense(_).toSparse: Vector), numPartitions)
    val b = new DenseVector(converter.getStandardB.toArray)

    println("Start solving ... ")
    val (optimalVal, optimalX) = LP.solve(c, B, b, sc = sc)
    println("optimalVal: " + optimalVal)
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
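LPStandardConverter rewrites the parsed problem in standard form, $\min_x c^\top x$ subject to $Ax = b$, $x \ge 0$, which is the shape fed to LP.solve above: the distributed c vector, the row-distributed constraint matrix B, and the dense right-hand side b.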
Example 47
Source File: LinopMatrixAdjoint.scala From spark-lp with Apache License 2.0 | 5 votes |
override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } , depth ) } }
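This operator applies the adjoint (transpose) of the row-distributed matrix: for a distributed vector $x$ it computes $A^\top x = \sum_i x_i a_i$, where $a_i$ is the $i$-th matrix row. Each partition accumulates its partial sum with BLAS.axpy, and the per-partition sums are then combined with treeAggregate at the given depth.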
Example 48
Source File: SpLinopMatrixSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.fs.dvector.dmatrix._ class SpLinopMatrixSuite extends FunSuite with MLlibTestSparkContext { test("SpLinopMatrix.apply is implemented properly") { val matrix: DMatrix = sc.parallelize(Array( Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) val vector: DVector = sc.parallelize(Array(2.0, 3.0), 2).glom.map(new DenseVector(_)) val expectApply: DMatrix = sc.parallelize(Array( Vectors.dense(2.0 * 1.0, 2.0 * 2.0, 2.0 * 3.0), Vectors.dense(3.0 * 4.0, 3.0 * 5.0, 3.0 * 6.0)), 2) assert((new SpLinopMatrix(vector))(matrix).collect().deep == expectApply.collect().deep, // or sameElements "SpLinopMatrix.apply should return the correct result.") } }
Example 49
Source File: InitializeSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ import org.apache.spark.mllib.optimization.lp.VectorSpace._ import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} import org.apache.spark.mllib.optimization.tfocs.VectorSpace.{DMatrix, DVector} class InitializeSuite extends FunSuite with MLlibTestSparkContext { val numPartitions = 2 val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0) val BArray = Array( Array(12.0, 16.0, 30.0, 1.0, 0.0), Array(24.0, 16.0, 12.0, 0.0, 1.0), Array(-1.0, 0.0, 0.0, 0.0, 0.0), Array(0.0, -1.0, 0.0, 0.0, 0.0), Array(0.0, 0.0, -1.0, 0.0, 0.0), Array(0.0, 0.0, 0.0, 1.0, 0.0), Array(0.0, 0.0, 0.0, 0.0, 1.0)) val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0) lazy val c: DVector = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_)) lazy val rows: DMatrix = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_)) lazy val b: DenseVector = new DenseVector(bArray) val cBrz = new BDV[Double](cArray) val BBrz = new BDM[Double](7, 5, BArray.flatMap(x => x), offset = 0, majorStride = 5, isTranspose = true) val bBrz = new BDV[Double](bArray) // (BT * B) ^(-1) val BTBInv = inv(BBrz.t * BBrz) // xTilda = B * BTBInv * b val xTilda: BDV[Double] = BBrz * (BTBInv * bBrz) // lambdaTilda = BTBInv * (B^T * c) val lambdaTilda: BDV[Double] = BTBInv * (BBrz.t * cBrz) // sTilda = c - B * lambdaTilda val sTilda = cBrz - BBrz * lambdaTilda val deltax = Math.max(1.5 * max(xTilda), 0) val deltas = Math.max(1.5 * max(sTilda), 0) val xHat = xTilda :+ deltax val sHat = sTilda :+ deltas val deltaxHat: Double = 0.5 * (xHat.t * sHat) / sum(sHat) val deltasHat: Double = 0.5 * (xHat.t * sHat) / sum(xHat) // x = xHat + deltaxHat * e val expectedx: BDV[Double] = xHat :+ deltaxHat // val expectedLambda = lambdaTilda val expecteds: BDV[Double] = sHat :+ deltasHat test("Initialize.init is implemented properly") { val result = Initialize.init(c, rows, b) //println(LP.solve(c, rows, b, 1e-4, 1).collect()) assert(Vectors.dense(expectedx.toArray) ~= Vectors.dense(result._1.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init x0 is not computed correctly.") assert(Vectors.dense(lambdaTilda.toArray) ~= Vectors.dense(result._2.toArray) relTol 1e-6, "Initialize.init lambda0 is not computed correctly.") assert(Vectors.dense(expecteds.toArray) ~= Vectors.dense(result._3.flatMap(_.toArray).collect()) relTol 1e-6, "Initialize.init s0 should return the correct answer.") } }
Example 50
Source File: LPSuite.scala From spark-lp with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.lp

import org.scalatest.FunSuite

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.{DenseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.mllib.optimization.lp.VectorSpace._
import org.apache.spark.mllib.optimization.lp.vs.dvector.DVectorSpace
import org.apache.spark.mllib.optimization.lp.vs.vector.DenseVectorSpace

class LPSuite extends FunSuite with MLlibTestSparkContext {

  val numPartitions = 2
  val cArray = Array(2.0, 1.5, 0.0, 0.0, 0.0, 0.0, 0.0)
  val BArray = Array(
    Array(12.0, 16.0, 30.0, 1.0, 0.0),
    Array(24.0, 16.0, 12.0, 0.0, 1.0),
    Array(-1.0, 0.0, 0.0, 0.0, 0.0),
    Array(0.0, -1.0, 0.0, 0.0, 0.0),
    Array(0.0, 0.0, -1.0, 0.0, 0.0),
    Array(0.0, 0.0, 0.0, 1.0, 0.0),
    Array(0.0, 0.0, 0.0, 0.0, 1.0))
  val bArray = Array(120.0, 120.0, 120.0, 15.0, 15.0)

  lazy val c = sc.parallelize(cArray, numPartitions).glom.map(new DenseVector(_))
  lazy val rows = sc.parallelize(BArray, numPartitions).map(Vectors.dense(_))
  lazy val b = new DenseVector(bArray)

  test("LP solve is implemented properly") {
    val (v, x) = LP.solve(c, rows, b, sc = sc)
    // Solution obtained from scipy.optimize.linprog and the Octave GLPK LP solver, with fun_val = 12.083.
    val expectedSol = Vectors.dense(
      Array(1.66666667, 5.83333333, 40.0, 0.0, 0.0, 13.33333333, 9.16666667))
    val xx = Vectors.dense(x.flatMap(_.toArray).collect())
    println(s"$xx")
    println("optimal min value: " + v)
    assert(xx ~== expectedSol absTol 1e-6, "LP.solve x should return the correct answer.")
  }
}
Example 51
Source File: Main.scala From didactic-computing-machine with GNU Affero General Public License v3.0 | 5 votes |
package example import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkConf, SparkContext} object Main extends App { val conf = new SparkConf() .setAppName("K means cluster") .setMaster("local") val sc = SparkContext .getOrCreate(conf) val data = sc.parallelize( Vector( Vector(-4.0, -1.0, -4.0), Vector(2.0, 0.0, 0.0), Vector(1.0, -2.0, 4.0), Vector(-3.0, -4.0, -1.0), Vector(2.0, -4.0, 0.0), Vector(2.0, 1.0, -5), Vector(3.0, -3.0, 0.0), Vector(-1.0, -1.0, 1.0) ).map(t => Vectors.dense(t.toArray))) val numOfClusters = 3 val numOfIterations = 100 val clusters = KMeans.train(data, numOfClusters, numOfIterations) println("Cluster centers") clusters.clusterCenters.foreach(println) println("Squared Errors") println(clusters.computeCost(data)) println("Predictions") println(clusters.predict(Vectors.dense(0.0, 0.0, 0.0))) println(clusters.predict(Vectors.dense(-3.0, -2.0, 1.5))) }
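A small follow-up sketch, reusing the data RDD and iteration count from the example above (the candidate range 2 to 6 is an arbitrary assumption): a common way to choose the number of clusters is to compare the within-set sum of squared errors reported by computeCost.

import org.apache.spark.mllib.clustering.KMeans

val costs = (2 to 6).map { k =>
  val model = KMeans.train(data, k, numOfIterations)   // data: RDD[Vector] built above
  k -> model.computeCost(data)
}
costs.foreach { case (k, wssse) => println(s"k=$k WSSSE=$wssse") }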
Example 52
Source File: DigitRecognizer.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.train

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

  // Excerpt: the enclosing object and main method, the trainNBWithParams helper and the
  // testData RDD are defined earlier in the original source file and are omitted here.
  val predictResult = Seq(0.001, 0.01, 0.1, 1.0, 10.0).map { param =>
    val nbModel = trainNBWithParams(testData, param, "multinomial")
    val predictResult = testData.map { labeledPoint =>
      val predicted = nbModel.predict(labeledPoint.features)
      if (predicted > 0.5) 1 else 0
    }.reduce(_ + _)
    // Use floating-point division; the original predictResult / testData.count * 1.0
    // divided two integers first and therefore always produced 0.0 or 1.0.
    val accuracy = predictResult.toDouble / testData.count
    println(s"nb model with lambda:$param,modelType:multinomial,Accuracy:$accuracy")
  }
  // The two closing braces below close main() and the enclosing object, whose
  // declarations were elided from this excerpt.
  }
}
Example 53
Source File: DenseKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.mllib

import org.apache.log4j.{Level, Logger}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object DenseKMeans {

  object InitializationMode extends Enumeration {
    type InitializationMode = Value
    val Random, Parallel = Value
  }

  import InitializationMode._

  case class Params(
      var input: String = null,
      k: Int = 2,
      numIterations: Int = 10,
      initializationMode: InitializationMode = Parallel) extends AbstractParams[Params]

  def main(args: Array[String]) {
    val defaultParams = Params()
    defaultParams.input = args(0)
    run(defaultParams)
  }

  def run(params: Params) {
    val conf = new SparkConf().setAppName(s"DenseKMeans with $params").setMaster("local")
    val sc = new SparkContext(conf)

    Logger.getRootLogger.setLevel(Level.WARN)

    val examples = sc.textFile(params.input).map { line =>
      Vectors.dense(line.split(' ').map(_.toDouble))
    }.cache()

    val numExamples = examples.count()
    println(s"numExamples = $numExamples.")

    val initMode = params.initializationMode match {
      case Random => KMeans.RANDOM
      case Parallel => KMeans.K_MEANS_PARALLEL
    }

    val model = new KMeans()
      .setInitializationMode(initMode)
      .setK(params.k)
      .setMaxIterations(params.numIterations)
      .run(examples)

    // Return the K-means cost (sum of squared distances of points to their nearest center)
    // for this model.
    val cost = model.computeCost(examples)

    // Get the k cluster centers.
    val centerPoint = model.clusterCenters
    val one = centerPoint(0)
    val two = centerPoint(1)
    println(s"centerPoint=$one,$two.")

    println(s"Total cost = $cost.")

    sc.stop()
  }
}
// scalastyle:on println
Example 54
Source File: StreamingLogisticRegression.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.bigchange.mllib import com.bigchange.util.{FileUtil, TimeUtil} import org.apache.spark.SparkConf import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} object StreamingLogisticRegression { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StreamingLogisticRegression <trainingDir> <testDir> <batchDuration> <numFeatures>") System.exit(1) } val conf = new SparkConf().setMaster("local").setAppName("StreamingLogisticRegression") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(args(3).toInt)) model.trainOn(trainingData) // model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() model.predictOnValues(testData.map(lp => (lp.label, lp.features))).map(x => x._1 +"\t" +x._2).foreachRDD(rdd =>{ val value = rdd.collect() FileUtil.normalFileWriter("F:\\datatest\\ai\\StreamingLogisticRegression\\"+TimeUtil.getCurrentHour,value) }) ssc.start() ssc.awaitTermination() } } // scalastyle:on println
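The streams above are plain text files parsed with LabeledPoint.parse, so every line must use the LabeledPoint string format. A minimal sketch of what such lines look like (the three-feature values below are illustrative, not taken from the original project):

import org.apache.spark.mllib.regression.LabeledPoint

val sampleLines = Seq(
  "(1.0,[0.5,1.2,0.0])",    // label 1.0 with dense features
  "(0.0,(3,[1],[-1.0]))"    // label 0.0 with sparse features: size, indices, values
)
sampleLines.map(LabeledPoint.parse).foreach(println)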
Example 55
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {

    val ssc = new StreamingContext("local", "test", Seconds(10))
    val stream = ssc.socketTextStream("localhost", 9999)

    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // Use DStream's transform operator to score each batch with the latest model.
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // Compute the MSE and RMSE for each batch.
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 56
Source File: PipeClassificationNaiveBayes.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification

import scala.beans.BeanInfo

import org.apache.spark.mllib.classification.NaiveBayes
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

import de.unihamburg.vsis.sddf.SddfContext.SymPairSim
import de.unihamburg.vsis.sddf.reading.SymPair
import de.unihamburg.vsis.sddf.reading.Tuple

import org.apache.spark.mllib.classification.NaiveBayesModel

class PipeClassificationNaiveBayes(lambda: Double = 1.0) extends AbstractPipeClassification {

  val paramMap: Map[String, Any] = Map(("lambda", lambda))

  def trainModelAndClassify(
      trainingData: RDD[LabeledPoint],
      symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = {

    val model = NaiveBayes.train(trainingData, lambda)

    log.debug("Classification Model:" + model)
    log.debug("Classification Model labels :" + model.labels.mkString(" "))
    log.debug("Classification Model pi: " + model.pi.mkString(" "))
    // The original concatenated the Unit result of foreach, which always logged "()";
    // build the theta string explicitly instead.
    log.debug("Classification Model theta: " + model.theta.map(_.mkString(" ")).mkString("; "))

    // Marking Missing Values as Not Equal (0)
    symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2))))
  }
}

object PipeClassificationNaiveBayes {
  def apply(lambda: Double = 1.0) = {
    new PipeClassificationNaiveBayes(lambda)
  }
}
Example 57
Source File: PipeClassificationTrainingDataGenerator.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.compat.Platform import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import com.rockymadden.stringmetric.StringMetric import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.logging.Logging import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.similarity.SimilarityCalculator import de.unihamburg.vsis.sddf.sparkextensions.RddUtils.securlyZipRdds import de.unihamburg.vsis.sddf.visualisation.model.TrainingSetModel import de.unihamburg.vsis.sddf.visualisation.model.BasicAnalysable class PipeClassificationTrainingDataGenerator( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) extends PipeElement[SymPairSim, (SymPairSim, RDD[LabeledPoint])] with Logging { override def step(input: SymPairSim)(implicit pipeContext: AbstractPipeContext) = { pipeContext match { case pc: GoldstandardContext with CorpusContext => { var truePositiveFraction = truePositiveCount / pc.goldstandard.count.toDouble var trueNegativeFraction = trueNegativeCount / pc.corpus.count.toDouble log.debug("True positive pair fraction taken from the gold standard for training purposes: " + truePositiveFraction) log.debug("True negative pair fraction taken from the corpus for training purposes: " + trueNegativeFraction) if (truePositiveFraction > 1.0) { truePositiveFraction = 1.0 log.debug("True positive pair fraction limited to 1.0") } if (trueNegativeFraction > 1.0) { trueNegativeFraction = 1.0 log.debug("True negative pair fraction limited to 1.0") } val result = generateTrainingData(pc.corpus, pc.goldstandard, truePositiveFraction, trueNegativeFraction) (input, result) } case _ => { throw new Exception("Wrong AbstractPipeContext type.") } } } object PipeClassificationTrainingDataGenerator { val All = -1 def apply( truePositiveCount: Int = 500, trueNegativeCount: Int = 500)( implicit featureMeasures: Array[(Int, StringMetric[Double])]) = { new PipeClassificationTrainingDataGenerator(truePositiveCount, trueNegativeCount) } }
Example 58
Source File: PipeClassificationDecisionTree.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.DecisionTree import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.pipe.PipeElement import de.unihamburg.vsis.sddf.pipe.context.AbstractPipeContext import de.unihamburg.vsis.sddf.pipe.context.CorpusContext import de.unihamburg.vsis.sddf.pipe.context.GoldstandardContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.visualisation.model.AlgoAnalysable import de.unihamburg.vsis.sddf.Parameterized import org.apache.spark.mllib.classification.ClassificationModel class PipeClassificationDecisionTree( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("impurity", impurity), ("maxDepth", maxDepth), ("maxBins", maxBins)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = DecisionTree.trainClassifier(trainingData, numClasses = 2, categoricalFeaturesInfo = Map[Int, Int](), impurity, maxDepth, maxBins) log.debug("Decision Tree Model:" + model) log.debug("Decision Tree:" + model.toDebugString) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationDecisionTree { def apply( impurity: String = "gini", maxDepth: Int = 5, maxBins: Int = 32) = { new PipeClassificationDecisionTree(impurity, maxDepth, maxBins) } }
Example 59
Source File: PipeClassificationSvm.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.classification import scala.beans.BeanInfo import org.apache.spark.mllib.classification.NaiveBayes import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import org.apache.spark.mllib.classification.SVMWithSGD class PipeClassificationSvm(numIterations: Int = 100) extends AbstractPipeClassification { val paramMap: Map[String, Any] = Map(("numIterations", numIterations)) def trainModelAndClassify( trainingData: RDD[LabeledPoint], symPairSim: SymPairSim): RDD[(SymPair[Tuple], Array[Double], Double)] = { val model = SVMWithSGD.train(trainingData, numIterations) log.debug("Classification Model:" + model) // Marking Missing Values as Not Equal (0) symPairSim.map(pair => (pair._1, pair._2, model.predict(Vectors.dense(pair._2)))) } } object PipeClassificationSvm { def apply(numIterations: Int = 100) = { new PipeClassificationSvm(numIterations) } }
Example 60
Source File: PipeDecisionTest.scala From sddf with GNU General Public License v3.0 | 5 votes |
package de.unihamburg.vsis.sddf.test.classification import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.scalatest.BeforeAndAfterAll import org.scalatest.FunSuite import de.unihamburg.vsis.sddf.SddfContext.Duplicate import de.unihamburg.vsis.sddf.SddfContext.NoDuplicate import de.unihamburg.vsis.sddf.SddfContext.SymPairSim import de.unihamburg.vsis.sddf.classification.PipeClassificationDecisionTree import de.unihamburg.vsis.sddf.classification.PipeClassificationNaiveBayes import de.unihamburg.vsis.sddf.classification.PipeClassificationSvm import de.unihamburg.vsis.sddf.pipe.context.SddfPipeContext import de.unihamburg.vsis.sddf.reading.SymPair import de.unihamburg.vsis.sddf.reading.Tuple import de.unihamburg.vsis.sddf.test.util.LocalSparkContext class PipeClassificationTest extends FunSuite with LocalSparkContext with BeforeAndAfterAll{ var input: (SymPairSim, RDD[LabeledPoint]) = _ override def beforeAll() { super.beforeAll() val tuple1 = Tuple("test1","test1","test1") tuple1.id = 1 val tuple2 = Tuple("test2","test2","test2") tuple2.id = 2 val tuple3 = Tuple("hans","franz","wurst") tuple3.id = 3 val symPairSim: SymPairSim = sc.parallelize(Seq( (new SymPair(tuple1, tuple2), Array(1D,1D,0D)) ,(new SymPair(tuple2, tuple3), Array(0D,0D,1D)) )) val trainingData: RDD[LabeledPoint] = sc.parallelize(Seq( LabeledPoint(label = Duplicate, features = Vectors.dense(Array(0.99,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.875,0.0))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,1.0,0.1))) ,LabeledPoint(label = Duplicate, features = Vectors.dense(Array(1.0,0.89,0.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.1,0.0,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.0,0.2,1.0))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.06,0.0,0.89))) ,LabeledPoint(label = NoDuplicate, features = Vectors.dense(Array(0.21,0.19,0.91))) )) input = (symPairSim, trainingData) } override def afterAll() { super.afterAll() } test("naive bayes classification test") { val classificationPipe = new PipeClassificationNaiveBayes() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("svm classification test") { val classificationPipe = new PipeClassificationSvm() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } test("decision tree classification test") { val classificationPipe = new PipeClassificationDecisionTree() implicit val pipeContext = new SddfPipeContext() val result = classificationPipe.run(input) assert(result.count === 1) } }
Example 61
Source File: BisectingKMeansModel.scala From bisecting-kmeans with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.bisectingkmeans import breeze.linalg.{Vector => BV, norm => breezeNorm} import org.apache.spark.Logging import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD def toJavaLinkageMatrix: java.util.ArrayList[java.util.ArrayList[java.lang.Double]] = { val javaList = new java.util.ArrayList[java.util.ArrayList[java.lang.Double]]() this.node.toLinkageMatrix.foreach {x => val row = new java.util.ArrayList[java.lang.Double]() row.add(x._1.toDouble) row.add(x._2.toDouble) row.add(x._3.toDouble) row.add(x._4.toDouble) javaList.add(row) } javaList } }
Example 62
Source File: TestMPSLinearProgram.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import java.io.File import com.joptimizer.optimizers.LPStandardConverter import com.joptimizer.util.MPSParser import org.apache.spark.mllib.linalg.{ DenseVector, Vector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.SolverSLP import org.apache.spark.{ SparkConf, SparkContext } object TestMPSLinearProgram { def main(args: Array[String]) { val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestMPSLinearProgram") val sc = new SparkContext(sparkConf) // Parse the provided MPS file. val parser = new MPSParser() var mpsFile = new File(args(0)) parser.parse(mpsFile) // Convert the parsed linear program to standard form. val converter = new LPStandardConverter(true) converter.toStandardForm(parser.getC, parser.getG, parser.getH, parser.getA, parser.getB, parser.getLb, parser.getUb) // Convert the parameters of the linear program to spark tfocs compatible formats. val c = sc.parallelize(converter.getStandardC.toArray).glom.map(new DenseVector(_)) val A = sc.parallelize(converter.getStandardA.toArray.transpose.map( Vectors.dense(_).toSparse: Vector)) val b = new DenseVector(converter.getStandardB.toArray) val n = converter.getStandardN val mu = 1e-2 // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'. val (optimalX, _) = SolverSLP.run(c, A, b, mu) println("optimalX: " + optimalX.collectElements.mkString(", ")) sc.stop() } }
Example 63
Source File: TestLASSO.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.examples import scala.util.Random import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.SolverL1RLS import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.{ SparkConf, SparkContext } object TestLASSO { def main(args: Array[String]) { val rnd = new Random(34324) val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLASSO") val sc = new SparkContext(sparkConf) val n = 1024 // Design matrix column count. val m = n / 2 // Design matrix row count. val k = m / 5 // Count of nonzero weights. // Generate the design matrix using random normal values, then normalize the columns. val unnormalizedA = RandomRDDs.normalVectorRDD(sc, m, n, 0, rnd.nextLong) val AColumnNormSq = unnormalizedA.treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum, rowA) => { val rowASq = Vectors.dense(rowA.toArray.map(rowA_i => rowA_i * rowA_i)) BLAS.axpy(1.0, rowASq, sum) sum }, combOp = (sum1, sum2) => { BLAS.axpy(1.0, sum2, sum1) sum1 }) val A = unnormalizedA.map(rowA => Vectors.dense(rowA.toArray.zip(AColumnNormSq.toArray).map { case (rowA_i, normsq_i) => rowA_i / math.sqrt(normsq_i) })) // Generate the actual 'x' vector, including 'k' nonzero values. val x = Vectors.zeros(n).toDense for (i <- rnd.shuffle(0 to n - 1).take(k)) { x.values(i) = rnd.nextGaussian } // Generate the 'b' vector using the design matrix and weights, adding gaussian noise. val bOriginal = new DenseVector(A.map(rowA => BLAS.dot(rowA, x)).collect) val snr = 30 // SNR in dB val sigma = math.pow(10, ((10 * math.log10(math.pow(Vectors.norm(bOriginal, 2), 2) / n) - snr) / 20)) val b = sc.parallelize(bOriginal.values.map(_ + sigma * rnd.nextGaussian)) .glom .map(new DenseVector(_)) // Set 'lambda' using the noise standard deviation. val lambda = 2 * sigma * math.sqrt(2 * math.log(n)) // Solve the lasso problem using SolverL1RLS, finding the estimated x vector 'estimatedX'. val (estimatedX, _) = SolverL1RLS.run(A, b, lambda) println("estimatedX: " + estimatedX.values.mkString(", ")) sc.stop() } }
Example 64
Source File: SolverL1RLS.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ import org.apache.spark.mllib.optimization.tfocs.vs.vector._ def run(A: DMatrix, b: DVector, lambda: Double, x0: Option[DenseVector] = None): (DenseVector, Array[Double]) = { val (x, TFOCS.OptimizationData(lossHistory, _, _)) = TFOCS.optimize(new SmoothQuad(b), new LinopMatrix(A), new ProxL1(lambda), x0.getOrElse(Vectors.zeros(A.first().size).toDense)) (x, lossHistory) } }
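Putting the pieces together, SolverL1RLS minimizes the L1-regularized least-squares objective $\tfrac{1}{2}\lVert Ax - b \rVert_2^2 + \lambda \lVert x \rVert_1$: SmoothQuad(b) supplies the quadratic term, LinopMatrix(A) supplies the linear operator, and ProxL1(lambda) supplies the non-smooth penalty.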
Example 65
Source File: SolverSLP.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.fs.dvector.double._ import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.double._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector._ object SolverSLP { def run( c: DVector, A: DMatrix, b: DenseVector, mu: Double, x0: Option[DVector] = None, z0: Option[DenseVector] = None, numContinuations: Int = 10, tol: Double = 1e-4, initialTol: Double = 1e-3, dualTolCheckInterval: Int = 10): (DVector, Array[Double]) = { val minusB = b.copy BLAS.scal(-1.0, minusB) TFOCS_SCD.optimize(new ProxShiftRPlus(c), new LinopMatrixAdjoint(A, minusB), new ProxZero(), mu, x0.getOrElse(c.mapElements(_ => 0.0)), z0.getOrElse(Vectors.zeros(b.size).toDense), numContinuations, tol, initialTol, dualTolCheckInterval) } }
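SolverSLP targets the standard-form LP $\min_x c^\top x$ subject to $Ax = b$, $x \ge 0$, after adding a smoothing term of roughly $\tfrac{\mu}{2}\lVert x - x_0 \rVert_2^2$; the numContinuations and tolerance parameters control a continuation loop that successively tightens this approximation toward the original LP.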
Example 66
Source File: ProxL1.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.vector.double import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.{ ProxCapableFunction, ProxMode, ProxValue } class ProxL1(q: Double) extends ProxCapableFunction[DenseVector] { require(q > 0) override def apply(z: DenseVector, t: Double, mode: ProxMode): ProxValue[DenseVector] = { // NOTE DenseVectors are assumed here (not sparse safe). val shrinkage = q * t val minimizer = shrinkage match { case 0.0 => z case _ => new DenseVector(z.values.map(z_i => z_i * (1.0 - math.min(shrinkage / math.abs(z_i), 1.0)))) } val f = if (mode.f) Some(apply(minimizer)) else None ProxValue(f, Some(minimizer)) } override def apply(x: DenseVector): Double = q * Vectors.norm(x, 1) }
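The minimizer computed above is the soft-thresholding (shrinkage) operator for the scaled L1 norm, $\operatorname{prox}_{tq\lVert\cdot\rVert_1}(z)_i = \operatorname{sign}(z_i)\max(\lvert z_i \rvert - tq, 0)$, which is algebraically identical to the $z_i (1 - \min(tq/\lvert z_i \rvert, 1))$ expression used in the code.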
Example 67
Source File: LinopMatrixAdjoint.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector import org.apache.spark.mllib.linalg.BLAS import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.CheckedIteratorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.LinearOperator import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class LinopMatrixAdjoint(@transient private val matrix: DMatrix) extends LinearOperator[DVector, DenseVector] { if (matrix.getStorageLevel == StorageLevel.NONE) { matrix.cache() } private lazy val n = matrix.first().size override def apply(x: DVector): DenseVector = { val n = this.n matrix.zipPartitions(x)((matrixPartition, xPartition) => Iterator.single( matrixPartition.checkedZip(xPartition.next.values.toIterator).aggregate( // NOTE A DenseVector result is assumed here (not sparse safe). Vectors.zeros(n).toDense)( seqop = (_, _) match { case (sum, (matrix_i, x_i)) => { // Multiply an element of x by its corresponding matrix row, and add to the // accumulation sum vector. BLAS.axpy(x_i, matrix_i, sum) sum } }, combop = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } )) ).treeAggregate(Vectors.zeros(n).toDense)( seqOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 }, combOp = (sum1, sum2) => { // Add the intermediate sum vectors. BLAS.axpy(1.0, sum2, sum1) sum1 } ) } override def t: LinearOperator[DenseVector, DVector] = new LinopMatrix(matrix) }
Example 68
Source File: SmoothQuad.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs.fs.dvector.double import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.{ Mode, SmoothFunction, Value } import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.storage.StorageLevel class SmoothQuad(x0: DVector) extends SmoothFunction[DVector] { if (x0.getStorageLevel == StorageLevel.NONE) { x0.cache() } override def apply(x: DVector, mode: Mode): Value[DVector] = { // Compute the squared error gradient (just the difference between vectors). val g = x.diff(x0) // If both f and g are requested then g will be read twice, so cache it. if (mode.f && mode.g) g.cache() val f = if (mode.f) { // Compute the squared error. // TODO If f is required but not g, then performance might be improved by reimplementing as // a single aggregate using 'x' and 'x0' without an intermediate 'g' DVector, which breaks // per-element pipelining. Some(g.aggregate(0.0)((sum, gPart) => sum + math.pow(Vectors.norm(gPart, 2), 2), _ + _) / 2.0) } else { None } Value(f, Some(g)) } }
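In formula form, SmoothQuad(x0) evaluates $f(x) = \tfrac{1}{2}\lVert x - x_0 \rVert_2^2$ with gradient $\nabla f(x) = x - x_0$, which is why the gradient is just the element-wise difference and the function value is accumulated as half the sum of squared per-partition norms.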
Example 69
Source File: VectorSpaceSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.VectorSpace._ import org.apache.spark.mllib.optimization.tfocs.vs.dvector.DVectorSpace import org.apache.spark.mllib.optimization.tfocs.vs.dvectordouble.DVectorDoubleSpace import org.apache.spark.mllib.optimization.tfocs.vs.vector.DenseVectorSpace class VectorSpaceSuite extends FunSuite with MLlibTestSparkContext { test("DenseVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = new DenseVector(Array(2.0, 3.0)) val beta = 4.0 val b = new DenseVector(Array(5.0, 6.0)) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0) assert(DenseVectorSpace.combine(alpha, a, beta, b) == expectedCombination, "DenseVectorSpace.combine should return the correct result.") } test("DenseVectorSpace.dot is implemented properly") { val a = new DenseVector(Array(2.0, 3.0)) val b = new DenseVector(Array(5.0, 6.0)) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 assert(DenseVectorSpace.dot(a, b) == expectedDot, "DenseVectorSpace.dot should return the correct result.") } test("DVectorSpace.combine is implemented properly") { val alpha = 1.1 val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val beta = 4.0 val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val combination = DVectorSpace.combine(alpha, a, beta, b) val expectedCombination = Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0) assert(Vectors.dense(combination.collectElements) == expectedCombination, "DVectorSpace.combine should return the correct result.") } test("DVectorSpace.dot is implemented properly") { val a = sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2) val b = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 assert(DVectorSpace.dot(a, b) == expectedDot, "DVectorSpace.dot should return the correct result.") } test("DVectorDoubleSpace.combine is implemented properly") { val alpha = 1.1 val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val beta = 4.0 val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val combination = DVectorDoubleSpace.combine(alpha, a, beta, b) val expectedCombination = (Vectors.dense(1.1 * 2.0 + 4.0 * 5.0, 1.1 * 3.0 + 4.0 * 6.0, 1.1 * 4.0 + 4.0 * 7.0), 1.1 * 9.9 + 4.0 * 11.11) assert(Vectors.dense(combination._1.collectElements) == expectedCombination._1, "DVectorVectorSpace.combine should return the correct result.") assert(combination._2 == expectedCombination._2, "DVectorVectorSpace.combine should return the correct result.") } test("DVectorDoubleSpace.dot is implemented properly") { val a = (sc.parallelize(Array(new DenseVector(Array(2.0, 3.0)), new DenseVector(Array(4.0))), 2), 9.9) val b = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), new DenseVector(Array(7.0))), 2), 11.11) val expectedDot = 2.0 * 5.0 + 3.0 * 6.0 + 4.0 * 7.0 + 9.9 * 11.11 assert(DVectorDoubleSpace.dot(a, b) == expectedDot, "DVectorVectorSpace.dot should return the correct result.") } }
Example 70
Source File: LinearOperatorSuite.scala From spark-tfocs with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.optimization.tfocs import org.scalatest.FunSuite import org.apache.spark.SparkException import org.apache.spark.mllib.linalg.{ DenseVector, Vectors } import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._ import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvector.LinopMatrix import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint import org.apache.spark.mllib.optimization.tfocs.fs.vector.dvectordouble.{ LinopMatrix => LinopMatrixVector } import org.apache.spark.mllib.optimization.tfocs.fs.dvectordouble.vector.{ LinopMatrixAdjoint => LinopMatrixVectorAdjoint } import org.apache.spark.mllib.util.MLlibTestSparkContext class LinearOperatorSuite extends FunSuite with MLlibTestSparkContext { lazy val matrix = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0)), 2) lazy val vector = new DenseVector(Array(2.2, 3.3, 4.4)) test("LinopMatrix multiplies properly") { val f = new LinopMatrix(matrix) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = Vectors.dense(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9) assert(Vectors.dense(result.collectElements) == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint multiplies properly") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6, 2 * 5 + 5 * 6, 3 * 5 + 6 * 6) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixAdjoint(matrix) val y = sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2) intercept[SparkException] { f(y) } } test("LinopMatrixVector multiplies properly") { val f = new LinopMatrixVector(matrix, vector) val x = new DenseVector(Array(7.0, 8.0, 9.0)) val result = f(x) val expectedResult = (new DenseVector(Array(1 * 7 + 2 * 8 + 3 * 9, 4 * 7 + 5 * 8 + 6 * 9)), 7.0 * 2.2 + 8.0 * 3.3 + 9.0 * 4.4) assert(Vectors.dense(result._1.collectElements) == expectedResult._1, "should return the correct product") assert(result._2 == expectedResult._2, "should return the correct product") } test("LinopMatrixVectorAdjoint multiplies properly") { var f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0)), new DenseVector(Array(6.0))), 2), 8.8) val result = f(y) val expectedResult = Vectors.dense(1 * 5 + 4 * 6 + 2.2, 2 * 5 + 5 * 6 + 3.3, 3 * 5 + 6 * 6 + 4.4) assert(result == expectedResult, "should return the correct product") } test("LinopMatrixVectorAdjoint checks for mismatched partition vectors") { val f = new LinopMatrixVectorAdjoint(matrix, vector) val y = (sc.parallelize(Array(new DenseVector(Array(5.0, 6.0)), Vectors.zeros(0).toDense), 2), 8.8) intercept[SparkException] { f(y) } } }
Example 71
Source File: LocalLDAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.clustering import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.{DataUtils, ParamUtils} import org.apache.spark.ml.clustering.{LocalLDAModel => SparkLocalLDA} import org.apache.spark.mllib.clustering.{LocalLDAModel => OldSparkLocalLDA} import org.apache.spark.mllib.linalg.{Matrices, Matrix, Vector, Vectors} import org.apache.spark.sql.SparkSession import DataUtils._ import scala.reflect.runtime.universe class LocalLDAModel(override val sparkTransformer: SparkLocalLDA) extends LocalTransformer[SparkLocalLDA] { lazy val oldModel: OldSparkLocalLDA = { val mirror = universe.runtimeMirror(sparkTransformer.getClass.getClassLoader) val parentTerm = universe.typeOf[SparkLocalLDA].decl(universe.TermName("oldLocalModel")).asTerm mirror.reflect(sparkTransformer).reflectField(parentTerm).get.asInstanceOf[OldSparkLocalLDA] } override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val newData = column.data.mapToMlLibVectors.map(oldModel.topicDistribution(_).toList) localData.withColumn( LocalDataColumn( sparkTransformer.getTopicDistributionCol, newData ) ) case None => localData } } } object LocalLDAModel extends SimpleModelLoader[SparkLocalLDA] with TypedTransformerConverter[SparkLocalLDA] { override def build(metadata: Metadata, data: LocalData): SparkLocalLDA = { val topics = DataUtils.constructMatrix( data.column("topicsMatrix").get.data.head.asInstanceOf[Map[String, Any]] ) val gammaShape = data.column("gammaShape").get.data.head.asInstanceOf[java.lang.Double] val topicConcentration = data.column("topicConcentration").get.data.head.asInstanceOf[java.lang.Double] val docConcentration = DataUtils.constructVector( data.column("docConcentration").get.data.head.asInstanceOf[Map[String, Any]] ) val vocabSize = data.column("vocabSize").get.data.head.asInstanceOf[java.lang.Integer] val oldLdaCtor = classOf[OldSparkLocalLDA].getDeclaredConstructor( classOf[Matrix], classOf[Vector], classOf[Double], classOf[Double] ) val oldLDA = oldLdaCtor.newInstance( Matrices.fromML(topics), Vectors.fromML(docConcentration), topicConcentration, gammaShape ) val ldaCtor = classOf[SparkLocalLDA].getDeclaredConstructor( classOf[String], classOf[Int], classOf[OldSparkLocalLDA], classOf[SparkSession] ) val lda = ldaCtor.newInstance(metadata.uid, vocabSize, oldLDA, null) ParamUtils.set(lda, lda.optimizer, metadata) ParamUtils.set(lda, lda.keepLastCheckpoint, metadata) ParamUtils.set(lda, lda.seed, metadata) ParamUtils.set(lda, lda.featuresCol, metadata) ParamUtils.set(lda, lda.learningDecay, metadata) ParamUtils.set(lda, lda.checkpointInterval, metadata) ParamUtils.set(lda, lda.learningOffset, metadata) ParamUtils.set(lda, lda.maxIter, metadata) ParamUtils.set(lda, lda.k, metadata) lda } override implicit def toLocal(sparkTransformer: SparkLocalLDA): LocalTransformer[SparkLocalLDA] = new LocalLDAModel(sparkTransformer) }
Example 72
Source File: MlLibOnKudu.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.etl.machinelearning.kudu import com.hadooparchitecturebook.taxi360.model.{NyTaxiYellowTrip, NyTaxiYellowTripBuilder} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.mllib.stat.Statistics import org.apache.spark.rdd.RDD import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object MlLibOnKudu { def main(args: Array[String]): Unit = { if (args.length == 0) { println("Args: <runLocal> " + "<kuduMaster> " + "<taxiTable> " + "<numOfCenters> " + "<numOfIterations> ") return } val runLocal = args(0).equalsIgnoreCase("l") val kuduMaster = args(1) val taxiTable = args(2) val numOfCenters = args(3).toInt val numOfIterations = args(4).toInt val sc: SparkContext = if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") new SparkContext(sparkConfig) } val sqlContext = new SQLContext(sc) val kuduOptions = Map( "kudu.table" -> taxiTable, "kudu.master" -> kuduMaster) sqlContext.read.options(kuduOptions).format("org.apache.kudu.spark.kudu").load. registerTempTable("ny_taxi_trip_tmp") //Vector val vectorRDD:RDD[Vector] = sqlContext.sql("select * from ny_taxi_trip_tmp").map(r => { val taxiTrip = NyTaxiYellowTripBuilder.build(r) generateVectorOnly(taxiTrip) }) println("--Running KMeans") val clusters = KMeans.train(vectorRDD, numOfCenters, numOfIterations) println(" > vector centers:") clusters.clusterCenters.foreach(v => println(" >> " + v)) println("--Running corr") val correlMatrix: Matrix = Statistics.corr(vectorRDD, "pearson") println(" > corr: " + correlMatrix.toString) println("--Running colStats") val colStats = Statistics.colStats(vectorRDD) println(" > max: " + colStats.max) println(" > count: " + colStats.count) println(" > mean: " + colStats.mean) println(" > min: " + colStats.min) println(" > normL1: " + colStats.normL1) println(" > normL2: " + colStats.normL2) println(" > numNonZeros: " + colStats.numNonzeros) println(" > variance: " + colStats.variance) //Labeled Points }
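To try the KMeans and column-statistics calls used above without a Kudu cluster, the following is a minimal self-contained sketch (it assumes an existing SparkContext named sc; the vector values are invented for illustration):
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.rdd.RDD

// two obvious clusters around (1, 1) and (8.5, 8.75)
val vectorRDD: RDD[Vector] = sc.parallelize(Seq(
  Vectors.dense(1.0, 1.0), Vectors.dense(1.2, 0.8),
  Vectors.dense(8.0, 9.0), Vectors.dense(9.0, 8.5)))
val clusters = KMeans.train(vectorRDD, 2, 10)        // k = 2, 10 iterations
clusters.clusterCenters.foreach(println)
println(clusters.predict(Vectors.dense(8.5, 8.8)))   // cluster id of a new point
println(Statistics.colStats(vectorRDD).mean)         // column means, as in the example above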
Example 73
Source File: get_labels_from_VT_signatures.scala From gsoc_relationship with Apache License 2.0 | 5 votes |
import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.KMeans import PreProcessingConfig._ case class VT_sample_label_rdd_class(sha256:String, label:Array[Double]) def OnehotEncode(number : Double): Array[Double]={ var Listnew = Array.iterate(0.0,kmeans_cluster_number)(a=>0.0) Listnew(number.toInt)=1 return Listnew } val VT_sample_signatures_final_array_rdd = spark.read.format("parquet").load(VT_sample_signatures_final_array_file).rdd.map(row => new VT_sample_signatures_final_array_rdd_class(row(0).toString,row(1).asInstanceOf[Seq[Double]].toArray)) val VT_sample_signatures_with_sha_rddvector = VT_sample_signatures_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results))) val VT_sample_signatures_rddvector = VT_sample_signatures_with_sha_rddvector.map(x=>x._2) val KMeans_Model = KMeans.train(VT_sample_signatures_rddvector,kmeans_cluster_number,30,2) val VT_sample_signatures_label_with_sha_rdd = VT_sample_signatures_with_sha_rddvector.map(x=>(x._1,KMeans_Model.predict(x._2))) val VT_sample_label_rdd = VT_sample_signatures_label_with_sha_rdd.map(x=>new VT_sample_label_rdd_class(x._1, OnehotEncode(x._2.toDouble))) VT_sample_label_rdd.toDF().write.format("parquet").save(VT_sample_label_file)
Example 74
Source File: X2PHelper.scala From spark-tsne with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib import breeze.linalg._ import breeze.numerics._ import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.MLUtils object X2PHelper { case class VectorWithNorm(vector: Vector, norm: Double) def fastSquaredDistance(v1: VectorWithNorm, v2: VectorWithNorm): Double = { MLUtils.fastSquaredDistance(v1.vector, v1.norm, v2.vector, v2.norm) } def Hbeta(D: DenseVector[Double], beta: Double = 1.0) : (Double, DenseVector[Double]) = { val P: DenseVector[Double] = exp(- D * beta) val sumP = sum(P) if(sumP == 0) { (0.0, DenseVector.zeros(D.size)) }else { val H = log(sumP) + (beta * sum(D :* P) / sumP) (H, P / sumP) } } }
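MLUtils.fastSquaredDistance is package-private to Spark's mllib, which is why this helper is declared inside the org.apache.spark.mllib package. If an exact public alternative is acceptable, Vectors.sqdist returns the plain squared Euclidean distance; a minimal sketch, not from the original project:
import org.apache.spark.mllib.linalg.Vectors

val v1 = Vectors.dense(1.0, 2.0, 3.0)
val v2 = Vectors.dense(4.0, 6.0, 3.0)
// exact squared distance; fastSquaredDistance trades a little precision for
// speed by reusing precomputed norms
println(Vectors.sqdist(v1, v2)) // 3*3 + 4*4 + 0*0 = 25.0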
Example 75
Source File: X2P.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg.DenseVector import org.apache.spark.mllib.X2PHelper._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.slf4j.LoggerFactory object X2P { private def logger = LoggerFactory.getLogger(X2P.getClass) def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { require(tol >= 0, "Tolerance must be non-negative") require(perplexity > 0, "Perplexity must be positive") val mu = (3 * perplexity).toInt //TODO: Expose this as parameter val logU = Math.log(perplexity) val norms = x.rows.map(Vectors.norm(_, 2.0)) norms.persist() val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } val neighbors = rowsWithNorm.zipWithIndex() .cartesian(rowsWithNorm.zipWithIndex()) .flatMap { case ((u, i), (v, j)) => if(i < j) { val dist = fastSquaredDistance(u, v) Seq((i, (j, dist)), (j, (i, dist))) } else Seq.empty } .topByKey(mu)(Ordering.by(e => -e._2)) val p_betas = neighbors.map { case (i, arr) => var betamin = Double.NegativeInfinity var betamax = Double.PositiveInfinity var beta = 1.0 val d = DenseVector(arr.map(_._2)) var (h, p) = Hbeta(d, beta) //logInfo("data was " + d.toArray.toList) //logInfo("array P was " + p.toList) // Evaluate whether the perplexity is within tolerance def Hdiff = h - logU var tries = 0 while (Math.abs(Hdiff) > tol && tries < 50) { //If not, increase or decrease precision if (Hdiff > 0) { betamin = beta beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 } else { betamax = beta beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 } // Recompute the values val HP = Hbeta(d, beta) h = HP._1 p = HP._2 tries = tries + 1 } //logInfo("array P is " + p.toList) (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) } logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) new CoordinateMatrix(p_betas.flatMap(_._1)) } }
Example 76
Source File: X2PSuite.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.SharedSparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.scalatest.{FunSuite, Matchers} class X2PSuite extends FunSuite with SharedSparkContext with Matchers { test("Test X2P against tsne.jl implementation") { val input = new RowMatrix( sc.parallelize(Seq(1 to 3, 4 to 6, 7 to 9, 10 to 12)) .map(x => Vectors.dense(x.map(_.toDouble).toArray)) ) val output = X2P(input, 1e-5, 2).toRowMatrix().rows.collect().map(_.toArray.toList) println(output.toList) //output shouldBe List(List(0, .5, .5), List(.5, 0, .5), List(.5, .5, .0)) } }
Example 77
Source File: BugDemonstrationTest.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} import org.apache.spark.sql.SparkSession import org.scalatest.{BeforeAndAfterAll, FunSuite, Matchers} class BugDemonstrationTest extends FunSuite with Matchers with BeforeAndAfterAll { private var sparkSession : SparkSession = _ override def beforeAll(): Unit = { super.beforeAll() sparkSession = SparkSession.builder().appName("BugTests").master("local[2]").getOrCreate() } override def afterAll(): Unit = { super.afterAll() sparkSession.stop() } test("This demonstrates a bug was fixed in tsne-spark 2.1") { val sc = sparkSession.sparkContext val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) val expectedMean = Vectors.dense(2.0,20.0,200.0) val resultMean = summary.mean assertEqualEnough(resultMean, expectedMean) val expectedVariance = Vectors.dense(1.0,100.0,10000.0) assertEqualEnough(summary.variance, expectedVariance) val expectedNumNonZeros = Vectors.dense(3.0, 3.0, 3.0) assertEqualEnough(summary.numNonzeros, expectedNumNonZeros) } private def assertEqualEnough(sample: Vector, expected: Vector): Unit = { expected.toArray.zipWithIndex.foreach{ case(d: Double, i: Int) => sample(i) should be (d +- 1E-12) } } }
Example 78
Source File: MNIST.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne.examples import java.io.{BufferedWriter, OutputStreamWriter} import com.github.saurfang.spark.tsne.impl._ import com.github.saurfang.spark.tsne.tree.SPTree import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.{SparkConf, SparkContext} import org.slf4j.LoggerFactory object MNIST { private def logger = LoggerFactory.getLogger(MNIST.getClass) def main (args: Array[String]) { val conf = new SparkConf() .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .registerKryoClasses(Array(classOf[SPTree])) val sc = new SparkContext(conf) val hadoopConf = sc.hadoopConfiguration val fs = FileSystem.get(hadoopConf) val dataset = sc.textFile("data/MNIST/mnist.csv.gz") .zipWithIndex() .filter(_._2 < 6000) .sortBy(_._2, true, 60) .map(_._1) .map(_.split(",")) .map(x => (x.head.toInt, x.tail.map(_.toDouble))) .cache() //logInfo(dataset.collect.map(_._2.toList).toList.toString) //val features = dataset.map(x => Vectors.dense(x._2)) //val scaler = new StandardScaler(true, true).fit(features) //val scaledData = scaler.transform(features) // .map(v => Vectors.dense(v.toArray.map(x => if(x.isNaN || x.isInfinite) 0.0 else x))) // .cache() val data = dataset.flatMap(_._2) val mean = data.mean() val std = data.stdev() val scaledData = dataset.map(x => Vectors.dense(x._2.map(v => (v - mean) / std))).cache() val labels = dataset.map(_._1).collect() val matrix = new RowMatrix(scaledData) val pcaMatrix = matrix.multiply(matrix.computePrincipalComponents(50)) pcaMatrix.rows.cache() val costWriter = new BufferedWriter(new OutputStreamWriter(fs.create(new Path(s".tmp/MNIST/cost.txt"), true))) //SimpleTSNE.tsne(pcaMatrix, perplexity = 20, maxIterations = 200) BHTSNE.tsne(pcaMatrix, maxIterations = 500, callback = { //LBFGSTSNE.tsne(pcaMatrix, perplexity = 10, maxNumIterations = 500, numCorrections = 10, convergenceTol = 1e-8) case (i, y, loss) => if(loss.isDefined) logger.info(s"$i iteration finished with loss $loss") val os = fs.create(new Path(s".tmp/MNIST/result${"%05d".format(i)}.csv"), true) val writer = new BufferedWriter(new OutputStreamWriter(os)) try { (0 until y.rows).foreach { row => writer.write(labels(row).toString) writer.write(y(row, ::).inner.toArray.mkString(",", ",", "\n")) } if(loss.isDefined) costWriter.write(loss.get + "\n") } finally { writer.close() } }) costWriter.close() sc.stop() } }
Example 79
Source File: FactorizationMachineCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel import com.ggstar.features.FeatureEngineering import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint} import org.apache.spark.sql.DataFrame class FactorizationMachineCtrModel extends BaseCtrModel { var _model:FMModel = _ def train(samples:DataFrame) : Unit = { //calculate inner product between item embedding and user embedding val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct) val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct) val formatSamples = preparedSamples.rdd.map( row =>{ new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures"))) }) _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15, miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1) } override def transform(samples:DataFrame):DataFrame = { val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples) val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct) _model.predict(preparedSamples) } }
Example 80
Source File: Preparator.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.PPreparator import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.sql.DataFrame import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.sql.SQLContext import org.apache.spark.mllib.linalg.Vectors class PreparedData( val rows: DataFrame, val dsp: DataSourceParams, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel ) extends Serializable class Preparator extends PPreparator[TrainingData, PreparedData] { def prepare(sc: SparkContext, trainingData: TrainingData): PreparedData = { val sqlContext = new SQLContext(sc) import sqlContext.implicits._ if (trainingData.dsp.useStandardScaler) { val training = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features") val scaler = new StandardScaler().setInputCol("features").setOutputCol("scaledFeatures").setWithStd(trainingData.dsp.standardScalerWithStd).setWithMean(trainingData.dsp.standardScalerWithMean) val scalerModel = scaler.fit(training) val scaledData = scalerModel.transform(training) val s1 = scaledData.select("label","censor","scaledFeatures").withColumnRenamed("scaledFeatures","features") //Prepare old StandardScaler val oldScaler = new org.apache.spark.mllib.feature.StandardScaler(withMean = trainingData.dsp.standardScalerWithMean, withStd = trainingData.dsp.standardScalerWithStd) val oldSSModel = oldScaler.fit(trainingData.rows.map(x=>(Vectors.dense(x._3)))) new PreparedData(rows = s1, dsp = trainingData.dsp, ssModel = oldSSModel) } else { new PreparedData(rows = trainingData.rows.map(x=>(x._1,x._2,Vectors.dense(x._3))).toDF("label", "censor", "features"), dsp = trainingData.dsp, ssModel = null) } } }
Example 81
Source File: SRAlgorithm.scala From pio-template-sr with Apache License 2.0 | 5 votes |
package org.template.sr import org.apache.predictionio.controller.P2LAlgorithm import org.apache.predictionio.controller.Params import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import grizzled.slf4j.Logger import org.apache.spark.mllib.linalg.{Vectors,DenseVector} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.regression.{AFTSurvivalRegression,AFTSurvivalRegressionModel} case class AlgorithmParams( val quantileProbabilities: Array[Double], val fitIntercept: Boolean, val maxIter: Int, val convTolerance: Double ) extends Params class SRModel( val aAFTSRModel: AFTSurvivalRegressionModel, val ssModel: org.apache.spark.mllib.feature.StandardScalerModel, val useStandardScaler: Boolean ) extends Serializable {} class SRAlgorithm(val ap: AlgorithmParams) extends P2LAlgorithm[PreparedData, SRModel, Query, PredictedResult] { @transient lazy val logger = Logger[this.type] def train(sc: SparkContext, data: PreparedData): SRModel = { println("Training SR model.") val aft = new AFTSurvivalRegression().setQuantileProbabilities(ap.quantileProbabilities).setQuantilesCol("quantiles").setFitIntercept(ap.fitIntercept).setMaxIter(ap.maxIter).setTol(ap.convTolerance) val model = aft.fit(data.rows) new SRModel(aAFTSRModel = model, ssModel=data.ssModel, useStandardScaler = data.dsp.useStandardScaler) } def predict(model: SRModel, query: Query): PredictedResult = { // val qryRow0 = Vectors.dense(query.features) val qryRow = if (model.useStandardScaler) { model.ssModel.transform(qryRow0) } else { qryRow0 } val score = model.aAFTSRModel.predict(qryRow) val quantilesVec = model.aAFTSRModel.predictQuantiles(qryRow) PredictedResult(coefficients = model.aAFTSRModel.coefficients.toArray, intercept = model.aAFTSRModel.intercept, scale = model.aAFTSRModel.scale, prediction = score, quantiles = quantilesVec.toArray) } }
Example 82
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim import java.util.Random import scala.language.implicitConversions import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace} import org.apache.spark.ml.optim.VectorRDDFunctions._ import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.mllib.random.RandomRDDs import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.storage.StorageLevel private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = { data.cartesian(dx).map { case (points, x) => val g = Vectors.zeros(x.size) points.foreach { case LabeledPoint(b, a) => val err = BLAS.dot(a, x) - b BLAS.axpy(err, a, g) } g }.treeSum() } def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]") val sc = new SparkContext(conf) sc.setCheckpointDir("/tmp/checkpoint") val n = 1000 val p = 100 val random = new Random(0L) val xExact = Vectors.dense(Array.fill(p)(random.nextDouble())) val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) => val random = new Random(100 + idx) part.map { v => val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian() LabeledPoint(target, v) } }.glom() .cache() val x = solve(data).first() println(s"x_exact = $xExact") println(s"x_vlbfgs = $x") sc.stop() } }
Example 83
Source File: KMeans.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.sql.functions._ def computeKMeans( pathToTextFile: String, quantity: Int, iteration: Int) { case class Point(x: Double, y: Double) def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-KMEANS-k${quantity}-i${iteration}.tsv" val points = sc .textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .map { row => Point(row(0).toDouble, row(1).toDouble) } val features = points .map { p => Vectors.dense(p.x, p.y) } features.cache() val kmeans = KMeans.train(features, quantity, iteration) val predictions = features .map { f => (f(0), f(1), kmeans.predict(f) + 1) } .collect save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => predictions.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }

Example 84
Source File: spark-latest.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "file:///data/mnist/mnist.scale") //val layers = Array[Int](780, 2500, 2000, 1500, 1000, 500, 10) val layers = Array[Int](780, 10) val trainer = new MultilayerPerceptronClassifier().setLayers(layers).setBlockSize(1000).setSeed(1234L).setMaxIter(1) for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = sqlContext.createDataFrame(dataPartitions.flatMap(x => sample)) parallelData.persist parallelData.count val t = System.nanoTime() val model = trainer.fit(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) parallelData.unpersist() }
Example 85
Source File: spark.scala From ann-benchmark with Apache License 2.0 | 5 votes |
import org.apache.log4j._ Logger.getRootLogger.setLevel(Level.OFF) import org.apache.spark.mllib.ann.{FeedForwardTrainer, FeedForwardTopology} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.classification.ANNClassifier // maximum number of worker nodes in cluster val numNodes = 5 // batch size, ~10K is good for GPU val batchSize = 1000 // number of iterations to run val numIterations = 5 val train = MLUtils.loadLibSVMFile(sc, "/mnist.scale") val topology = FeedForwardTopology.multiLayerPerceptron(Array[Int](780, 2500, 2000, 1500, 1000, 500, 10), false) val trainer = new FeedForwardTrainer(topology, 780, 10).setBatchSize(batchSize) trainer.SGDOptimizer.setNumIterations(numIterations).setMiniBatchFraction(1.0).setStepSize(0.03) // parallalize the data for N nodes, persist, run X iterations and print average time for each run for (i <- 1 to numNodes) { val dataPartitions = sc.parallelize(1 to i, i) val sample = train.sample(true, 1.0 / i, 11L).collect val parallelData = dataPartitions.flatMap(x => sample) parallelData.persist parallelData.count val t = System.nanoTime() val model = new ANNClassifier(trainer).train(parallelData) println(i + "\t" + batchSize + "\t" + (System.nanoTime() - t) / (numIterations * 1e9)) }
Example 86
Source File: IDFOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.IDFModel import org.apache.spark.ml.param.Param import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.Vectors class IDFOp extends SimpleSparkOp[IDFModel] { override val Model: OpModel[SparkBundleContext, IDFModel] = new OpModel[SparkBundleContext, IDFModel] { override val klazz: Class[IDFModel] = classOf[IDFModel] override def opName: String = Bundle.BuiltinOps.feature.idf override def store(model: Model, obj: IDFModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("idf", Value.vector(obj.idf.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): IDFModel = { val idfModel = new feature.IDFModel(Vectors.dense(model.value("idf").getTensor[Double].toArray)) new IDFModel(uid = "", idfModel = idfModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: IDFModel): IDFModel = { new IDFModel(uid = uid, idfModel = new feature.IDFModel(Vectors.dense(model.idf.toArray))) } override def sparkInputs(obj: IDFModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: IDFModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 87
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.clustering.KMeansModel import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.mllib.clustering import org.apache.spark.mllib.linalg.Vectors class KMeansOp extends SimpleSparkOp[KMeansModel] { override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] { override val klazz: Class[KMeansModel] = classOf[KMeansModel] override def opName: String = Bundle.BuiltinOps.clustering.k_means override def store(model: Model, obj: KMeansModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))). withValue("num_features", Value.long(obj.clusterCenters.head.size)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): KMeansModel = { val clusterCenters = model.value("cluster_centers"). getTensorList[Double].toArray. map(t => Vectors.dense(t.toArray)) val mllibModel = new clustering.KMeansModel(clusterCenters) new KMeansModel(uid = "", parentModel = mllibModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = { val clusterCenters = model.clusterCenters.map { case DenseVector(values) => Vectors.dense(values) case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values) } new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters)) } override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
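The DenseVector/SparseVector pattern match in sparkLoad above preserves the storage of each cluster center. In Spark 2.0+ the same ml-to-mllib conversion can also be written with Vectors.fromML, which likewise keeps dense and sparse representations; a small illustrative sketch, not part of the MLeap op:
import org.apache.spark.ml.linalg.{Vectors => MLVectors}
import org.apache.spark.mllib.linalg.{Vector, Vectors}

val mlVec = MLVectors.sparse(3, Array(1), Array(4.2))
val mllibVec: Vector = Vectors.fromML(mlVec) // stays sparse
println(mllibVec) // (3,[1],[4.2])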
Example 88
Source File: SparkNodeWrapper.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering.bundle.tree.clustering import ml.bundle.ctree.Node import ml.combust.bundle.tree.cluster.NodeWrapper import org.apache.spark.mllib.clustering.{ClusteringTreeNode, VectorWithNorm} import org.apache.spark.mllib.linalg.Vectors object SparkNodeWrapper extends NodeWrapper[ClusteringTreeNode] { override def node(n: ClusteringTreeNode): Node = { Node(index = n.index, norm = n.centerWithNorm.norm, values = n.centerWithNorm.vector.toArray.toSeq, numChildren = n.children.length) } override def children(n: ClusteringTreeNode): Array[ClusteringTreeNode] = n.children override def create(node: Node, children: Seq[ClusteringTreeNode]): ClusteringTreeNode = { new ClusteringTreeNode(index = node.index, size = 0, centerWithNorm = new VectorWithNorm(Vectors.dense(node.values.toArray), node.norm), cost = 0.0, height = 0, children = children.toArray) } }
Example 89
Source File: SupportVectorMachineOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.extension.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors class SupportVectorMachineOp extends SimpleSparkOp[SVMModel] { override val Model: OpModel[SparkBundleContext, SVMModel] = new OpModel[SparkBundleContext, SVMModel] { override val klazz: Class[SVMModel] = classOf[SVMModel] override def opName: String = Bundle.BuiltinOps.classification.support_vector_machine override def store(model: Model, obj: SVMModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("coefficients", Value.vector(obj.model.weights.toArray)). withValue("intercept", Value.double(obj.model.intercept)). withValue("num_classes", Value.long(2)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): SVMModel = { if(model.value("num_classes").getLong != 2) { throw new IllegalArgumentException("only binary logistic regression supported in Spark") } val weights = Vectors.dense(model.value("coefficients").getTensor[Double].toArray) val svm = new org.apache.spark.mllib.classification.SVMModel( weights = weights, intercept = model.value("intercept").getDouble ) val svmModel = new SVMModel(uid = "", model = svm) model.getValue("thresholds"). map(t => svmModel.setThresholds(t.getDoubleList.toArray)). getOrElse(svmModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: SVMModel): SVMModel = { val m = new SVMModel(uid = uid, model = model.model) if(model.isDefined(model.thresholds)) { m.setThresholds(model.getThresholds) } m } override def sparkInputs(obj: SVMModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: SVMModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 90
Source File: SupportVectorMachineParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.parity.classification import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.mleap.classification.SVMModel import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.mllib import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql._ class SupportVectorMachineParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti", "approved") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new SVMModel(uid = "svm", model = new mllib.classification.SVMModel(weights = Vectors.dense(0.53, 0.67), intercept = 0.77)). setRawPredictionCol("raw_prediction"). setProbabilityCol("probability"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 91
Source File: HivemallUtils.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors} import org.apache.spark.sql.catalyst.expressions.Literal import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{Column, DataFrame, Row, UserDefinedFunction} object HivemallUtils { // # of maximum dimensions for feature vectors val maxDims = 100000000 def funcVectorizer(dense: Boolean = false, dims: Int = maxDims) : UserDefinedFunction = { udf(funcVectorizerImpl(dense, dims)) } private def funcVectorizerImpl(dense: Boolean, dims: Int) : Seq[String] => Vector = { if (dense) { // Dense features i: Seq[String] => { val features = new Array[Double](dims) i.map { ft => val s = ft.split(":").ensuring(_.size == 2) features(s(0).toInt) = s(1).toDouble } Vectors.dense(features) } } else { // Sparse features i: Seq[String] => { val features = i.map { ft => // val s = ft.split(":").ensuring(_.size == 2) val s = ft.split(":") (s(0).toInt, s(1).toDouble) } Vectors.sparse(dims, features) } } } }
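The sparse branch above uses the Vectors.sparse overload that takes (index, value) pairs, while the dense branch writes into a fixed-size array. A minimal sketch of the two constructors with a made-up "index:value" feature sequence (hypothetical values, not from Hivemall):
import org.apache.spark.mllib.linalg.Vectors

val raw = Seq("0:1.5", "3:2.0")                    // "index:value" features
val pairs = raw.map { ft =>
  val Array(i, v) = ft.split(":")
  (i.toInt, v.toDouble)
}
val sparse = Vectors.sparse(4, pairs)              // Seq[(Int, Double)] overload
val dense = Vectors.dense(sparse.toArray)          // same contents, dense storage
println(sparse) // (4,[0,3],[1.5,2.0])
println(dense)  // [1.5,0.0,0.0,2.0]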
Example 92
Source File: L9-3Statistics.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object StatisticsApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: StatisticsApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Vectors.dense(f.slice(1, 5))).foreachRDD(rdd => { val stats = Statistics.colStats(rdd) println("Count: " + stats.count) println("Max: " + stats.max.toArray.mkString(" ")) println("Min: " + stats.min.toArray.mkString(" ")) println("Mean: " + stats.mean.toArray.mkString(" ")) println("L1-Norm: " + stats.normL1.toArray.mkString(" ")) println("L2-Norm: " + stats.normL2.toArray.mkString(" ")) println("Number of non-zeros: " + stats.numNonzeros.toArray.mkString(" ")) println("Variance: " + stats.variance.toArray.mkString(" ")) }) ssc.start() ssc.awaitTermination() } }
Example 93
Source File: L9-7FeatureExtraction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.ChiSqSelector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object FeatureExtractionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: FeatureExtractionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length).map(f => f / 2048)))) datastream.foreachRDD(rdd => { val selector = new ChiSqSelector(5) val model = selector.fit(rdd) val filtered = rdd.map(p => LabeledPoint(p.label, model.transform(p.features))) filtered.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 94
Source File: L9-9LogisticRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext import org.apache.spark.mllib.classification.StreamingLogisticRegressionWithSGD object LogisticRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LogisticRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = walkingOrRunning.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = walkingOrRunning.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLogisticRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 95
Source File: L9-1LinearRegression.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object LinearRegressionApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: LinearRegressionApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(2).toDouble, f(3).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) val test = datastream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = datastream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(4)) .setStepSize(0.0001) .setNumIterations(1) model.trainOn(train) model.predictOnValues(test.map(v => (v.label, v.features))).foreachRDD(rdd => println("MSE: %f".format(rdd .map(v => math.pow((v._1 - v._2), 2)).mean()))) ssc.start() ssc.awaitTermination() } }
Example 96
Source File: T9-4DataTypes.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Matrices import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.MatrixEntry import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object DataTypesApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: DataTypesApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val denseV = substream.map(f => Vectors.dense(f.slice(1, 5))) denseV.print() val sparseV = substream.map(f => f.slice(1, 5).toList).map(f => f.zipWithIndex.map { case (s, i) => (i, s) }) .map(f => f.filter(v => v._2 != 0)).map(l => Vectors.sparse(l.size, l)) sparseV.print() val labeledP = substream.map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) labeledP.print() val denseM = substream.map(f => Matrices.dense(3, 16, f.slice(3, 19) ++ f.slice(20, 36) ++ f.slice(37, 53))) denseM.print() denseV.foreachRDD(rdd => { val rowM = new RowMatrix(rdd) println(rowM) }) denseV.foreachRDD(rdd => { val iRdd = rdd.zipWithIndex.map(v => new IndexedRow(v._2, v._1)) val iRowM = new IndexedRowMatrix(iRdd) println(iRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val cRowM = new CoordinateMatrix(entries) println(cRowM) }) substream.foreachRDD(rdd => { val entries = rdd.zipWithIndex.flatMap(v => List(3, 20, 37).zipWithIndex.map(i => (i._2.toLong, v._2, v._1.slice(i._1, i._1 + 16).toList))) .map(v => v._3.map(d => new MatrixEntry(v._1, v._2, d))).flatMap(x => x) val blockM = new CoordinateMatrix(entries).toBlockMatrix println(blockM) }) ssc.start() ssc.awaitTermination() } }
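A small caveat on the sparse-vector line above: Vectors.sparse expects the full vector dimension as its first argument, whereas l.size here is the number of (index, value) pairs left after dropping zeros, so the declared size can end up smaller than the largest index whenever one of the four sliced values is zero (recent Spark versions reject that with a require failure). A minimal corrected sketch with made-up values:
import org.apache.spark.mllib.linalg.Vectors

val values = Array(0.0, 2.5, 0.0, 7.1)             // four features, two of them zero
val pairs = values.zipWithIndex
  .collect { case (v, i) if v != 0.0 => (i, v) }   // keep (index, value) of non-zeros
val sv = Vectors.sparse(values.length, pairs)      // size = full dimension, not pairs.length
println(sv) // (4,[1,3],[2.5,7.1])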
Example 97
Source File: L9-5ChiSq.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object ChiSqApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: ChiSqApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) .filter(f => f(0) == 4.0 || f(0) == 5.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) .foreachRDD(rdd => { Statistics.chiSqTest(rdd).zipWithIndex.foreach(v => println("%s, column no. %d".format(v._1, v._2))) }) ssc.start() ssc.awaitTermination() } }
Example 98
Source File: L9-4Correlation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.stat.Statistics import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object CorrelationApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: CorrelationApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") .map(f => f.map(f => f.toDouble)) val datastream = substream.map(f => Array(f(1).toDouble, f(2).toDouble, f(4).toDouble, f(5).toDouble, f(6).toDouble)) val walkingOrRunning = datastream.filter(f => f(0) == 4.0 || f(0) == 5.0).map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, 5)))) walkingOrRunning.map(f => f.features).foreachRDD(rdd => { val corrSpearman = Statistics.corr(rdd, "spearman") val corrPearson = Statistics.corr(rdd, "pearson") println("Correlation Spearman: \n" + corrSpearman) println("Correlation Pearson: \n" + corrPearson) }) ssc.start() ssc.awaitTermination() } }
Example 99
Source File: L9-6Preprocessing.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.StandardScaler import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PreprocessingApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PreprocessingApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") substream.map(f => Array(f(2), f(4), f(5), f(6))) .map(f => f.map(v => v.toDouble)) .map(f => Vectors.dense(f)) .foreachRDD(rdd => { val scalerModel = new StandardScaler().fit(rdd) val scaledRDD = scalerModel.transform(rdd) }) ssc.start() ssc.awaitTermination() } }
Example 100
Source File: L9-8PCA.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object PCAApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: PCAApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val datastream = substream.map(f => Array(f(1), f(4), f(5), f(6), f(20), f(21), f(22), f(36), f(37), f(38))) .map(f => f.map(v => v.toDouble)) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) datastream.foreachRDD(rdd => { val pca = new PCA(rdd.first().features.size / 2) .fit(rdd.map(_.features)) val testTrain = rdd.randomSplit(Array(0.3, 0.7)) val test = testTrain(0).map(lp => lp.copy(features = pca.transform(lp.features))) val train = testTrain(1).map(lp => lp.copy(features = pca.transform(lp.features))) train.take(20).foreach(println) }) ssc.start() ssc.awaitTermination() } }
Example 101
Source File: L9-10KMeans.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.rdd.RDD.doubleRDDToDoubleRDDFunctions import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.StreamingContext object KMeansClusteringApp { def main(args: Array[String]) { if (args.length != 4) { System.err.println( "Usage: KMeansClusteringApp <appname> <batchInterval> <hostname> <port>") System.exit(1) } val Seq(appName, batchInterval, hostname, port) = args.toSeq val conf = new SparkConf() .setAppName(appName) .setJars(SparkContext.jarOfClass(this.getClass).toSeq) val ssc = new StreamingContext(conf, Seconds(batchInterval.toInt)) val substream = ssc.socketTextStream(hostname, port.toInt) .filter(!_.contains("NaN")) .map(_.split(" ")) .filter(f => f(1) != "0") val orientationStream = substream .map(f => Seq(1, 4, 5, 6, 10, 11, 12, 20, 21, 22, 26, 27, 28, 36, 37, 38, 42, 43, 44).map(i => f(i)).toArray) .map(arr => arr.map(_.toDouble)) .filter(f => f(0) == 1.0 || f(0) == 2.0 || f(0) == 3.0) .map(f => LabeledPoint(f(0), Vectors.dense(f.slice(1, f.length)))) val test = orientationStream.transform(rdd => rdd.randomSplit(Array(0.3, 0.7))(0)) val train = orientationStream.transformWith(test, (r1: RDD[LabeledPoint], r2: RDD[LabeledPoint]) => r1.subtract(r2)).cache() val model = new StreamingKMeans() .setK(3) .setDecayFactor(0) .setRandomCenters(18, 0.0) model.trainOn(train.map(v => v.features)) val prediction = model.predictOnValues(test.map(v => (v.label, v.features))) ssc.start() ssc.awaitTermination() } }
Example 102
Source File: value_model.scala From Spark_Personas with MIT License | 5 votes |
// Load the RFM input table (lenovo_id, monetary, frequency, recency).
// Note: the repeated `val result` definitions below assume execution in the spark-shell/REPL, where redefining a val is allowed.
val input_df = hiveContext.sql("select t.lenovo_id,t.monetary,cast(t.frequency as int) as frequency,t.recency from model_input_rfm_t t")
val row_nums = input_df.count.toInt   // total number of rows
val row_partition = row_nums / 5      // quintile partition point
val row_partition6 = row_nums / 6     // sextile partition point
val input_sort_monetary = input_df.sort($"monetary".desc).collect()
val input_sort_frequency = input_df.sort($"frequency".desc).collect() //wrong
val input_sort_recency = input_df.sort($"recency".desc).collect()
// monetary cut points
val monetary_1 = input_sort_monetary(row_partition * 1).get(1).asInstanceOf[Number].intValue
val monetary_2 = input_sort_monetary(row_partition * 2).get(1).asInstanceOf[Number].intValue
val monetary_3 = input_sort_monetary(row_partition * 3).get(1).asInstanceOf[Number].intValue
val monetary_4 = input_sort_monetary(row_partition * 4).get(1).asInstanceOf[Number].intValue
// frequency cut points
val frequency_1 = input_sort_frequency(row_partition * 1).get(2).asInstanceOf[Integer].toInt
val frequency_2 = input_sort_frequency(row_partition * 2).get(2).asInstanceOf[Integer].toInt
val frequency_3 = input_sort_frequency(row_partition * 3).get(2).asInstanceOf[Integer].toInt
val frequency_4 = input_sort_frequency(row_partition * 4).get(2).asInstanceOf[Integer].toInt
// recency cut points (yyyy-MM-dd reformatted to yyyyMMdd)
val result = input_sort_recency(row_partition6 * 1).get(3).asInstanceOf[String].toString
val recency_1 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 2).get(3).asInstanceOf[String].toString
val recency_2 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 3).get(3).asInstanceOf[String].toString
val recency_3 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 4).get(3).asInstanceOf[String].toString
val recency_4 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
val result = input_sort_recency(row_partition6 * 5).get(3).asInstanceOf[String].toString
val recency_5 = result.substring(0, 4) + result.substring(5, 7) + result.substring(8, 10)
// Score monetary into 1-5 points.
val io_monetary = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.monetary > "+monetary_1+" then 5 when t2.monetary >"+monetary_2+" then 4 when t2.monetary > "+monetary_3+" then 3 when t2.monetary >"+monetary_4+" then 2 else 1 end) as points, ' ', ' ' from model_input_rfm_t t2)")
io_monetary.registerTempTable("temporary_monetary") // monetary temp table
// Add the frequency score (tens digit) on top of the monetary score.
val io_frequency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,(case when t2.frequency> "+frequency_1+" then (50+t3.points) when t2.frequency>"+frequency_2+" then (40+t3.points) when t2.frequency> "+frequency_3+" then (30+t3.points) when t2.frequency>"+frequency_4+" then (20+t3.points) else (10+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_monetary t3 where t2.lenovo_id = t3.lenovo_id)")
io_frequency.registerTempTable("temporary_frequency") // frequency temp table
// Normalization
val result = hiveContext.sql("select max(cast(frequency as int)) from model_input_rfm_t") // maximum frequency
val max_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val result = hiveContext.sql("select min(cast(frequency as int)) from temporary_frequency") // minimum frequency
val min_frequency = result.collect()(0).get(0).asInstanceOf[Integer].toInt
val region_frequency = max_frequency - min_frequency
val result = hiveContext.sql("select max(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val max_recency = result.collect()(0).get(0).asInstanceOf[Long] // latest time
val result = hiveContext.sql("select min(unix_timestamp(concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2)),'yyyyMMdd')) from temporary_frequency t2")
val min_recency = result.collect()(0).get(0).asInstanceOf[Long] // earliest time
val region_recency = max_recency - min_recency // recency range
val result = hiveContext.sql("select max(monetary) from model_input_rfm_t")
val max_monetary = result.collect()(0).get(0).asInstanceOf[Float] // maximum monetary value
//val result = hiveContext.sql("select min(monetary) from model_input_rfm_t")
//val min_monetary = result.collect()(0).get(0).asInstanceOf[Float] // minimum monetary value
val min_monetary = 0
val region_monetary = max_monetary - min_monetary // monetary range
// Add the recency score (hundreds digit) and normalize the three dimensions to [0, 1].
val io_recency = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, ((t2.frequency - "+min_frequency+")/" + region_frequency + ") as frequency, ((t2.monetary - "+min_monetary+") /" + region_monetary+") as monetary, ((unix_timestamp(t2.recency,'yyyy-MM-dd')- "+min_recency+") / " + region_recency + ") as recency,(case when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_1+" then (600+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_2+" then (500+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))> "+recency_3+" then (400+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_4+" then (300+t3.points) when concat(substring(t2.recency,0,4),substring(t2.recency,6,2),substring(t2.recency,9,2))>"+recency_5+" then (200+t3.points) else (100+t3.points) end) as points, ' ', ' ' from model_input_rfm_t t2,temporary_frequency t3 where t2.lenovo_id = t3.lenovo_id)")
io_recency.registerTempTable("temporary_recency") // recency temp table
// Clustering
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.{SparkConf, SparkContext}
// Convert the DataFrame to an RDD directly with io_recency.rdd.
val parsedData = io_recency.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)) //.cache()
val numClusters = 8
val numIterations = 20
val model = KMeans.train(parsedData, numClusters, numIterations)
model.clusterCenters.foreach(println)
val WSSSE = model.computeCost(parsedData)
println("Within Set Sum of Squared Errors = " + WSSSE)
val insertData = io_recency.rdd.map(s => Vectors.dense(s.get(0).asInstanceOf[String].toLong, s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble, s.get(4).asInstanceOf[Integer].toInt, ' ', model.predict(Vectors.dense(s.get(1).asInstanceOf[String].toDouble, s.get(2).asInstanceOf[Double], s.get(3).asInstanceOf[String].toDouble)))) //.cache()
import spark.implicits._
case class Cluster(lenovo_id: Long, frequency: Double, monetary: Double, recency: Double, points: Double, flag: Double, cluster: Double)
val rdd_df = insertData.map(attributes => Cluster(attributes(0).toLong, attributes(1).toDouble, attributes(2).toDouble, attributes(3).toDouble, attributes(4).toDouble, attributes(5).toDouble, attributes(6).toDouble)).toDF()
rdd_df.registerTempTable("temporary_cluster")
hiveContext.sql("insert overwrite table userfigure_local.model_output_rfm_t partition (l_day='2016-10-01') select * from temporary_cluster")
val io_cluster = hiveContext.sql("(select t1.lenovo_id, t1.frequency, t1.monetary, t1.recency, t1.points, t1.flag, t1.cluster from model_output_rfm_t t1 where 1=0 ) union all (select t2.lenovo_id, t2.frequency, t2.monetary, t2.recency,t2.points, t2.flag,t2.cluster from temporary_cluster t2)")
io_cluster.registerTempTable("temporary_io_cluster") // the original selected from an undefined "table1"; insert from the registered union instead
hiveContext.sql("insert into model_output_rfm_t partition(l_day='2016-10-01') select * from temporary_io_cluster")
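The cut points above are found by collecting and sorting the whole table on the driver, which does not scale. If this job were moved to Spark 2.0+, DataFrame.stat.approxQuantile could compute them on the executors instead. A hedged sketch, reusing the input_df above; the variable names and the 0.001 relative error are assumptions, not part of the original job:

// A minimal sketch, assuming Spark 2.0+ and the input_df defined above.
val Array(monetaryQ1, monetaryQ2, monetaryQ3, monetaryQ4) =
  input_df.stat.approxQuantile("monetary", Array(0.2, 0.4, 0.6, 0.8), 0.001)
val frequencyCuts =
  input_df.stat.approxQuantile("frequency", Array(0.2, 0.4, 0.6, 0.8), 0.001)
println(s"monetary cut points: $monetaryQ1, $monetaryQ2, $monetaryQ3, $monetaryQ4")
println(s"frequency cut points: ${frequencyCuts.mkString(", ")}")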
Example 103
Source File: activity_model.scala From Spark_Personas with MIT License | 5 votes |
// Normalization
// Note: repeated `val result` definitions assume execution in the spark-shell.
val result = h.sql("select max(visit_times) from model_input_active_t") // maximum visit count
val max_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_times) from model_input_active_t") // minimum visit count
val min_visit_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_times = if ((max_visit_times - min_visit_times) == 0) 1 else (max_visit_times - min_visit_times)
val result = h.sql("select max(last_online_time) from model_input_active_t") // longest time since last login
val max_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(last_online_time) from model_input_active_t") // shortest time since last login
val min_last_online_time = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_last_online_time = if ((max_last_online_time - min_last_online_time) == 0) 1 else (max_last_online_time - min_last_online_time)
val result = h.sql("select max(pay_times) from model_input_active_t") // maximum payment count
val max_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(pay_times) from model_input_active_t") // minimum payment count
val min_pay_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_pay_times = if ((max_pay_times - min_pay_times) == 0) 1 else (max_pay_times - min_pay_times)
val result = h.sql("select max(comment_times) from model_input_active_t") // maximum inquiry/comment count
val max_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(comment_times) from model_input_active_t") // minimum inquiry/comment count
val min_comment_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_comment_times = if ((max_comment_times - min_comment_times) == 0) 1 else (max_comment_times - min_comment_times)
val result = h.sql("select max(stay_time) from model_input_active_t") // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = h.sql("select min(stay_time) from model_input_active_t") // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = if ((max_stay_time - min_stay_time) == 0) 1 else (max_stay_time - min_stay_time)
val result = h.sql("select max(visit_day_times) from model_input_active_t") // maximum number of active days
val max_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val result = h.sql("select min(visit_day_times) from model_input_active_t") // minimum number of active days
val min_visit_day_times = result.collect()(0).get(0).asInstanceOf[Int].toDouble
val region_visit_day_times = if ((max_visit_day_times - min_visit_day_times) == 0) 1 else (max_visit_day_times - min_visit_day_times)
// Weights: visit_times 0.2, visit_targetpage_percen 0.1, last_online_time 0.1, pay_times 0.2, comment_times 0.2, stay_time 0.1, visit_day_times 0.1
val normalization = h.sql("select t1.cookie , ((t1.visit_times- "+min_visit_times+")*0.2/"+region_visit_times+") as visit_times, t1.visit_targetpage_percen*0.1, ((t1.last_online_time- "+min_last_online_time+")*0.1/"+region_last_online_time+") as last_online_time, ((t1.pay_times- "+min_pay_times+")*0.2/"+region_pay_times+") as pay_times, ((t1.comment_times- "+min_comment_times+")*0.2/"+region_comment_times+") as comment_times, ((t1.stay_time- "+min_stay_time+")*0.1/"+region_stay_time+") as stay_time, ((t1.visit_day_times- "+min_visit_day_times+")*0.1/"+region_visit_day_times+") as visit_day_times from model_input_active_t t1")
import org.apache.spark.mllib.linalg.Matrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
// There is no direct DataFrame-to-Vectors API; convert the DataFrame to an RDD and build each row with Vectors.dense.
val data = normalization.rdd.map(line => Vectors.dense(line.get(1).toString.toDouble, line.get(2).toString.toDouble, line.get(3).toString.toDouble, line.get(4).toString.toDouble, line.get(5).toString.toDouble, line.get(6).toString.toDouble, line.get(7).toString.toDouble))
val rm = new RowMatrix(data)
val pc = rm.computePrincipalComponents(1)
val mx = rm.multiply(pc)
// to be continued
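Since the example stops at the projection ("to be continued"), here is a self-contained sketch of the same RowMatrix PCA pattern on toy vectors. The SparkContext named sc and the toy values are assumptions for illustration only:

// Toy-data sketch of computePrincipalComponents + multiply (assumes a SparkContext named sc).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val toyRows = sc.parallelize(Seq(
  Vectors.dense(0.2, 0.1, 0.0, 0.4),
  Vectors.dense(0.1, 0.3, 0.2, 0.1),
  Vectors.dense(0.4, 0.2, 0.1, 0.3)))
val toyMatrix = new RowMatrix(toyRows)
val topComponent = toyMatrix.computePrincipalComponents(1) // 4 x 1 local matrix
val projected = toyMatrix.multiply(topComponent)           // each row reduced to one coordinate
projected.rows.collect().foreach(println)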
Example 104
Source File: loyalty_model.scala From Spark_Personas with MIT License | 5 votes |
// Normalization
// Note: repeated `val result` definitions assume execution in the spark-shell.
val result = hiveContext.sql("select max(login_times) from model_input_loyal_t") // maximum login count
val max_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(login_times) from model_input_loyal_t") // minimum login count
val min_login_times = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_login_times = max_login_times - min_login_times
val result = hiveContext.sql("select max(stay_time) from model_input_loyal_t") // maximum stay time
val max_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val result = hiveContext.sql("select min(stay_time) from model_input_loyal_t") // minimum stay time
val min_stay_time = result.collect()(0).get(0).asInstanceOf[Float].toDouble
val region_stay_time = max_stay_time - min_stay_time
val result = hiveContext.sql("select max(view_days) from model_input_loyal_t") // maximum number of view days
val max_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(view_days) from model_input_loyal_t") // minimum number of view days
val min_view_days = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_view_days = max_view_days - min_view_days
val result = hiveContext.sql("select max(pv) from model_input_loyal_t") // maximum page views
val max_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val result = hiveContext.sql("select min(pv) from model_input_loyal_t") // minimum page views
val min_pv = result.collect()(0).get(0).asInstanceOf[Long].toDouble
val region_pv = max_pv - min_pv
val result = hiveContext.sql("select max(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val max_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // latest view time
val result = hiveContext.sql("select min(unix_timestamp(t2.last_viewtime,'yyyy-MM-dd')) from model_input_loyal_t t2")
val min_last_viewtime = result.collect()(0).get(0).asInstanceOf[Long].toDouble // earliest view time
val region_last_viewtime = max_last_viewtime - min_last_viewtime
// Weights: login_times 0.2, stay_time 0.3, view_days 0.3, pv 0.15, last_viewtime 0.05
val normalization = hiveContext.sql("select t1.cookie , (((t1.login_times - "+min_login_times+") * 0.2/"+region_login_times+") + ((t1.stay_time- "+min_stay_time+") * 0.3/"+region_stay_time+") +((t1.view_days - "+min_view_days+")* 0.3/"+region_view_days+") +((t1.pv - "+min_pv+")* 0.15/"+region_pv+") +((unix_timestamp(t1.last_viewtime,'yyyy-MM-dd')- "+min_last_viewtime+")*0.05 / " + region_last_viewtime + "))*100 as loyalty_score from model_input_loyal_t t1")
normalization.registerTempTable("temporary_points") // normalized-score temp table
import org.apache.spark.mllib.stat.Statistics
import org.apache.spark.mllib.linalg.Vectors
val rdd = normalization.rdd.map(s => Vectors.dense(s.get(1).asInstanceOf[Double].toDouble))
val summary = Statistics.colStats(rdd)
println(summary.mean)
val means = summary.mean(0)
println(summary.variance)
val standard_deviation = math.sqrt(summary.variance(0)) // colStats reports the variance; take the square root for the standard deviation
// Keep scores within five standard deviations of the mean; because the mean is small the lower bound may go negative, so clamp it at 0 and leave the upper bound unchanged.
val r = means - standard_deviation * 5
val low_bound = if (r > 0) r else 0
val up_bound = means + standard_deviation * 5
// Loyalty levels (the original Chinese labels were garbled in the source; English equivalents are used here).
val loyalty_temporary = hiveContext.sql("(select t1.lenovo_id,t1.loyalty_score,t1.loyalty_level from model_output_loyal_t t1 where 1=0) union all (select t2.cookie, t2.loyalty_score,(case when t2.loyalty_score <= "+low_bound+" then 'low' when t2.loyalty_score < "+up_bound+" then 'medium' else 'high' end) as loyalty_level from temporary_points t2)")
loyalty_temporary.registerTempTable("temporary_loyalty")
hiveContext.sql("insert overwrite table data.model_output_loyal_t partition (l_day='2016-10-01') select * from temporary_loyalty")
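The mean-plus-or-minus-k-standard-deviations bound used above can be exercised on toy scores without the Hive tables. A minimal sketch; the SparkContext named sc and the toy values are assumptions:

// Toy sketch of the mean +/- 5 * stddev bounds (assumes a SparkContext named sc).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val toyScores = sc.parallelize(Seq(12.0, 35.0, 47.0, 55.0, 61.0, 88.0).map(v => Vectors.dense(v)))
val stats = Statistics.colStats(toyScores)
val mean = stats.mean(0)
val stddev = math.sqrt(stats.variance(0)) // colStats returns the variance, so take the square root
val lower = math.max(mean - 5 * stddev, 0.0)
val upper = mean + 5 * stddev
println(s"mean=$mean stddev=$stddev bounds=[$lower, $upper]")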
Example 105
Source File: LibLinAlg.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.liblinear import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import de.bwaldvogel.liblinear.Feature import de.bwaldvogel.liblinear.FeatureNode import de.bwaldvogel.liblinear.Linear import de.bwaldvogel.liblinear.Parameter import de.bwaldvogel.liblinear.Problem import de.bwaldvogel.liblinear.SolverType import se.uu.farmbio.cp.UnderlyingAlgorithm import se.uu.farmbio.cp.Deserializer object LibLinAlg { private def vectorToFeatures(v: Vector) = { val indices = v.toSparse.indices val values = v.toSparse.values indices .zip(values) .sortBy { case (i, v) => i } .map { case (i, v) => new FeatureNode(i + 1, v) .asInstanceOf[Feature] } } private def train( input: Array[LabeledPoint], solverType: SolverType, c: Double, tol: Double) = { //configure problem val problem = new Problem problem.l = input.length problem.n = input(0).features.size problem.x = input.map { p => vectorToFeatures(p.features) } problem.y = input.map(_.label + 1.0) problem.bias = -1.0 //train val parameter = new Parameter(solverType, c, tol) val libLinModel = Linear.train(problem, parameter) //convert to Spark SVMModel val weights = libLinModel.getFeatureWeights val intercept = libLinModel.getBias val svmModel = new SVMModel(Vectors.dense(weights).toSparse, intercept) svmModel.clearThreshold svmModel } } object LibLinAlgDeserializer extends Deserializer[LibLinAlg] { override def deserialize(alg: String) = { val splitted = alg.split(",", 2) val intercept = splitted(0) val weights = splitted(1) val model = new SVMModel(Vectors.parse(weights).toSparse, intercept.toDouble) model.clearThreshold() new LibLinAlg(model) } } class LibLinAlg( val svmModel: SVMModel) extends UnderlyingAlgorithm( (features: Vector) => svmModel.predict(features)) { def this( training: Array[LabeledPoint], solverType: SolverType, regParam: Double, tol: Double) = { this(LibLinAlg.train(training, solverType, regParam, tol)) } override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { score } else { -score } } override def toString = { this.svmModel.intercept + "," + this.svmModel.weights.toString } }
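LibLinAlgDeserializer expects the "intercept,weights" text that toString produces, so the two halves round-trip through Vectors.parse. A small sketch with toy weights (no liblinear dependency needed); the values are illustrative only:

// Round-trip sketch of the "intercept,weights" serialization format used above (toy values).
import org.apache.spark.mllib.classification.SVMModel
import org.apache.spark.mllib.linalg.Vectors

val original = new SVMModel(Vectors.dense(0.5, -1.2, 3.0).toSparse, 0.25)
val serialized = original.intercept + "," + original.weights.toString
val Array(interceptText, weightsText) = serialized.split(",", 2) // split only at the first comma
val restored = new SVMModel(Vectors.parse(weightsText).toSparse, interceptText.toDouble)
println(restored.weights + " / " + restored.intercept)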
Example 106
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.SVMModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.HingeGradient import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a SVMs UnderlyingAlgorithm private object SVM { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double) = { //Train SVM with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new HingeGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new SVMModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model } } class SVM(val model: SVMModel) extends UnderlyingAlgorithm(model.predict) { def this( input: RDD[LabeledPoint], maxNumItearations: Int = 100, regParam: Double = 0.1, numCorrections: Int = 10, convergenceTol: Double = 1e-4) = { this(SVM.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) } def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { -score } else { score } } }
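For reference, the same LBFGS call used in trainingProcedure can be exercised at toy scale. The two data points, the SparkContext named sc, and the parameter values below are assumptions, not part of the original class:

// Toy sketch of LBFGS with a hinge loss (assumes a SparkContext named sc; labels are 0/1 as HingeGradient expects).
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.optimization.{HingeGradient, LBFGS, SquaredL2Updater}
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils

val toyPoints = sc.parallelize(Seq(
  LabeledPoint(0.0, Vectors.dense(0.1, 0.2)),
  LabeledPoint(1.0, Vectors.dense(0.9, 0.8))))
val withBias = toyPoints.map(p => (p.label, MLUtils.appendBias(p.features))).cache()
val initialWeights = Vectors.dense(new Array[Double](3)) // 2 features + bias term
val (weights, lossHistory) = LBFGS.runLBFGS(
  withBias, new HingeGradient(), new SquaredL2Updater(),
  10, 1e-4, 100, 0.1, initialWeights)
println(s"weights=$weights, final loss=${lossHistory.last}")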
Example 107
Source File: LogisticRegression.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg import org.apache.spark.mllib.classification.LogisticRegressionModel import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.LBFGS import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import se.uu.farmbio.cp.UnderlyingAlgorithm //Define a LogisticRegression UnderlyingAlgorithm private object LogisticRegression { def trainingProcedure( input: RDD[LabeledPoint], maxNumItearations: Int, regParam: Double, numCorrections: Int, convergenceTol: Double): (Vector => Double) = { //Train Logistic Regression with LBFGS val numFeatures = input.take(1)(0).features.size val training = input.map(x => (x.label, MLUtils.appendBias(x.features))).cache() val initialWeightsWithIntercept = Vectors.dense(new Array[Double](numFeatures + 1)) val (weightsWithIntercept, _) = LBFGS.runLBFGS( training, new LogisticGradient(), new SquaredL2Updater(), numCorrections, convergenceTol, maxNumItearations, regParam, initialWeightsWithIntercept) //Create the model using the weights val model = new LogisticRegressionModel( Vectors.dense(weightsWithIntercept.toArray.slice(0, weightsWithIntercept.size - 1)), weightsWithIntercept(weightsWithIntercept.size - 1)) //Return raw score predictor model.clearThreshold() model.predict } } class LogisticRegression( private val input: RDD[LabeledPoint], private val maxNumItearations: Int = 100, private val regParam: Double = 0.1, private val numCorrections: Int = 10, private val convergenceTol: Double = 1e-4) extends UnderlyingAlgorithm( LogisticRegression.trainingProcedure( input, maxNumItearations, regParam, numCorrections, convergenceTol)) { override def nonConformityMeasure(newSample: LabeledPoint) = { val score = predictor(newSample.features) if (newSample.label == 1.0) { 1-score } else { score } } }
Example 108
Source File: TestUtils.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp import scala.util.Random import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD object TestUtils { def generate4ClassesData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)((rnd.nextInt(100), rnd.nextInt(100))).map(r => { val label = if (r._1 < 50 && r._2 < 50) { 0.0 } else if (r._1 < 50) { 1.0 } else if (r._2 < 50) { 2.0 } else { 3.0 } new LabeledPoint(label, Vectors.dense(Array(r._1.toDouble, r._2.toDouble))) }) } def generate4ClassesTrainCalibTest(significance: Double) = { val numClasses = 4 val calibSamples = 4 * numClasses * (1 / significance - 1).ceil.toInt //4 times the minimum val training = generate4ClassesData(instances = 80, seed = Random.nextLong) val test = generate4ClassesData(instances = 20, seed = Random.nextLong) val calibration = generate4ClassesData(instances = calibSamples, seed = Random.nextLong) .toArray (training, calibration, test) } def generateBinaryData(instances: Int, seed: Long): Seq[LabeledPoint] = { val rnd = new Random(seed) Seq.fill(instances)(rnd.nextInt(100)).map(r => { val label = if (r < 50) { 0.0 } else { 1.0 } new LabeledPoint(label, Vectors.dense(r)) }) } def testPerformance[T <: UnderlyingAlgorithm]( model: ICPClassifierModel[T], test: RDD[LabeledPoint], sig: Double = 0.2, minEff: Double = 0.6, minRec: Double = 0.6) = { val pvAndLab = test.map { p => (model.mondrianPv(p.features), p.label) } val metrics = new BinaryClassificationICPMetrics(pvAndLab, Array(sig)) val eff = metrics.efficiencyBySignificance(sig) val rec = metrics.recallBySignificance(sig) eff >= minEff && rec >= minRec } }
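The helpers above return local Seqs and Arrays; a typical driver would parallelize them before training. A hypothetical sketch, where the SparkContext named sc is an assumption:

// Hypothetical driver sketch for the TestUtils helpers (assumes a SparkContext named sc).
val (training, calibration, test) = TestUtils.generate4ClassesTrainCalibTest(significance = 0.2)
val trainingRDD = sc.parallelize(training)
val testRDD = sc.parallelize(test)
println(s"training=${trainingRDD.count()}, calibration=${calibration.length}, test=${testRDD.count()}")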
Example 109
Source File: GamerSparkSQLExample.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.gamer.aggregates import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} object GamerSparkSQLExample { def main(args:Array[String]): Unit = { if (args.length == 0) { println("{kudumaster} {runLocal}") return } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val runLocal = args(1).equals("l") println("Loading Spark Context") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } println("Loading Spark Context: Finished") println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> "gamer", "kudu.master" -> kuduMaster)).registerTempTable("gamer") println("Query 1: SELECT count(*) FROM gamer") val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM gamer").take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT * FROM gamer limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) println("Query 3: SELECT * FROM gamer order_by last_time_played desc limit 100") val startTimeQ3 = System.currentTimeMillis() sqlContext.sql("SELECT * FROM gamer order by last_time_played desc limit 100").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 4: SELECT max(games_played), max(oks), max(damage_given) FROM gamer") val startTimeQ4 = System.currentTimeMillis() sqlContext.sql("SELECT max(games_played), max(oks), max(damage_given) FROM gamer").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 4: " + (System.currentTimeMillis() - startTimeQ4)) println("Query 5 + MLLIB: SELECT gamer_id, oks, games_won, games_played FROM gamer" ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT gamer_id, oks, games_won, games_played FROM gamer") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble, r.getInt(3).toDouble) Vectors.dense(array) }) val dataCount = parsedData.count() if (dataCount > 0) { val clusters = KMeans.train(parsedData, 3, 5) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) } //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } }
Example 110
Source File: BasicSparkSQLExamples.scala From SparkOnKudu with Apache License 2.0 | 5 votes |
package org.kududb.spark.demo.basic import org.apache.log4j.{Level, Logger} import org.apache.spark.sql.SQLContext import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object BasicSparkSQLExamples { def main(args:Array[String]): Unit = { if (args.length == 0) { println("<kuduMaster> <tablename> <runLocal>") } Logger.getRootLogger.setLevel(Level.ERROR) val kuduMaster = args(0) val tableName = args(1) val runLocal = args(2).equals("l") println("starting") var sc:SparkContext = null if (runLocal) { val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sc = new SparkContext("local", "TableStatsSinglePathMain", sparkConfig) } else { val sparkConfig = new SparkConf().setAppName("TableStatsSinglePathMain") sc = new SparkContext(sparkConfig) } try { println("Setting up Tables") val sqlContext = new SQLContext(sc) sqlContext.load("org.kududb.spark", Map("kudu.table" -> tableName, "kudu.master" -> kuduMaster)).registerTempTable(tableName) println("Query 1: SELECT count(*) FROM " + tableName) val startTimeQ1 = System.currentTimeMillis() sqlContext.sql("SELECT count(*) FROM " + tableName).take(10).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 1: " + (System.currentTimeMillis() - startTimeQ1)) println("Query 2: SELECT key_id, col_1 FROM " + tableName + " limit 100") val startTimeQ2 = System.currentTimeMillis() sqlContext.sql("SELECT key_id, col_1 FROM " + tableName + " limit 100 ").take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 2: " + (System.currentTimeMillis() - startTimeQ2)) val q3 = "select key_id from " + tableName + " a join (SELECT max(col_1) col_max FROM " + tableName + ") b on (a.col_1 = b.col_max)" println("Query 3: " + q3) val startTimeQ3 = System.currentTimeMillis() sqlContext.sql(q3).take(100).foreach(r => { println(" - (" + r + ")") }) println("Finish Query 3: " + (System.currentTimeMillis() - startTimeQ3)) println("Query 5 + MLLIB: SELECT key_id, col_1, col_2 FROM " + tableName ) val startTimeQ5 = System.currentTimeMillis() val resultDf = sqlContext.sql("SELECT key_id, col_1, col_2 FROM " + tableName + " limit 1000") val parsedData = resultDf.map(r => { val array = Array(r.getInt(1).toDouble, r.getInt(2).toDouble) Vectors.dense(array) }) val clusters = KMeans.train(parsedData, 3, 4) clusters.clusterCenters.foreach(v => println(" Vector Center:" + v)) //TODO add Mllib here println("Finish Query 5 + MLLIB: " + (System.currentTimeMillis() - startTimeQ5)) } finally { sc.stop() } } }
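Both Kudu examples train a k-means model and print only the cluster centers. A sketch of scoring the clustering afterwards; the toy vectors and the SparkContext named sc are assumptions:

// Sketch of scoring a trained k-means model on toy data (assumes a SparkContext named sc).
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

val toyData = sc.parallelize(Seq(
  Vectors.dense(1.0, 1.0), Vectors.dense(1.1, 0.9),
  Vectors.dense(8.0, 8.0), Vectors.dense(8.2, 7.9)))
val toyModel = KMeans.train(toyData, 2, 10)
println("WSSSE = " + toyModel.computeCost(toyData)) // within-set sum of squared errors
println("cluster of (1,1) = " + toyModel.predict(Vectors.dense(1.0, 1.0)))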
Example 111
package org.dizhang.seqspark.stat import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.mllib.feature.{PCA => SPCA} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.dizhang.seqspark.ds.{DenseCounter, Genotype, SparseCounter} import org.dizhang.seqspark.util.General._ import org.dizhang.seqspark.worker.Data import org.slf4j.LoggerFactory } def pc(n: Int): BDM[Double] = { val model = new SPCA(n) val data = this.prepare if (data.isEmpty()) { new BDM[Double](0, 0) } else { val res = model.fit(data).pc.values new BDM(res.length/n, n, res) } } }
Example 112
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.classification.stumbleupon import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
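The pipeline reports plain accuracy on the training set; because the threshold is cleared, a ranking metric such as area under the ROC curve is also easy to compute. A hedged addition that would sit inside svmPipeline after svmModel and data are defined:

// Optional sketch: AUC for the raw scores (place inside svmPipeline, after training).
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

val scoreAndLabels = data.map(point => (svmModel.predict(point.features), point.label))
val metrics = new BinaryClassificationMetrics(scoreAndLabels)
println("Area under ROC = " + metrics.areaUnderROC())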
Example 114
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, org.sparksamples.Util.SPARK_HOME + "/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 115
Source File: StandardScalarSample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils import org.apache.spark.{SparkConf, SparkContext} object StandardScalarSample { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local").setAppName("Word2Vector") val sc = new SparkContext(conf) val data = MLUtils.loadLibSVMFile(sc, "/home/ubuntu/work/spark-1.6.0-bin-hadoop2.6/data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) println(data1.first()) // Without converting the features into dense vectors, transformation with zero mean will raise // exception on sparse vector. // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) println(data2.first()) } }
Example 116
Source File: SVMPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.stumbleuponclassifier import org.apache.log4j.Logger import org.apache.spark.SparkContext import org.apache.spark.mllib.classification.SVMWithSGD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint object SVMPipeline { @transient lazy val logger = Logger.getLogger(getClass.getName) def svmPipeline(sc: SparkContext) = { val records = sc.textFile("/home/ubuntu/work/ml-resources/spark-ml/train_noheader.tsv").map(line => line.split("\t")) val data = records.map { r => val trimmed = r.map(_.replaceAll("\"", "")) val label = trimmed(r.size - 1).toInt val features = trimmed.slice(4, r.size - 1).map(d => if (d == "?") 0.0 else d.toDouble) LabeledPoint(label, Vectors.dense(features)) } // params for SVM val numIterations = 10 // Run training algorithm to build the model val svmModel = SVMWithSGD.train(data, numIterations) // Clear the default threshold. svmModel.clearThreshold() val svmTotalCorrect = data.map { point => if(svmModel.predict(point.features) == point.label) 1 else 0 }.sum() // calculate accuracy val svmAccuracy = svmTotalCorrect / data.count() println(svmAccuracy) } }
Example 117
Source File: MovieLensDataPowerIterationClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples import org.apache.spark.mllib.recommendation.{ALS, Rating} import org.apache.spark.{SparkConf, SparkContext} object MovieLensDataPowerIterationClustering { val PATH= "../data/ml-100k" def main(args: Array[String]): Unit = { val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp"). set("spark.driver.allowMultipleContexts", "true") val sc = new SparkContext(spConfig) //val path = PATH + "../data/" //val rdd = sc.wholeTextFiles(path) val movies = sc.textFile(PATH + "/u.item") println(movies.first) val genres = sc.textFile(PATH + "/u.genre") genres.take(5).foreach(println) val genreMap = genres.filter(!_.isEmpty).map(line => line.split("\\|")). map(array => (array(1), array(0))).collectAsMap val titlesAndGenres = movies.map(_.split("\\|")).map { array => val genres = array.toSeq.slice(5, array.size) val genresAssigned = genres.zipWithIndex.filter { case (g, idx) => g == "1" }.map { case (g, idx) => genreMap(idx.toString) } (array(0).toInt, (array(1), genresAssigned)) } val rawData = sc.textFile(PATH + "/u.data") val rawRatings = rawData.map(_.split("\t").take(3)) val ratings = rawRatings.map{ case Array(user, movie, rating) => Rating(user.toInt, movie.toInt, rating.toDouble) } ratings.cache val alsModel = ALS.train(ratings, 50, 10, 0.1) import org.apache.spark.mllib.linalg.Vectors val movieFactors = alsModel.productFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) } val movieVectors = movieFactors.map(_._2) val userFactors = alsModel.userFeatures.map { case (id, factor) => (id, Vectors.dense(factor)) } val userVectors = userFactors.map(_._2) val numClusters = 5 val numIterations = 10 val numRuns = 3 import org.apache.spark.mllib.clustering.PowerIterationClustering //val bKMeans = new PowerIterationClustering()() val piClustering = new PowerIterationClustering() piClustering.setMaxIterations(10) piClustering.setK(numClusters) println("done") } }
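This example configures PowerIterationClustering but stops before running it; PIC consumes an affinity graph of (srcId, dstId, similarity) triples rather than the ALS factor vectors directly. A sketch on a toy affinity graph; the triples and the SparkContext named sc are assumptions standing in for whatever similarity the author intended:

// Running PIC on a toy affinity graph (assumes a SparkContext named sc).
import org.apache.spark.mllib.clustering.PowerIterationClustering

val similarities = sc.parallelize(Seq(
  (0L, 1L, 0.9), (1L, 2L, 0.8), (0L, 2L, 0.7), // a tight triangle
  (3L, 4L, 0.9), (4L, 5L, 0.8), (3L, 5L, 0.7), // a second tight triangle
  (2L, 3L, 0.1)))                              // weak link between the two groups
val picModel = new PowerIterationClustering().setK(2).setMaxIterations(10).run(similarities)
picModel.assignments.collect().foreach(a => println(s"point ${a.id} -> cluster ${a.cluster}"))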
Example 118
Source File: GMMClustering.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.gmm

// scalastyle:off println
// $example on$
import org.apache.spark.SparkConf
import org.apache.spark.ml.clustering.{GaussianMixture, KMeans}
// $example off$
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.sql.SparkSession

object GMMClustering {

  def main(args: Array[String]): Unit = {
    val spConfig = (new SparkConf).setMaster("local[1]").setAppName("SparkApp").
      set("spark.driver.allowMultipleContexts", "true")
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example")
      .config(spConfig)
      .getOrCreate()

    val datasetUsers = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_users_libsvm/part-00000")
    datasetUsers.show(3)
    val gmmUsers = new GaussianMixture().setK(5).setSeed(1L)
    val modelUsers = gmmUsers.fit(datasetUsers)
    for (i <- 0 until modelUsers.gaussians.length) {
      println("Users : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelUsers.weights(i), modelUsers.gaussians(i).cov, modelUsers.gaussians(i).mean))
    }

    val dataSetItems = spark.read.format("libsvm").load(
      "./data/movie_lens_libsvm/movie_lens_items_libsvm/part-00000")
    val gmmItems = new GaussianMixture().setK(5).setSeed(1L)
    val modelItems = gmmItems.fit(dataSetItems)
    // Report the item model's parameters (the original printed the user model here by mistake).
    for (i <- 0 until modelItems.gaussians.length) {
      println("Items : weight=%f\ncov=%s\nmean=\n%s\n" format
        (modelItems.weights(i), modelItems.gaussians(i).cov, modelItems.gaussians(i).mean))
    }
    spark.stop()
  }

  def loadInLibSVMFormat(line: String, noOfFeatures: Int): LabeledPoint = {
    val items = line.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val indexAndValue = item.split(':')
      val index = indexAndValue(0).toInt - 1 // Convert 1-based indices to 0-based.
      val value = indexAndValue(1).toDouble
      (index, value)
    }.unzip
    // check if indices are one-based and in ascending order
    var previous = -1
    var i = 0
    val indicesLength = indices.length
    while (i < indicesLength) {
      val current = indices(i)
      require(current > previous, "indices should be one-based and in ascending order")
      previous = current
      i += 1
    }
    val d = noOfFeatures
    LabeledPoint(label, Vectors.sparse(d, indices, values))
  }
}
Example 119
Source File: LDATextExample.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.sparksamples.lda import scala.collection.mutable import org.apache.spark.mllib.clustering.LDA import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext object LDATextExample { val PATH = "/home/ubuntu/work/spark-src/spark/" val sc = new SparkContext("local[2]", "First Spark App") def main(args: Array[String]): Unit = { // Load documents from text files, 1 document per file val corpus: RDD[String] = sc.wholeTextFiles(PATH + "docs/*.md").map(_._2) // Split each document into a sequence of terms (words) val tokenized: RDD[Seq[String]] = corpus.map(_.toLowerCase.split("\\s")).map(_.filter(_.length > 3). filter(_.forall(java.lang.Character.isLetter))) // Choose the vocabulary. // termCounts: Sorted list of (term, termCount) pairs val termCounts: Array[(String, Long)] = tokenized.flatMap(_.map(_ -> 1L)).reduceByKey(_ + _).collect().sortBy(-_._2) // vocabArray: Chosen vocab (removing common terms) val numStopwords = 20 val vocabArray: Array[String] = termCounts.takeRight(termCounts.size - numStopwords).map(_._1) // vocab: Map term -> term index val vocab: Map[String, Int] = vocabArray.zipWithIndex.toMap // Convert documents into term count vectors val documents: RDD[(Long, Vector)] = tokenized.zipWithIndex.map { case (tokens, id) => val counts = new mutable.HashMap[Int, Double]() tokens.foreach { term => if (vocab.contains(term)) { val idx = vocab(term) counts(idx) = counts.getOrElse(idx, 0.0) + 1.0 } } (id, Vectors.sparse(vocab.size, counts.toSeq)) } // Set LDA parameters val numTopics = 10 val lda = new LDA().setK(numTopics).setMaxIterations(10) val ldaModel = lda.run(documents) //val avgLogLikelihood = ldaModel. / documents.count() // Print topics, showing top-weighted 10 terms for each topic. val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 10) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() } } }
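The commented-out average log-likelihood can be recovered when the default EM optimizer is used, because run() then returns a DistributedLDAModel. A sketch under that assumption, to be placed after ldaModel is trained:

// Average log-likelihood per document (assumes the default EM optimizer, so the cast is safe).
import org.apache.spark.mllib.clustering.DistributedLDAModel

val distributedModel = ldaModel.asInstanceOf[DistributedLDAModel]
val avgLogLikelihood = distributedModel.logLikelihood / documents.count()
println(s"Average log-likelihood per document: $avgLogLikelihood")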
Example 120
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.matrix import org.apache.spark.ml.linalg.Matrix import org.apache.spark.ml.linalg.Matrices import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.spark.mllib.linalg.distributed.IndexedRow import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.MatrixEntry object SparkMatrix { def main(args: Array[String]) { val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0)) println("dMatrix: \n" + dMatrix) val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7)) println("sMatrixOne: \n" + sMatrixOne) val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7)) println("sMatrixTwo: \n" + sMatrixTwo) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp") val sc = new SparkContext(spConfig) val denseData = Seq( Vectors.dense(0.0, 1.0, 2.1), Vectors.dense(3.0, 2.0, 4.0), Vectors.dense(5.0, 7.0, 8.0), Vectors.dense(9.0, 0.0, 1.1) ) val sparseData = Seq( Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))), Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))), Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))), Vectors.sparse(3, Seq((0, 9.0), (2, 1.0))) ) val denseMat = new RowMatrix(sc.parallelize(denseData, 2)) val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2)) println("Dense Matrix - Num of Rows :" + denseMat.numRows()) println("Dense Matrix - Num of Cols:" + denseMat.numCols()) println("Sparse Matrix - Num of Rows :" + sparseMat.numRows()) println("Sparse Matrix - Num of Cols:" + sparseMat.numCols()) val data = Seq( (0L, Vectors.dense(0.0, 1.0, 2.0)), (1L, Vectors.dense(3.0, 4.0, 5.0)), (3L, Vectors.dense(9.0, 0.0, 1.0)) ).map(x => IndexedRow(x._1, x._2)) val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2) val indexedRowsMat = new IndexedRowMatrix(indexedRows) println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows()) println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols()) val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } val coordinateMat = new CoordinateMatrix(entries) println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows()) println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols()) sc.stop() } }
Example 121
Source File: SparkSVDExampleOne.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.svd import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.{Matrix, SingularValueDecomposition, Vector, Vectors} object SparkSVDExampleOne { def main(args: Array[String]) { val denseData = Seq( Vectors.dense(0.0, 1.0, 2.0, 1.0, 5.0, 3.3, 2.1), Vectors.dense(3.0, 4.0, 5.0, 3.1, 4.5, 5.1, 3.3), Vectors.dense(6.0, 7.0, 8.0, 2.1, 6.0, 6.7, 6.8), Vectors.dense(9.0, 0.0, 1.0, 3.4, 4.3, 1.0, 1.0) ) val spConfig = (new SparkConf).setMaster("local").setAppName("SparkSVDDemo") val sc = new SparkContext(spConfig) val mat: RowMatrix = new RowMatrix(sc.parallelize(denseData, 2)) // Compute the top 20 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(7, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. println("U:" + U) println("s:" + s) println("V:" + V) sc.stop() } }
Example 122
Source File: SparkSGD.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.sgd import scala.util.Random import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.optimization.GradientDescent import org.apache.spark.mllib.optimization.SquaredL2Updater import org.apache.spark.mllib.optimization.LogisticGradient import org.apache.spark.SparkContext object SparkSGD { def main(args: Array[String]): Unit = { val m = 4 val n = 200000 val sc = new SparkContext("local[2]", "") val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => (1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() val (weights, loss) = GradientDescent.runMiniBatchSGD( points, new LogisticGradient, new SquaredL2Updater, 0.1, 2, 1.0, 1.0, Vectors.dense(new Array[Double](n))) println("w:" + weights(0)) println("loss:" + loss(0)) sc.stop() } }
Example 123
Source File: DataFrameExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 124
Source File: SummaryStatisticsExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 125
Source File: PCAOnSourceVectorExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 126
Source File: PCAOnRowMatrixExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 127
Source File: TallSkinnyPCA.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 128
Source File: GaussianMixtureExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
Example 129
Source File: PCAExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 130
Source File: DenseKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 131
Source File: CosineSimilarity.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 132
Source File: ElementwiseProductExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 133
Source File: SVDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
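A shorter sketch of the same call keeping only the top two singular values (hypothetical data, SparkContext sc assumed), which is the more common usage when the full decomposition is not needed:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

val rows = sc.parallelize(Seq(
  Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
  Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)))
val mat = new RowMatrix(rows)

// Keep only the top 2 singular values; skip U when only s and V are needed.
val svd = mat.computeSVD(2, computeU = false)
println(s"Singular values: ${svd.s}")
println(s"V:\n${svd.V}")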
Example 134
Source File: TallSkinnySVD.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 135
Source File: StandardScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
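Mean centering cannot be represented sparsely, which appears to be why the example densifies the features explicitly before applying scaler2. A small sketch of the two scalers on dense data (hypothetical values, SparkContext sc assumed):

import org.apache.spark.mllib.feature.StandardScaler
import org.apache.spark.mllib.linalg.Vectors

val features = sc.parallelize(Seq(
  Vectors.dense(1.0, 0.0, 3.0),
  Vectors.dense(2.0, 0.0, 5.0)))

// withStd only: pure scaling. withMean and withStd: centering plus scaling, dense output.
val stdOnly = new StandardScaler().fit(features)
val meanAndStd = new StandardScaler(withMean = true, withStd = true).fit(features)

println(stdOnly.transform(Vectors.dense(1.0, 0.0, 3.0)))
println(meanAndStd.transform(Vectors.dense(1.0, 0.0, 3.0)))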
Example 136
Source File: KMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
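Once trained, the model can score new points directly. A short sketch of the calls that usually follow the example above, reusing its clusters model (the query point is hypothetical):

import org.apache.spark.mllib.linalg.Vectors

// Assign a new observation to its nearest cluster and inspect the centers.
val point = Vectors.dense(0.2, 0.2, 0.2)
val clusterId = clusters.predict(point)
println(s"Point $point belongs to cluster $clusterId")
clusters.clusterCenters.foreach(println)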
Example 137
Source File: MultivariateSummarizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils object MultivariateSummarizer { case class Params(input: String = "data/mllib/sample_linear_regression_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("MultivariateSummarizer") { head("MultivariateSummarizer: an example app for MultivariateOnlineSummarizer") opt[String]("input") .text(s"input path to labeled examples, default: ${defaultParams.input}") .action((x, c) => c.copy(input = x)) note( """ |For example, the following command runs this app on a synthetic dataset: | | bin/spark-submit --class org.apache.spark.examples.mllib.MultivariateSummarizer \ | examples/target/scala-*/spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
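MultivariateOnlineSummarizer can also be driven locally without an RDD, which makes the aggregate call above easier to follow. A minimal sketch with hypothetical vectors:

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

// add() folds in one vector at a time; merge() combines partial summaries, as in the aggregate above.
val s1 = new MultivariateOnlineSummarizer()
  .add(Vectors.dense(1.0, 10.0))
  .add(Vectors.dense(2.0, 20.0))
val s2 = new MultivariateOnlineSummarizer().add(Vectors.dense(3.0, 30.0))

val merged = s1.merge(s2)
println(merged.mean)
println(merged.variance)
println(merged.numNonzeros)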
Example 138
Source File: LinearRegressionWithSGDExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
Example 139
Source File: StreamingLinearRegressionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 140
Source File: BisectingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println
Example 141
Source File: StreamingKMeansExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.clustering.StreamingKMeans import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.streaming.{Seconds, StreamingContext} // $example off$ object StreamingKMeansExample { def main(args: Array[String]) { if (args.length != 5) { System.err.println( "Usage: StreamingKMeansExample " + "<trainingDir> <testDir> <batchDuration> <numClusters> <numDimensions>") System.exit(1) } // $example on$ val conf = new SparkConf().setAppName("StreamingKMeansExample") val ssc = new StreamingContext(conf, Seconds(args(2).toLong)) val trainingData = ssc.textFileStream(args(0)).map(Vectors.parse) val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val model = new StreamingKMeans() .setK(args(3).toInt) .setDecayFactor(1.0) .setRandomCenters(args(4).toInt, 0.0) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ } } // scalastyle:on println
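Because the training stream is parsed with Vectors.parse, the files dropped into the training directory must contain one vector per line in MLlib's text format. A short sketch of the two accepted forms:

import org.apache.spark.mllib.linalg.Vectors

// Dense: a bracketed list of values. Sparse: (size, [indices], [values]).
val dense = Vectors.parse("[1.0,2.0,3.0]")
val sparse = Vectors.parse("(3,[0,2],[1.0,3.0])")
println(dense)
println(sparse)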
Example 142
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.annotation.Since import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} @Since("1.1.0") override def transform(vector: Vector): Vector = { val norm = Vectors.norm(vector, p) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. vector } } }
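The snippet above omits the surrounding class declaration (the transform shown lives in Spark's mllib Normalizer, parameterized by the norm degree p). A brief usage sketch of the public API:

import org.apache.spark.mllib.feature.Normalizer
import org.apache.spark.mllib.linalg.Vectors

// p = 2 (the default) rescales each vector to unit Euclidean length;
// sparse vectors keep their index array, as the transform above shows.
val l2 = new Normalizer()
val l1 = new Normalizer(p = 1.0)

println(l2.transform(Vectors.dense(3.0, 4.0)))                      // [0.6, 0.8]
println(l1.transform(Vectors.sparse(3, Seq((0, 2.0), (2, 2.0)))))   // stored values become 0.5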
Example 143
Source File: GaussianMixtureModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import scala.collection.JavaConverters import org.apache.spark.SparkContext import org.apache.spark.mllib.clustering.GaussianMixtureModel import org.apache.spark.mllib.linalg.{Vector, Vectors} private[python] class GaussianMixtureModelWrapper(model: GaussianMixtureModel) { val gaussians: Array[Byte] = { val modelGaussians = model.gaussians.map { gaussian => Array[Any](gaussian.mu, gaussian.sigma) } SerDe.dumps(JavaConverters.seqAsJavaListConverter(modelGaussians).asJava) } def predictSoft(point: Vector): Vector = { Vectors.dense(model.predictSoft(point)) } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 144
Source File: Word2VecModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import java.util.{List => JList, Map => JMap} import scala.collection.JavaConverters._ import org.apache.spark.SparkContext import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.feature.Word2VecModel import org.apache.spark.mllib.linalg.{Vector, Vectors} private[python] class Word2VecModelWrapper(model: Word2VecModel) { def findSynonyms(vector: Vector, num: Int): JList[Object] = { prepareResult(model.findSynonyms(vector, num)) } private def prepareResult(result: Array[(String, Double)]) = { val similarity = Vectors.dense(result.map(_._2)) val words = result.map(_._1) List(words, similarity).map(_.asInstanceOf[Object]).asJava } def getVectors: JMap[String, JList[Float]] = { model.getVectors.map { case (k, v) => (k, v.toList.asJava) }.asJava } def save(sc: SparkContext, path: String): Unit = model.save(sc, path) }
Example 145
Source File: MatrixFactorizationModelWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.api.java.JavaRDD import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.recommendation.{MatrixFactorizationModel, Rating} import org.apache.spark.rdd.RDD private[python] class MatrixFactorizationModelWrapper(model: MatrixFactorizationModel) extends MatrixFactorizationModel(model.rank, model.userFeatures, model.productFeatures) { def predict(userAndProducts: JavaRDD[Array[Any]]): RDD[Rating] = predict(SerDe.asTupleRDD(userAndProducts.rdd)) def getUserFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(userFeatures.map { case (user, feature) => (user, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def getProductFeatures: RDD[Array[Any]] = { SerDe.fromTuple2RDD(productFeatures.map { case (product, feature) => (product, Vectors.dense(feature)) }.asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendProductsForUsers(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendProductsForUsers(num).asInstanceOf[RDD[(Any, Any)]]) } def wrappedRecommendUsersForProducts(num: Int): RDD[Array[Any]] = { SerDe.fromTuple2RDD(recommendUsersForProducts(num).asInstanceOf[RDD[(Any, Any)]]) } }
Example 146
Source File: SpearmanCorrelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.correlation import scala.collection.mutable.ArrayBuffer import org.apache.spark.internal.Logging import org.apache.spark.mllib.linalg.{Matrix, Vector, Vectors} import org.apache.spark.rdd.RDD override def computeCorrelationMatrix(X: RDD[Vector]): Matrix = { // ((columnIndex, value), rowUid) val colBased = X.zipWithUniqueId().flatMap { case (vec, uid) => vec.toArray.view.zipWithIndex.map { case (v, j) => ((j, v), uid) } } // global sort by (columnIndex, value) val sorted = colBased.sortByKey() // assign global ranks (using average ranks for tied values) val globalRanks = sorted.zipWithIndex().mapPartitions { iter => var preCol = -1 var preVal = Double.NaN var startRank = -1.0 var cachedUids = ArrayBuffer.empty[Long] val flush: () => Iterable[(Long, (Int, Double))] = () => { val averageRank = startRank + (cachedUids.size - 1) / 2.0 val output = cachedUids.map { uid => (uid, (preCol, averageRank)) } cachedUids.clear() output } iter.flatMap { case (((j, v), uid), rank) => // If we see a new value or cachedUids is too big, we flush ids with their average rank. if (j != preCol || v != preVal || cachedUids.size >= 10000000) { val output = flush() preCol = j preVal = v startRank = rank cachedUids += uid output } else { cachedUids += uid Iterator.empty } } ++ flush() } // Replace values in the input matrix by their ranks compared with values in the same column. // Note that shifting all ranks in a column by a constant value doesn't affect result. val groupedRanks = globalRanks.groupByKey().map { case (uid, iter) => // sort by column index and then convert values to a vector Vectors.dense(iter.toSeq.sortBy(_._1).map(_._2).toArray) } PearsonCorrelation.computeCorrelationMatrix(groupedRanks) } }
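This internal object is normally reached through Statistics.corr with the "spearman" method name rather than called directly. A minimal sketch (hypothetical data, SparkContext sc assumed):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.stat.Statistics

val observations = sc.parallelize(Seq(
  Vectors.dense(1.0, 10.0, 100.0),
  Vectors.dense(2.0, 30.0, 200.0),
  Vectors.dense(3.0, 20.0, 300.0)))

// "pearson" is the default; "spearman" routes to the rank-based computation above.
val rho = Statistics.corr(observations, "spearman")
println(rho)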
Example 147
Source File: LogisticRegressionDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object LogisticRegressionDataGenerator { @Since("0.8.0") def generateLogisticRDD( sc: SparkContext, nexamples: Int, nfeatures: Int, eps: Double, nparts: Int = 2, probOne: Double = 0.5): RDD[LabeledPoint] = { val data = sc.parallelize(0 until nexamples, nparts).map { idx => val rnd = new Random(42 + idx) val y = if (idx % 2 == 0) 0.0 else 1.0 val x = Array.fill[Double](nfeatures) { rnd.nextGaussian() + (y * eps) } LabeledPoint(y, Vectors.dense(x)) } data } @Since("0.8.0") def main(args: Array[String]) { if (args.length != 5) { // scalastyle:off println println("Usage: LogisticRegressionGenerator " + "<master> <output_dir> <num_examples> <num_features> <num_partitions>") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val eps = 3 val sc = new SparkContext(sparkMaster, "LogisticRegressionDataGenerator") val data = generateLogisticRDD(sc, nexamples, nfeatures, eps, parts) data.saveAsTextFile(outputPath) sc.stop() } }
Example 148
Source File: SVMDataGenerator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.util import scala.util.Random import com.github.fommil.netlib.BLAS.{getInstance => blas} import org.apache.spark.SparkContext import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD @DeveloperApi @Since("0.8.0") object SVMDataGenerator { @Since("0.8.0") def main(args: Array[String]) { if (args.length < 2) { // scalastyle:off println println("Usage: SVMGenerator " + "<master> <output_dir> [num_examples] [num_features] [num_partitions]") // scalastyle:on println System.exit(1) } val sparkMaster: String = args(0) val outputPath: String = args(1) val nexamples: Int = if (args.length > 2) args(2).toInt else 1000 val nfeatures: Int = if (args.length > 3) args(3).toInt else 2 val parts: Int = if (args.length > 4) args(4).toInt else 2 val sc = new SparkContext(sparkMaster, "SVMGenerator") val globalRnd = new Random(94720) val trueWeights = Array.fill[Double](nfeatures)(globalRnd.nextGaussian()) val data: RDD[LabeledPoint] = sc.parallelize(0 until nexamples, parts).map { idx => val rnd = new Random(42 + idx) val x = Array.fill[Double](nfeatures) { rnd.nextDouble() * 2.0 - 1.0 } val yD = blas.ddot(trueWeights.length, x, 1, trueWeights, 1) + rnd.nextGaussian() * 0.1 val y = if (yD < 0) 0.0 else 1.0 LabeledPoint(y, Vectors.dense(x)) } data.saveAsTextFile(outputPath) sc.stop() } }
Example 149
Source File: LabeledPoint.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.beans.BeanInfo import org.apache.spark.annotation.Since import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.util.NumericParser import org.apache.spark.SparkException @Since("1.1.0") def parse(s: String): LabeledPoint = { if (s.startsWith("(")) { NumericParser.parse(s) match { case Seq(label: Double, numeric: Any) => LabeledPoint(label, Vectors.parseNumeric(numeric)) case other => throw new SparkException(s"Cannot parse $other.") } } else { // dense format used before v1.0 val parts = s.split(',') val label = java.lang.Double.parseDouble(parts(0)) val features = Vectors.dense(parts(1).trim().split(' ').map(java.lang.Double.parseDouble)) LabeledPoint(label, features) } } private[spark] def fromML(point: NewLabeledPoint): LabeledPoint = { LabeledPoint(point.label, Vectors.fromML(point.features)) } }
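The parse method above accepts both the current text format and the pre-1.0 dense format. A short sketch of each (hypothetical strings):

import org.apache.spark.mllib.regression.LabeledPoint

// Current format: (label, vector), where the vector uses Vectors.parse notation.
val current = LabeledPoint.parse("(1.0,[1.0,0.0,3.0])")
// Pre-1.0 dense format: label, then space-separated feature values.
val legacy = LabeledPoint.parse("1.0,1.0 0.0 3.0")
println(current)
println(legacy)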
Example 150
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext { test("ChiSqSelector transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(8.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(0.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0)))) val model = new ChiSqSelector(1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("ChiSqSelector by fpr transform test (sparse & dense vector)") { val labeledDiscreteData = sc.parallelize( Seq(LabeledPoint(0.0, Vectors.sparse(4, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(4, Array((1, 9.0), (2, 6.0), (3, 4.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0, 4.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0, 9.0)))), 2) val preFilteredData = Seq(LabeledPoint(0.0, Vectors.dense(Array(0.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(1.0, Vectors.dense(Array(4.0))), LabeledPoint(2.0, Vectors.dense(Array(9.0)))) val model: ChiSqSelectorModel = new ChiSqSelector().setSelectorType("fpr") .setFpr(0.1).fit(labeledDiscreteData) val filteredData = labeledDiscreteData.map { lp => LabeledPoint(lp.label, model.transform(lp.features)) }.collect().toSeq assert(filteredData === preFilteredData) } test("model load / save") { val model = ChiSqSelectorSuite.createModel() val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString try { model.save(sc, path) val sameModel = ChiSqSelectorModel.load(sc, path) ChiSqSelectorSuite.checkEqual(model, sameModel) } finally { Utils.deleteRecursively(tempDir) } } } object ChiSqSelectorSuite extends SparkFunSuite { def createModel(): ChiSqSelectorModel = { val arr = Array(1, 2, 3, 4) new ChiSqSelectorModel(arr) } def checkEqual(a: ChiSqSelectorModel, b: ChiSqSelectorModel): Unit = { assert(a.selectedFeatures.deep == b.selectedFeatures.deep) } }
Example 151
Source File: ElementwiseProductSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class ElementwiseProductSuite extends SparkFunSuite with MLlibTestSparkContext { test("elementwise (hadamard) product should properly apply vector to dense data set") { val denseData = Array( Vectors.dense(1.0, 4.0, 1.9, -9.0) ) val scalingVec = Vectors.dense(2.0, 0.5, 0.0, 0.25) val transformer = new ElementwiseProduct(scalingVec) val transformedData = transformer.transform(sc.makeRDD(denseData)) val transformedVecs = transformedData.collect() val transformedVec = transformedVecs(0) val expectedVec = Vectors.dense(2.0, 2.0, 0.0, -2.25) assert(transformedVec ~== expectedVec absTol 1E-5, s"Expected transformed vector $expectedVec but found $transformedVec") } test("elementwise (hadamard) product should properly apply vector to sparse data set") { val sparseData = Array( Vectors.sparse(3, Seq((1, -1.0), (2, -3.0))) ) val dataRDD = sc.parallelize(sparseData, 3) val scalingVec = Vectors.dense(1.0, 0.0, 0.5) val transformer = new ElementwiseProduct(scalingVec) val data2 = sparseData.map(transformer.transform) val data2RDD = transformer.transform(dataRDD) assert((sparseData, data2, data2RDD.collect()).zipped.forall { case (v1: DenseVector, v2: DenseVector, v3: DenseVector) => true case (v1: SparseVector, v2: SparseVector, v3: SparseVector) => true case _ => false }, "The vector type should be preserved after hadamard product") assert((data2, data2RDD.collect()).zipped.forall((v1, v2) => v1 ~== v2 absTol 1E-5)) assert(data2(0) ~== Vectors.sparse(3, Seq((1, 0.0), (2, -1.5))) absTol 1E-5) } }
Example 152
Source File: IDFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class IDFSuite extends SparkFunSuite with MLlibTestSparkContext { test("idf") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((m + 1.0) / (x + 1.0)) }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } test("idf minimum document frequency filtering") { val n = 4 val localTermFrequencies = Seq( Vectors.sparse(n, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(n, Array(1), Array(1.0)) ) val m = localTermFrequencies.size val termFrequencies = sc.parallelize(localTermFrequencies, 2) val idf = new IDF(minDocFreq = 1) val model = idf.fit(termFrequencies) val expected = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) { math.log((m + 1.0) / (x + 1.0)) } else { 0 } }) assert(model.idf ~== expected absTol 1e-12) val assertHelper = (tfidf: Array[Vector]) => { assert(tfidf.size === 3) val tfidf0 = tfidf(0).asInstanceOf[SparseVector] assert(tfidf0.indices === Array(1, 3)) assert(Vectors.dense(tfidf0.values) ~== Vectors.dense(1.0 * expected(1), 2.0 * expected(3)) absTol 1e-12) val tfidf1 = tfidf(1).asInstanceOf[DenseVector] assert(Vectors.dense(tfidf1.values) ~== Vectors.dense(0.0, 1.0 * expected(1), 2.0 * expected(2), 3.0 * expected(3)) absTol 1e-12) val tfidf2 = tfidf(2).asInstanceOf[SparseVector] assert(tfidf2.indices === Array(1)) assert(tfidf2.values(0) ~== (1.0 * expected(1)) absTol 1e-12) } // Transforms a RDD val tfidf = model.transform(termFrequencies).collect() assertHelper(tfidf) // Transforms local vectors val localTfidf = localTermFrequencies.map(model.transform(_)).toArray assertHelper(localTfidf) } }
Example 153
Source File: PCASuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class PCASuite extends SparkFunSuite with MLlibTestSparkContext { private val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) private lazy val dataRDD = sc.parallelize(data, 2) test("Correct computing use a PCA wrapper") { val k = dataRDD.count().toInt val pca = new PCA(k).fit(dataRDD) val mat = new RowMatrix(dataRDD) val (pc, explainedVariance) = mat.computePrincipalComponentsAndExplainedVariance(k) val pca_transform = pca.transform(dataRDD).collect() val mat_multiply = mat.multiply(pc).rows.collect() pca_transform.zip(mat_multiply).foreach { case (calculated, expected) => assert(calculated ~== expected relTol 1e-8) } assert(pca.explainedVariance ~== explainedVariance relTol 1e-8) } }
Example 154
Source File: HashingTFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.feature import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext { test("hashing tf on a single doc") { val hashingTF = new HashingTF(1000) val doc = "a a b b c d".split(" ") val n = hashingTF.numFeatures val termFreqs = Seq( (hashingTF.indexOf("a"), 2.0), (hashingTF.indexOf("b"), 2.0), (hashingTF.indexOf("c"), 1.0), (hashingTF.indexOf("d"), 1.0)) assert(termFreqs.map(_._1).forall(i => i >= 0 && i < n), "index must be in range [0, #features)") assert(termFreqs.map(_._1).toSet.size === 4, "expecting perfect hashing") val expected = Vectors.sparse(n, termFreqs) assert(hashingTF.transform(doc) === expected) } test("hashing tf on an RDD") { val hashingTF = new HashingTF val localDocs: Seq[Seq[String]] = Seq( "a a b b b c d".split(" "), "a b c d a b c".split(" "), "c b a c b a a".split(" ")) val docs = sc.parallelize(localDocs, 2) assert(hashingTF.transform(docs).collect().toSet === localDocs.map(hashingTF.transform).toSet) } test("applying binary term freqs") { val hashingTF = new HashingTF(100).setBinary(true) val doc = "a a b c c c".split(" ") val n = hashingTF.numFeatures val expected = Vectors.sparse(n, Seq( (hashingTF.indexOf("a"), 1.0), (hashingTF.indexOf("b"), 1.0), (hashingTF.indexOf("c"), 1.0))) assert(hashingTF.transform(doc) ~== expected absTol 1e-14) } }
Example 155
Source File: EnsembleTestHelper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 156
Source File: PythonMLLibAPISuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.api.python import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} import org.apache.spark.mllib.recommendation.Rating import org.apache.spark.mllib.regression.LabeledPoint class PythonMLLibAPISuite extends SparkFunSuite { SerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = SerDe.loads(SerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle labeled point") { val points = Seq( LabeledPoint(0.0, Vectors.dense(Array.empty[Double])), LabeledPoint(1.0, Vectors.dense(0.0)), LabeledPoint(-0.5, Vectors.dense(0.0, -2.0)), LabeledPoint(0.0, Vectors.sparse(0, Array.empty[Int], Array.empty[Double])), LabeledPoint(1.0, Vectors.sparse(1, Array.empty[Int], Array.empty[Double])), LabeledPoint(-0.5, Vectors.sparse(2, Array(1), Array(-2.0)))) points.foreach { p => val q = SerDe.loads(SerDe.dumps(p)).asInstanceOf[LabeledPoint] assert(q.label === p.label) assert(q.features.getClass === p.features.getClass) assert(q.features === p.features) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = SerDe.loads(SerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = SerDe.loads(SerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = SerDe.loads(SerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = SerDe.loads(SerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = SerDe.loads(SerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } test("pickle rating") { val rat = new Rating(1, 2, 3.0) val rat2 = SerDe.loads(SerDe.dumps(rat)).asInstanceOf[Rating] assert(rat == rat2) // Test name of class only occur once val rats = (1 to 10).map(x => new Rating(x, x + 1, x + 3.0)).toArray val bytes = SerDe.dumps(rats) assert(bytes.toString.split("Rating").length == 1) assert(bytes.length / 10 < 25) // 25 bytes per rating } }
Example 157
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.distribution import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.mllib.util.TestingUtils._ class MultivariateGaussianSuite extends SparkFunSuite with MLlibTestSparkContext { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 158
Source File: KMeansPMMLModelExportSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.dmg.pmml.ClusteringModel import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors class KMeansPMMLModelExportSuite extends SparkFunSuite { test("KMeansPMMLModelExport generate PMML format") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) // assert that the PMML format is as expected assert(modelExport.isInstanceOf[PMMLModelExport]) val pmml = modelExport.asInstanceOf[PMMLModelExport].getPmml assert(pmml.getHeader.getDescription === "k-means clustering") // check that the number of fields match the single vector size assert(pmml.getDataDictionary.getNumberOfFields === clusterCenters(0).size) // This verify that there is a model attached to the pmml object and the model is a clustering // one. It also verifies that the pmml model has the same number of clusters of the spark model. val pmmlClusteringModel = pmml.getModels.get(0).asInstanceOf[ClusteringModel] assert(pmmlClusteringModel.getNumberOfClusters === clusterCenters.length) } }
Example 159
Source File: PMMLModelExportFactorySuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.pmml.export import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.classification.{LogisticRegressionModel, SVMModel} import org.apache.spark.mllib.clustering.KMeansModel import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LassoModel, LinearRegressionModel, RidgeRegressionModel} import org.apache.spark.mllib.util.LinearDataGenerator class PMMLModelExportFactorySuite extends SparkFunSuite { test("PMMLModelExportFactory create KMeansPMMLModelExport when passing a KMeansModel") { val clusterCenters = Array( Vectors.dense(1.0, 2.0, 6.0), Vectors.dense(1.0, 3.0, 0.0), Vectors.dense(1.0, 4.0, 6.0)) val kmeansModel = new KMeansModel(clusterCenters) val modelExport = PMMLModelExportFactory.createPMMLModelExport(kmeansModel) assert(modelExport.isInstanceOf[KMeansPMMLModelExport]) } test("PMMLModelExportFactory create GeneralizedLinearPMMLModelExport when passing a " + "LinearRegressionModel, RidgeRegressionModel or LassoModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val linearRegressionModel = new LinearRegressionModel(linearInput(0).features, linearInput(0).label) val linearModelExport = PMMLModelExportFactory.createPMMLModelExport(linearRegressionModel) assert(linearModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val ridgeRegressionModel = new RidgeRegressionModel(linearInput(0).features, linearInput(0).label) val ridgeModelExport = PMMLModelExportFactory.createPMMLModelExport(ridgeRegressionModel) assert(ridgeModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) val lassoModel = new LassoModel(linearInput(0).features, linearInput(0).label) val lassoModelExport = PMMLModelExportFactory.createPMMLModelExport(lassoModel) assert(lassoModelExport.isInstanceOf[GeneralizedLinearPMMLModelExport]) } test("PMMLModelExportFactory create BinaryClassificationPMMLModelExport " + "when passing a LogisticRegressionModel or SVMModel") { val linearInput = LinearDataGenerator.generateLinearInput(3.0, Array(10.0, 10.0), 1, 17) val logisticRegressionModel = new LogisticRegressionModel(linearInput(0).features, linearInput(0).label) val logisticRegressionModelExport = PMMLModelExportFactory.createPMMLModelExport(logisticRegressionModel) assert(logisticRegressionModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) val svmModel = new SVMModel(linearInput(0).features, linearInput(0).label) val svmModelExport = PMMLModelExportFactory.createPMMLModelExport(svmModel) assert(svmModelExport.isInstanceOf[BinaryClassificationPMMLModelExport]) } test("PMMLModelExportFactory throw IllegalArgumentException " + "when passing a Multinomial Logistic Regression") { val multiclassLogisticRegressionModel = new LogisticRegressionModel( weights = Vectors.dense(0.1, 0.2, 0.3, 0.4), intercept = 1.0, numFeatures = 2, numClasses = 3) intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(multiclassLogisticRegressionModel) } } test("PMMLModelExportFactory throw IllegalArgumentException when passing an unsupported model") { val invalidModel = new Object intercept[IllegalArgumentException] { PMMLModelExportFactory.createPMMLModelExport(invalidModel) } } }
Example 160
Source File: CoordinateMatrixSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class CoordinateMatrixSuite extends SparkFunSuite with MLlibTestSparkContext { val m = 5 val n = 4 var mat: CoordinateMatrix = _ override def beforeAll() { super.beforeAll() val entries = sc.parallelize(Seq( (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0), (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map { case (i, j, value) => MatrixEntry(i, j, value) } mat = new CoordinateMatrix(entries) } test("size") { assert(mat.numRows() === m) assert(mat.numCols() === n) } test("empty entries") { val entries = sc.parallelize(Seq[MatrixEntry](), 1) val emptyMat = new CoordinateMatrix(entries) intercept[RuntimeException] { emptyMat.numCols() } intercept[RuntimeException] { emptyMat.numRows() } } test("toBreeze") { val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(mat.toBreeze() === expected) } test("transpose") { val transposed = mat.transpose() assert(mat.toBreeze().t === transposed.toBreeze()) } test("toIndexedRowMatrix") { val indexedRowMatrix = mat.toIndexedRowMatrix() val expected = BDM( (1.0, 2.0, 0.0, 0.0), (0.0, 3.0, 4.0, 0.0), (0.0, 0.0, 5.0, 6.0), (7.0, 0.0, 0.0, 8.0), (0.0, 9.0, 0.0, 0.0)) assert(indexedRowMatrix.toBreeze() === expected) } test("toRowMatrix") { val rowMatrix = mat.toRowMatrix() val rows = rowMatrix.rows.collect().toSet val expected = Set( Vectors.dense(1.0, 2.0, 0.0, 0.0), Vectors.dense(0.0, 3.0, 4.0, 0.0), Vectors.dense(0.0, 0.0, 5.0, 6.0), Vectors.dense(7.0, 0.0, 0.0, 8.0), Vectors.dense(0.0, 9.0, 0.0, 0.0)) assert(rows === expected) } test("toBlockMatrix") { val blockMat = mat.toBlockMatrix(2, 2) assert(blockMat.numRows() === m) assert(blockMat.numCols() === n) assert(blockMat.toBreeze() === mat.toBreeze()) intercept[IllegalArgumentException] { mat.toBlockMatrix(-1, 2) } intercept[IllegalArgumentException] { mat.toBlockMatrix(2, 0) } } }
Example 161
Source File: LabeledPointSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors class LabeledPointSuite extends SparkFunSuite { test("parse labeled points") { val points = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) points.foreach { p => assert(p === LabeledPoint.parse(p.toString)) } } test("parse labeled points with whitespaces") { val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) } test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) } test("conversions between new ml LabeledPoint and mllib LabeledPoint") { val points: Seq[LabeledPoint] = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) val newPoints: Seq[NewLabeledPoint] = points.map(_.asML) points.zip(newPoints).foreach { case (p1, p2) => assert(p1 === LabeledPoint.fromML(p2)) } } }
Example 162
Source File: RidgeRegressionSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.regression import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.{LinearDataGenerator, LocalClusterSparkContext, MLlibTestSparkContext} import org.apache.spark.util.Utils private object RidgeRegressionSuite { val model = new RidgeRegressionModel(weights = Vectors.dense(0.1, 0.2, 0.3), intercept = 0.5) } class RidgeRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { def predictionError(predictions: Seq[Double], input: Seq[LabeledPoint]): Double = { predictions.zip(input).map { case (prediction, expected) => (prediction - expected.label) * (prediction - expected.label) }.sum / predictions.size } test("ridge regression can help avoid overfitting") { // For small number of examples and large variance of error distribution, // ridge regression should give smaller generalization error that linear regression. val numExamples = 50 val numFeatures = 20 // Pick weights as random values distributed uniformly in [-0.5, 0.5] val random = new Random(42) val w = Array.fill(numFeatures)(random.nextDouble() - 0.5) // Use half of data for training and other half for validation val data = LinearDataGenerator.generateLinearInput(3.0, w, 2 * numExamples, 42, 10.0) val testData = data.take(numExamples) val validationData = data.takeRight(numExamples) val testRDD = sc.parallelize(testData, 2).cache() val validationRDD = sc.parallelize(validationData, 2).cache() // First run without regularization. val linearReg = new LinearRegressionWithSGD() linearReg.optimizer.setNumIterations(200) .setStepSize(1.0) val linearModel = linearReg.run(testRDD) val linearErr = predictionError( linearModel.predict(validationRDD.map(_.features)).collect(), validationData) val ridgeReg = new RidgeRegressionWithSGD() ridgeReg.optimizer.setNumIterations(200) .setRegParam(0.1) .setStepSize(1.0) val ridgeModel = ridgeReg.run(testRDD) val ridgeErr = predictionError( ridgeModel.predict(validationRDD.map(_.features)).collect(), validationData) // Ridge validation error should be lower than linear regression. assert(ridgeErr < linearErr, "ridgeError (" + ridgeErr + ") was not less than linearError(" + linearErr + ")") } test("model save/load") { val model = RidgeRegressionSuite.model val tempDir = Utils.createTempDir() val path = tempDir.toURI.toString // Save model, load it back, and compare. try { model.save(sc, path) val sameModel = RidgeRegressionModel.load(sc, path) assert(model.weights == sameModel.weights) assert(model.intercept == sameModel.intercept) } finally { Utils.deleteRecursively(tempDir) } } } class RidgeRegressionClusterSuite extends SparkFunSuite with LocalClusterSparkContext { test("task size should be small in both training and prediction") { val m = 4 val n = 200000 val points = sc.parallelize(0 until m, 2).mapPartitionsWithIndex { (idx, iter) => val random = new Random(idx) iter.map(i => LabeledPoint(1.0, Vectors.dense(Array.fill(n)(random.nextDouble())))) }.cache() // If we serialize data directly in the task closure, the size of the serialized task would be // greater than 1MB and hence Spark would throw an error. val model = RidgeRegressionWithSGD.train(points, 2) val predictions = model.predict(points.map(_.features)) } }
Example 163
Source File: HoltWintersBestModelEvaluation.scala From uberdata with Apache License 2.0 | 5 votes |
package org.apache.spark.ml import com.cloudera.sparkts.models.UberHoltWintersModel import eleflow.uberdata.enums.SupportedAlgorithm import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.evaluation.TimeSeriesEvaluator import org.apache.spark.ml.param.{ParamMap, ParamPair} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.sql.Row import scala.reflect.ClassTag abstract class HoltWintersBestModelEvaluation[L, M <: ForecastBaseModel[M]]( implicit kt: ClassTag[L], ord: Ordering[L] = null ) extends BestModelFinder[L, M] with HoltWintersParams { protected def holtWintersEvaluation( row: Row, model: UberHoltWintersModel, broadcastEvaluator: Broadcast[TimeSeriesEvaluator[L]], id: L ): (UberHoltWintersModel, ModelParamEvaluation[L]) = { val features = row.getAs[org.apache.spark.ml.linalg.Vector]($(featuresCol)) log.warn( s"Evaluating forecast for id $id, with parameters " + s"alpha ${model.alpha}, beta ${model.beta} and gamma ${model.gamma}" ) val expectedResult = row.getAs[org.apache.spark.ml.linalg.Vector](partialValidationCol) val forecastToBeValidated = Vectors.dense(new Array[Double]($(nFutures))) model.forecast(org.apache.spark.mllib.linalg.Vectors.fromML(features), forecastToBeValidated).toArray val toBeValidated = expectedResult.toArray.zip(forecastToBeValidated.toArray) val metric = broadcastEvaluator.value.evaluate(toBeValidated) val metricName = broadcastEvaluator.value.getMetricName val params = ParamMap().put( ParamPair(gamma, model.gamma), ParamPair(beta, model.beta), ParamPair(alpha, model.alpha) ) (model, new ModelParamEvaluation[L]( id, metric, params, Some(metricName), SupportedAlgorithm.HoltWinters )) } }
Example 164
Source File: QuadraticRenyiEntropy.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.prototype import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import io.github.mandar2812.dynaml.kernels.DensityKernel override def entropy(data: List[DenseVector[Double]]): Double = { val dim = data.head.length val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) val product = for(i <- data.view; j <- data.view) yield (i, j) -1*log_e(product.map((couple) => { val point1: DenseVector[Double] = couple._1 / sqrt(2.0) val point2: DenseVector[Double] = couple._2 / sqrt(2.0) density.eval(point1 - point2) }).sum) } override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { val dim = data.first()._2.features.size -1*log_e(data.cartesian(data).map((couple) =>{ val point1: DenseVector[Double] = DenseVector(couple._1._2.features.toArray) / sqrt(2.0) val point2: DenseVector[Double] = DenseVector(couple._2._2.features.toArray) / sqrt(2.0) density.eval(point1 - point2) }).reduce((a,b) => a + b)) } def entropyDifference(entropy: Double, data: List[DenseVector[Double]], add: DenseVector[Double], remove: DenseVector[Double]): Double = { val dim = data.head.length val expEntropy = math.exp(-1.0*entropy) val product1 = for(i <- data.view) yield (remove, i) val subtractEnt = 2*product1.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - density.eval(DenseVector.zeros(dim)) val product2 = for(i <- data.view) yield (add, i) val addEnt = 2*product2.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - 2*density.eval((add - remove) / sqrt(2.0)) + density.eval(DenseVector.zeros(dim)) -1.0*log_e(expEntropy + addEnt - subtractEnt) - entropy } }
Example 165
Source File: SparkLogisticGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm //Breeze Imports import breeze.linalg.DenseVector import breeze.numerics.sigmoid import breeze.stats.distributions.Gaussian import io.github.mandar2812.dynaml.optimization.ProbitGradient import org.apache.spark.mllib.linalg.Vectors //DynaML Imports import io.github.mandar2812.dynaml.optimization.{ GradientDescentSpark, LogisticGradient, RegularizedOptimizer, SquaredL2Updater} //Spark Imports import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD class SparkProbitGLM( data: RDD[(DenseVector[Double], Double)], numPoints: Long, map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]]) extends SparkLogisticGLM(data, numPoints, map) { private val standardGaussian = new Gaussian(0, 1.0) override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x) override protected val optimizer: RegularizedOptimizer[ DenseVector[Double], DenseVector[Double], Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater) }
Example 166
Source File: Test_example_CNN.scala From SparkMLlibDeepLearn with Apache License 2.0 | 5 votes |
package tests import org.apache.log4j.{ Level, Logger } import org.apache.spark.{ SparkConf, SparkContext } import org.apache.spark.storage.StorageLevel import org.apache.spark.mllib.util.MLUtils import org.apache.spark.mllib.linalg.{ Vector, Vectors } import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.regression.LabeledPoint import breeze.linalg.{ Matrix => BM, CSCMatrix => BSM, DenseMatrix => BDM, Vector => BV, DenseVector => BDV, SparseVector => BSV, axpy => brzAxpy, svd => brzSvd, max => Bmax, min => Bmin, sum => Bsum } import scala.collection.mutable.ArrayBuffer import CNN.CNN object Test_example_CNN { def main(args: Array[String]) { //1 Set up the Spark context val conf = new SparkConf().setAppName("CNNtest") val sc = new SparkContext(conf) //2 Load the sample data Logger.getRootLogger.setLevel(Level.WARN) val data_path = "/deeplearn/train_d3.txt" val examples = sc.textFile(data_path).cache() val train_d1 = examples.map { line => val f1 = line.split("\t") val f = f1.map(f => f.toDouble) val y = f.slice(0, 10) val x = f.slice(10, f.length) (new BDM(1, y.length, y), (new BDM(1, x.length, x)).reshape(28, 28) / 255.0) } val train_d = train_d1.map(f => (f._1, f._2)) //3 Set the training parameters and build the model // opts: training options passed to CNNtrain (batch size, number of epochs, validation fraction) val opts = Array(50.0, 1.0, 0.0) train_d.cache val numExamples = train_d.count() println(s"numExamples = $numExamples.") val CNNmodel = new CNN(). setMapsize(new BDM(1, 2, Array(28.0, 28.0))). setTypes(Array("i", "c", "s", "c", "s")). setLayer(5). setOnum(10). setOutputmaps(Array(0.0, 6.0, 0.0, 12.0, 0.0)). setKernelsize(Array(0.0, 5.0, 0.0, 5.0, 0.0)). setScale(Array(0.0, 0.0, 2.0, 0.0, 2.0)). setAlpha(1.0). CNNtrain(train_d, opts) //4 Test the model val CNNforecast = CNNmodel.predict(train_d) val CNNerror = CNNmodel.Loss(CNNforecast) println(s"NNerror = $CNNerror.") val printf1 = CNNforecast.map(f => (f.label.data, f.predict_label.data)).take(200) println("Predicted values") for (i <- 0 until printf1.length) { val outi = printf1(i)._2.mkString("\t") println(outi) } } }
Example 167
Source File: NegativeCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics import org.apache.spark.sql.SparkSession object NegativeCorrelationExample { def getSparkSession(): SparkSession = { val spark = SparkSession.builder().master("local").getOrCreate() spark.sparkContext.setLogLevel("ERROR") spark } def main(args: Array[String]): Unit = { val spark = getSparkSession() val data = spark.sparkContext.parallelize( Seq( Vectors.dense(0.0, 1.0, 100.0), Vectors.dense(-10.0, 10.0, 200.0), Vectors.dense(-20.0, 100.0, 300.0), Vectors.dense(-30.0, 1000.0, 400.0), Vectors.dense(-40.0, 10000.0, 500.0), Vectors.dense(-50.0, 100000.0, 600.0), Vectors.dense(-60.0, 1000000.0, 700.0), Vectors.dense(-70.0, 10000000.0, 800.0), Vectors.dense(-80.0, 100000000.0, 900.0), Vectors.dense(-90.0, 1000000000.0, 1000.0) ) ) val corr = Statistics.corr(data) println(s"Correlation:\n${corr}") spark.stop() } }
Example 168
Source File: PositiveCorrelationExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.sql.SparkSession import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.Statistics object PositiveCorrelationExample { def getSparkSession(): SparkSession = { val spark = SparkSession.builder().master("local").getOrCreate() spark.sparkContext.setLogLevel("ERROR") spark } def main(args: Array[String]): Unit = { val spark = getSparkSession() val data = spark.sparkContext.parallelize( Seq( Vectors.dense(0.0, 1.0, 100.0), Vectors.dense(10.0, 10.0, 200.0), Vectors.dense(20.0, 100.0, 300.0), Vectors.dense(30.0, 1000.0, 400.0), Vectors.dense(40.0, 10000.0, 500.0), Vectors.dense(50.0, 100000.0, 600.0), Vectors.dense(60.0, 1000000.0, 700.0), Vectors.dense(70.0, 10000000.0, 800.0), Vectors.dense(80.0, 100000000.0, 900.0), Vectors.dense(90.0, 1000000000.0, 1000.0) ) ) val summary = Statistics.colStats(data) // Compute column summary statistics println( s"""Summary: ${summary.count} // number of records ${summary.mean} // mean value for each column ${summary.min} // column-wise min ${summary.max} // column-wise max ${summary.normL1} // column-wise norm L1 ${summary.normL2} // column-wise Euclidean magnitude ${summary.variance} // column-wise variance ${summary.numNonzeros} // column-wise count of non-zero values """.stripMargin) val corr = Statistics.corr(data) println(s"Correlation:\n${corr}") spark.stop() } }
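Examples 167 and 168 compute a full correlation matrix over an RDD of vectors. Statistics.corr can also correlate two RDD[Double] series directly and accepts a method name; a minimal sketch under those assumptions (object name and data are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.mllib.stat.Statistics

object SeriesCorrelationSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    // Two series of equal length
    val x = spark.sparkContext.parallelize(Seq(1.0, 2.0, 3.0, 4.0, 5.0))
    val y = spark.sparkContext.parallelize(Seq(10.0, 8.0, 6.0, 4.0, 2.0))

    val pearson = Statistics.corr(x, y)              // default method is "pearson"
    val spearman = Statistics.corr(x, y, "spearman")
    println(s"Pearson: $pearson, Spearman: $spearman")

    spark.stop()
  }
}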
Example 169
Source File: HandsOnKMeanStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.clustering.StreamingKMeans object HandsOnKMeanStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnKMeanStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val model = new StreamingKMeans(). setK(4). // number of clusters is 4 setDecayFactor(1.0). // decay factor (the forgetfulness of the previous centroids) setRandomCenters(3, 0.0) // 3 dimensions and 0 weight import org.apache.spark.mllib.linalg.Vectors val trainingData = ssc.textFileStream("file:/tmp/k-means-train-data").map(Vectors.parse).cache() trainingData.print() import org.apache.spark.mllib.regression.LabeledPoint val testData = ssc.textFileStream("file:/tmp/k-means-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
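The streaming k-means example reads its training stream with Vectors.parse and its test stream with LabeledPoint.parse, so the files dropped into the watched directories must match those string formats. A small local sketch of what the two parsers accept (the literal strings are illustrative):

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint

object StreamingInputFormatSketch {
  def main(args: Array[String]): Unit = {
    // Training files for StreamingKMeans: one dense vector per line, e.g. "[1.0,2.0,3.0]"
    val v = Vectors.parse("[1.0,2.0,3.0]")
    // Test files: one labelled point per line, e.g. "(1.0,[1.0,2.0,3.0])"
    val lp = LabeledPoint.parse("(1.0,[1.0,2.0,3.0])")
    println(v)
    println(s"${lp.label} -> ${lp.features}")
  }
}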
Example 170
Source File: HandsOnLinRegStreaming.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark._ import org.apache.spark.streaming._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD object HandsOnLinRegStreaming { def main(args: Array[String]): Unit = { val conf = new SparkConf().setMaster("local[2]").setAppName("HandsOnLinRegStreaming") val ssc = new StreamingContext(conf, Seconds(10)) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD().setInitialWeights(Vectors.zeros(numFeatures)) val trainingData = ssc.textFileStream("file:/tmp/lin-reg-train-data").map(LabeledPoint.parse).cache() trainingData.print() // output training data for debug purpose val testData = ssc.textFileStream("file:/tmp/lin-reg-test-data").map(LabeledPoint.parse) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTerminationOrTimeout(1000*60*3) // Wait for the computation to terminate (3 minutes) } }
Example 171
Source File: LinearRegExample.scala From Hands-On-Data-Analysis-with-Scala with MIT License | 5 votes |
package handson.example import org.apache.spark.sql.SparkSession object LinearRegExample { val homeDir = System.getProperty("user.home") def main(args: Array[String]): Unit = { // 1. Set Spark session val spark = SparkSession.builder().master("local").getOrCreate() // 2. Set logging level to WARNING spark.sparkContext.setLogLevel("WARN") // 3. Import necessary classes from Spark MLLib package that are needed for linear regression import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // 4. Load the data val data = spark.sparkContext.textFile(s"${homeDir}/lpsa.data") // 5. Parse the data into LabeledPoint and cache val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // 6. Build the model by setting number of iterations, step size val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // 7. Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println(s"training Mean Squared Error $MSE") // 8. Save the model model.save(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 9. Load the saved model val sameModel = LinearRegressionModel.load(spark.sparkContext, s"${homeDir}/LinearRegressionWithSGDModel") // 10. Output the model println(sameModel) } }
Example 172
Source File: MnistExample.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{KMeans, ScalableKMeans, SparseKMeans} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.sql.SparkSession object MnistExample { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val spark = SparkSession.builder.appName("svm").master("local[8]").getOrCreate() val trainRDD = spark.sparkContext.textFile("data/mnist/mnist_train.csv", 8) .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => Vectors.dense(arr.slice(1, 785))) val model = new KMeans() .setK(10) .setInitializationMode("random") .setMaxIterations(10) .run(trainRDD) println("final clusters:") println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) } }
Example 173
Source File: KMeanTest.scala From SparseML with Apache License 2.0 | 5 votes |
import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.clustering.{ScalableKMeans, KMeans} import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.{SparseVector, Vectors, Vector} import scala.util.Random //spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 20g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 1000000 100 0.1 1 my 9 //guale spark/bin/spark-submit --master spark://10.100.34.48:7077 --class ScalableKMeanTest --executor-memory 5g --executor-cores 1 --driver-memory 24g --conf spark.driver.maxResultSize=8g --conf spark.akka.frameSize=1024 unnamed.jar 50 5000000 100 0.1 1 my 15 object ScalableKMeanTest { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setAppName(s"kmeans: ${args.mkString(",")}") val sc = new SparkContext(conf) val k = args(0).toInt val dimension = args(1).toInt val recordNum = args(2).toInt val sparsity = args(3).toDouble val iterations = args(4).toInt val means = args(5) val parNumber = args(6).toInt val data: RDD[Vector] = sc.parallelize(1 to recordNum, parNumber).map(i => { val ran = new Random() val indexArr = ran.shuffle((0 until dimension).toList).take((dimension * sparsity).toInt).sorted.toArray val valueArr = (1 to (dimension * sparsity).toInt).map(in => ran.nextDouble()).sorted.toArray val vec: Vector = new SparseVector(dimension, indexArr, valueArr) vec }).cache() println(args.mkString(", ")) println(data.count() + " records generated") val st = System.nanoTime() val model = if(means == "my") { println("running scalable kmeans") val model = new ScalableKMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } else { println("running mllib kmeans") val model = new KMeans() .setK(k) .setInitializationMode("random") .setMaxIterations(iterations) .run(data) model } println((System.nanoTime() - st) / 1e9 + " seconds cost") println("final clusters: " + model.clusterCenters.length) println(model.clusterCenters.map(v => v.numNonzeros).mkString("\n")) sc.stop() } }
Example 174
Source File: lda-script.scala From practical-data-science-with-hadoop-and-spark with Apache License 2.0 | 5 votes |
import collection.JavaConversions._ import scala.collection.mutable import opennlp.tools.tokenize.SimpleTokenizer import opennlp.tools.stemmer.PorterStemmer import org.apache.spark.rdd._ import org.apache.spark.mllib.clustering.{OnlineLDAOptimizer, DistributedLDAModel, LDA} import org.apache.spark.mllib.linalg.{Vector, SparseVector, Vectors} import org.apache.spark.mllib.feature.IDF // add openNLP jar to the Spark Context sc.addJar("opennlp-tools-1.6.0.jar") // Load documents from text files, 1 element (text string) per file val corpus = sc.wholeTextFiles("ohsumed/C*", 20).map(x => x._2) // read stop words from file val stopwordFile = "stop-words.txt" val st_words = sc.textFile(stopwordFile).collect() .flatMap(_.stripMargin.split("\\s+")).map(_.toLowerCase).toSet val stopwords = sc.broadcast(st_words) val minWordLength = 3 val tokenized: RDD[(Long, Array[String])] = corpus.zipWithIndex().map { case (text,id) => val tokenizer = SimpleTokenizer.INSTANCE val stemmer = new PorterStemmer() val tokens = tokenizer.tokenize(text) val words = tokens.filter(w => (w.length >= minWordLength) && (!stopwords.value.contains(w))) .map(w => stemmer.stem(w)) id -> words }.filter(_._2.length > 0) tokenized.cache() val numDocs = tokenized.count() val wordCounts: RDD[(String, Long)] = tokenized.flatMap { case (_, tokens) => tokens.map(_ -> 1L) }.reduceByKey(_ + _) wordCounts.cache() val fullVocabSize = wordCounts.count() val vSize = 10000 val (vocab: Map[String, Int], selectedTokenCount: Long) = { val sortedWC: Array[(String,Long)] = {wordCounts.sortBy(_._2, ascending=false) .take(vSize)} (sortedWC.map(_._1).zipWithIndex.toMap, sortedWC.map(_._2).sum) } val documents = tokenized.map { case (id, tokens) => // Filter tokens by vocabulary, and create word count vector representation of document. val wc = new mutable.HashMap[Int, Int]() tokens.foreach { term => if (vocab.contains(term)) { val termIndex = vocab(term) wc(termIndex) = wc.getOrElse(termIndex, 0) + 1 } } val indices = wc.keys.toArray.sorted val values = indices.map(i => wc(i).toDouble) val sb = Vectors.sparse(vocab.size, indices, values) (id, sb) } val vocabArray = new Array[String](vocab.size) vocab.foreach { case (term, i) => vocabArray(i) = term } val tf = documents.map { case (id, vec) => vec }.cache() val idfVals = new IDF().fit(tf).idf.toArray val tfidfDocs: RDD[(Long, Vector)] = documents.map { case (id, vec) => val indices = vec.asInstanceOf[SparseVector].indices val counts = new mutable.HashMap[Int, Double]() for (idx <- indices) { counts(idx) = vec(idx) * idfVals(idx) } (id, Vectors.sparse(vocab.size, counts.toSeq)) } val numTopics = 5 val numIterations = 50 val lda = new LDA().setK(numTopics).setMaxIterations(numIterations).setOptimizer("online") val ldaModel = lda.run(tfidfDocs) val topicIndices = ldaModel.describeTopics(maxTermsPerTopic = 5) topicIndices.foreach { case (terms, termWeights) => println("TOPIC:") terms.zip(termWeights).foreach { case (term, weight) => println(s"${vocabArray(term.toInt)}\t$weight") } println() }
Example 175
Source File: MatrixUtilSuite.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts import com.cloudera.sparkts.MatrixUtil._ import org.apache.spark.mllib.linalg.{Matrices, Vectors} import org.scalatest._ class MatrixUtilSuite extends FunSuite with ShouldMatchers { test("modifying toBreeze version modifies original tensor") { val vec = Vectors.dense(1.0, 2.0, 3.0) val breezeVec = toBreeze(vec) breezeVec(1) = 4.0 vec(1) should be (4.0) val mat = Matrices.zeros(3, 4) val breezeMat = toBreeze(mat) breezeMat(0, 1) = 2.0 mat(0, 1) should be (2.0) } }
Example 176
Source File: LocalDBSCANArcherySuite.scala From dbscan-on-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.clustering.dbscan import java.net.URI import scala.io.Source import org.scalatest.FunSuite import org.scalatest.Matchers import org.apache.spark.mllib.linalg.Vectors class LocalDBSCANArcherySuite extends FunSuite with Matchers { private val dataFile = "labeled_data.csv" test("should cluster") { val labeled: Map[DBSCANPoint, Double] = new LocalDBSCANArchery(eps = 0.3F, minPoints = 10) .fit(getRawData(dataFile)) .map(l => (l, l.cluster.toDouble)) .toMap val expected: Map[DBSCANPoint, Double] = getExpectedData(dataFile).toMap labeled.foreach { case (key, value) => { val t = expected(key) if (t != value) { println(s"expected: $t but got $value for $key") } } } labeled should equal(expected) } def getExpectedData(file: String): Iterator[(DBSCANPoint, Double)] = { Source .fromFile(getFile(file)) .getLines() .map(s => { val vector = Vectors.dense(s.split(',').map(_.toDouble)) val point = DBSCANPoint(vector) (point, vector(2)) }) } def getRawData(file: String): Iterable[DBSCANPoint] = { Source .fromFile(getFile(file)) .getLines() .map(s => DBSCANPoint(Vectors.dense(s.split(',').map(_.toDouble)))) .toIterable } def getFile(filename: String): URI = { getClass.getClassLoader.getResource(filename).toURI } }
Example 177
Source File: printMatrix.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.{SparkContext, SparkConf} import breeze.linalg.{DenseMatrix => BDM, kron} object printMatrix { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0)))(0))) val lines2 = sc.textFile("dataset/train.format", 8) val data2 = lines2.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Example.Vector2Tensor(Vectors.dense(arr.slice(0, 784)))(0))) data2.take(10).foreach(record =>{ println("label: " + record._1) val intm = new BDM[Int](28, 28, record._2.toArray.map(d => d.toInt)) val str = intm.toString(1000, 1000).replace('0', '.').replace('1', '*') println(str) }) } }
Example 178
Source File: Example.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.{SparkConf, SparkContext} import breeze.linalg.{DenseMatrix => BDM, _} object Example { def main(args: Array[String]) { Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => (arr(784), Vector2Tensor(Vectors.dense(arr.slice(0, 784))))) val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(1, 6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(6, 12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildMeanPoolingLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvolutionLayer(12, 12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(5).setMiniBatchSize(16) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) val right = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Predicting precision: $right " + right.toDouble/(data.count())) // val testData = sc.textFile("dataset/mnist/mnist_test.csv", 8) // .map(line => line.split(",")).map(arr => arr.map(_.toDouble)) // .map(arr => (arr(0), Example.Vector2Tensor(Vectors.dense(arr.slice(1, 785).map(v => if(v > 200) 1.0 else 0))))) val rightM = data.map(record =>{ val result = cnn.predict(record._2) if(result == record._1) 1 else 0 }).sum() println(s"Mnist Full Predicting precision: $rightM " + rightM.toDouble/(data.count())) } def Vector2Tensor(record: Vector): Array[BDM[Double]] = { val mapSize = new Scale(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } Array(m) } }
Example 179
Source File: Driver.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import org.apache.log4j.{Logger, Level} import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.{SparkContext, SparkConf} object CNNDriver { def main(args: Array[String]) { val myLayers = new Array[Layer](8) myLayers(0) = new ConvolutionalLayer(1, 6, kernelSize = new MapSize(5, 5), inputMapSize = new MapSize(28, 28)) myLayers(1) = new FunctionalLayer(new SigmoidFunction()) myLayers(2) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(24, 24)) myLayers(3) = new ConvolutionalLayer(6, 12, new MapSize(5, 5), new MapSize(12, 12)) myLayers(4) = new FunctionalLayer(new SigmoidFunction()) myLayers(5) = new MeanPoolingLayer(new MapSize(2, 2), new MapSize(8, 8)) myLayers(6) = new ConvolutionalLayer(12, 12, new MapSize(4, 4), new MapSize(4, 4)) myLayers(7) = new FunctionalLayer(new SigmoidFunction()) val topology = FeedForwardTopology(myLayers) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/train.format", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => { val target = new Array[Double](12) target(arr(784).toInt) = 1 val in = Vector2BDM(Vectors.dense(arr.slice(0, 784))) (Vectors.fromBreeze(in.toDenseVector), Vectors.dense(target)) }).cache() val feedForwardTrainer = new FeedForwardTrainer(topology, 784, 12) feedForwardTrainer.setStackSize(4) // CNN does not benefit from the stacked data // .LBFGSOptimizer.setNumIterations(20) .SGDOptimizer .setMiniBatchFraction(0.002) .setConvergenceTol(0) .setNumIterations(1000) .setUpdater(new CNNUpdater(0.85)) for(iter <- 1 to 1000){ val start = System.nanoTime() val mlpModel = feedForwardTrainer.train(data) feedForwardTrainer.setWeights(mlpModel.weights()) println(s"Training time $iter: " + (System.nanoTime() - start) / 1e9) // predict val right = data.filter(v => mlpModel.predict(v._1).argmax == v._2.argmax).count() val precision = right.toDouble / data.count() println(s"right: $right, count: ${data.count()}, precision: $precision") } } def Vector2BDM(record: Vector): BDM[Double] = { val mapSize = new MapSize(28, 28) val m = new BDM[Double](mapSize.x, mapSize.y) var i: Int = 0 while (i < mapSize.x) { var j: Int = 0 while (j < mapSize.y) { m(i, j) = record(mapSize.x * i + j) j += 1 } i += 1 } m } }
Example 180
Source File: CNNUpdater.scala From mCNN with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.ann import breeze.linalg.{*, DenseMatrix => BDM, DenseVector => BDV, Vector => BV, axpy => Baxpy, sum => Bsum} import org.apache.spark.mllib.linalg.{Vectors, Vector} import org.apache.spark.mllib.optimization.Updater private[ann] class CNNUpdater(alpha: Double) extends Updater { override def compute( weightsOld: Vector, gradient: Vector, stepSize: Double, iter: Int, regParam: Double): (Vector, Double) = { val thisIterStepSize = stepSize val brzWeights: BV[Double] = weightsOld.toBreeze.toDenseVector Baxpy(-thisIterStepSize, gradient.toBreeze * alpha, brzWeights) (Vectors.fromBreeze(brzWeights), 0) } }
Example 181
Source File: MnistCSVDriver.scala From mCNN with Apache License 2.0 | 5 votes |
package hhbyyh.mCNN import org.apache.log4j.{Level, Logger} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.{SparkConf, SparkContext} object MnistCSVDriver { def main(args: Array[String]) { val topology = new CNNTopology topology.addLayer(CNNLayer.buildConvolutionLayer(new Scale(28, 28))) topology.addLayer(CNNLayer.buildConvLayer(6, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(5, 5))) topology.addLayer(CNNLayer.buildSampLayer(new Scale(2, 2))) topology.addLayer(CNNLayer.buildConvLayer(12, new Scale(4, 4))) val cnn: CNN = new CNN(topology).setMaxIterations(500000).setMiniBatchSize(16) Logger.getLogger("org").setLevel(Level.WARN) Logger.getLogger("akka").setLevel(Level.WARN) val conf = new SparkConf().setMaster("local[8]").setAppName("ttt") val sc = new SparkContext(conf) val lines = sc.textFile("dataset/mnist/mnist_train.csv", 8) val data = lines.map(line => line.split(",")).map(arr => arr.map(_.toDouble)) .map(arr => new LabeledPoint(arr(0), Vectors.dense(arr.slice(1, 785).map(v => if(v > 0) 1.0 else 0)))) val start = System.nanoTime() cnn.trainOneByOne(data) println("Training time: " + (System.nanoTime() - start) / 1e9) } }
Example 182
Source File: RichIndexedRowMatrixSuite.scala From hail with MIT License | 5 votes |
package is.hail.utils import breeze.linalg.{DenseMatrix => BDM, _} import is.hail.{HailSuite, TestUtils} import is.hail.linalg.BlockMatrix import is.hail.linalg.BlockMatrix.ops._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{DistributedMatrix, IndexedRow, IndexedRowMatrix} import org.apache.spark.rdd.RDD import org.testng.annotations.Test class RichIndexedRowMatrixSuite extends HailSuite { private def convertDistributedMatrixToBreeze(sparkMatrix: DistributedMatrix): Matrix[Double] = { val breezeConverter = sparkMatrix.getClass.getMethod("toBreeze") breezeConverter.invoke(sparkMatrix).asInstanceOf[Matrix[Double]] } @Test def testToBlockMatrixDense() { val nRows = 9L val nCols = 6L val data = Seq( (0L, Vectors.dense(0.0, 1.0, 2.0, 1.0, 3.0, 4.0)), (1L, Vectors.dense(3.0, 4.0, 5.0, 1.0, 1.0, 1.0)), (3L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (4L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (5L, Vectors.dense(9.0, 0.0, 1.0, 1.0, 1.0, 1.0)), (6L, Vectors.dense(1.0, 2.0, 3.0, 1.0, 1.0, 1.0)), (7L, Vectors.dense(4.0, 5.0, 6.0, 1.0, 1.0, 1.0)), (8L, Vectors.dense(7.0, 8.0, 9.0, 1.0, 1.0, 1.0)) ).map(IndexedRow.tupled) val indexedRows: RDD[IndexedRow] = sc.parallelize(data) val irm = new IndexedRowMatrix(indexedRows) for { blockSize <- Seq(1, 2, 3, 4, 6, 7, 9, 10) } { val blockMat = irm.toHailBlockMatrix(blockSize) assert(blockMat.nRows === nRows) assert(blockMat.nCols === nCols) assert(blockMat.toBreezeMatrix() === convertDistributedMatrixToBreeze(irm)) } intercept[IllegalArgumentException] { irm.toHailBlockMatrix(-1) } intercept[IllegalArgumentException] { irm.toHailBlockMatrix(0) } } @Test def emptyBlocks() { val nRows = 9 val nCols = 2 val data = Seq( (3L, Vectors.dense(1.0, 2.0)), (4L, Vectors.dense(1.0, 2.0)), (5L, Vectors.dense(1.0, 2.0)), (8L, Vectors.dense(1.0, 2.0)) ).map(IndexedRow.tupled) val irm = new IndexedRowMatrix(sc.parallelize(data)) val m = irm.toHailBlockMatrix(2) assert(m.nRows == nRows) assert(m.nCols == nCols) assert(m.toBreezeMatrix() == convertDistributedMatrixToBreeze(irm)) assert(m.blocks.count() == 5) (m.dot(m.T)).toBreezeMatrix() // assert no exception assert(m.mapWithIndex { case (i, j, v) => i + 10 * j + v }.toBreezeMatrix() === new BDM[Double](nRows, nCols, Array[Double]( 0.0, 1.0, 2.0, 4.0, 5.0, 6.0, 6.0, 7.0, 9.0, 10.0, 11.0, 12.0, 15.0, 16.0, 17.0, 16.0, 17.0, 20.0 ))) } }
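The suite above tests a Hail-specific toHailBlockMatrix conversion; Spark MLlib's IndexedRowMatrix offers an analogous built-in toBlockMatrix. A minimal sketch on a toy matrix (all names, sizes and block dimensions are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{IndexedRow, IndexedRowMatrix}

object IndexedRowMatrixSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("IndexedRowMatrixSketch").setMaster("local[2]"))

    // Row indices may have gaps; missing rows are treated as zero rows
    val rows = sc.parallelize(Seq(
      IndexedRow(0L, Vectors.dense(1.0, 2.0, 3.0)),
      IndexedRow(2L, Vectors.dense(4.0, 5.0, 6.0))))

    val irm = new IndexedRowMatrix(rows)
    println(s"${irm.numRows()} x ${irm.numCols()}")

    // Spark's built-in block-matrix conversion (the suite above tests a Hail-specific variant)
    val blockMat = irm.toBlockMatrix(2, 2)
    println(blockMat.toLocalMatrix())

    sc.stop()
  }
}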
Example 183
Source File: DataFrameExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 184
Source File: SummaryStatisticsExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.{MultivariateStatisticalSummary, Statistics} // $example off$ object SummaryStatisticsExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SummaryStatisticsExample") val sc = new SparkContext(conf) // $example on$ val observations = sc.parallelize( Seq( Vectors.dense(1.0, 10.0, 100.0), Vectors.dense(2.0, 20.0, 200.0), Vectors.dense(3.0, 30.0, 300.0) ) ) // Compute column summary statistics. val summary: MultivariateStatisticalSummary = Statistics.colStats(observations) println(summary.mean) // a dense vector containing the mean value for each column println(summary.variance) // column-wise variance println(summary.numNonzeros) // number of nonzeros in each column // $example off$ sc.stop() } } // scalastyle:on println
Example 185
Source File: PCAOnSourceVectorExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD // $example off$ object PCAOnSourceVectorExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnSourceVectorExample") val sc = new SparkContext(conf) // $example on$ val data: RDD[LabeledPoint] = sc.parallelize(Seq( new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 1)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 1, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)), new LabeledPoint(0, Vectors.dense(1, 0, 0, 0, 0)), new LabeledPoint(1, Vectors.dense(1, 1, 0, 0, 0)))) // Compute the top 5 principal components. val pca = new PCA(5).fit(data.map(_.features)) // Project vectors to the linear space spanned by the top 5 principal // components, keeping the label val projected = data.map(p => p.copy(features = pca.transform(p.features))) // $example off$ val collect = projected.collect() println("Projected vector of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
Example 186
Source File: PCAOnRowMatrixExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object PCAOnRowMatrixExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAOnRowMatrixExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 4 principal components. // Principal components are stored in a local dense matrix. val pc: Matrix = mat.computePrincipalComponents(4) // Project the rows to the linear space spanned by the top 4 principal components. val projected: RowMatrix = mat.multiply(pc) // $example off$ val collect = projected.rows.collect() println("Projected Row Matrix of principal component:") collect.foreach { vector => println(vector) } } } // scalastyle:on println
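Example 186 only keeps the projected rows; RowMatrix can additionally report how much variance each component captures via computePrincipalComponentsAndExplainedVariance. A minimal sketch, assuming a Spark version that provides that method (2.0+) and using illustrative toy data:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object PCAExplainedVarianceSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("PCAExplainedVarianceSketch").setMaster("local[2]"))

    val rows = sc.parallelize(Seq(
      Vectors.dense(1.0, 2.0, 3.0),
      Vectors.dense(2.0, 4.0, 5.9),
      Vectors.dense(3.0, 6.1, 9.0)))

    val mat = new RowMatrix(rows)

    // Returns the principal components plus the fraction of variance each one explains
    val (pc, explained) = mat.computePrincipalComponentsAndExplainedVariance(2)
    println(s"Principal components:\n$pc")
    println(s"Explained variance: $explained")

    sc.stop()
  }
}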
Example 187
Source File: TallSkinnyPCA.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnyPCA { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnyPCA <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnyPCA") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute principal components. val pc = mat.computePrincipalComponents(mat.numCols().toInt) println("Principal components are:\n" + pc) sc.stop() } } // scalastyle:on println
Example 188
Source File: GaussianMixtureExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{GaussianMixture, GaussianMixtureModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object GaussianMixtureExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("GaussianMixtureExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/gmm_data.txt") val parsedData = data.map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using GaussianMixture val gmm = new GaussianMixture().setK(2).run(parsedData) // Save and load model gmm.save(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") val sameModel = GaussianMixtureModel.load(sc, "target/org/apache/spark/GaussianMixtureExample/GaussianMixtureModel") // output parameters of max-likelihood model for (i <- 0 until gmm.k) { println("weight=%f\nmu=%s\nsigma=\n%s\n" format (gmm.weights(i), gmm.gaussians(i).mu, gmm.gaussians(i).sigma)) } // $example off$ sc.stop() } } // scalastyle:on println
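The Gaussian mixture example prints the fitted parameters; the trained model can also assign points, either hard with predict or soft with predictSoft, which returns per-component membership probabilities. A minimal sketch assuming the same gmm_data.txt input as above:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.GaussianMixture
import org.apache.spark.mllib.linalg.Vectors

object GaussianMixtureSoftAssignSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("GaussianMixtureSoftAssignSketch"))

    val parsedData = sc.textFile("data/mllib/gmm_data.txt")
      .map(s => Vectors.dense(s.trim.split(' ').map(_.toDouble))).cache()

    val gmm = new GaussianMixture().setK(2).run(parsedData)

    // Hard assignments: index of the most likely Gaussian for each point
    val hard = gmm.predict(parsedData)
    // Soft assignments: membership probability for every Gaussian
    val soft = gmm.predictSoft(parsedData)

    hard.zip(soft).take(5).foreach { case (cluster, probs) =>
      println(s"cluster $cluster, probabilities ${probs.mkString(", ")}")
    }

    sc.stop()
  }
}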
Example 189
Source File: PCAExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.PCA import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.{LabeledPoint, LinearRegressionWithSGD} // $example off$ @deprecated("Deprecated since LinearRegressionWithSGD is deprecated. Use ml.feature.PCA", "2.0.0") object PCAExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("PCAExample") val sc = new SparkContext(conf) // $example on$ val data = sc.textFile("data/mllib/ridge-data/lpsa.data").map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() val splits = data.randomSplit(Array(0.6, 0.4), seed = 11L) val training = splits(0).cache() val test = splits(1) val pca = new PCA(training.first().features.size / 2).fit(data.map(_.features)) val training_pca = training.map(p => p.copy(features = pca.transform(p.features))) val test_pca = test.map(p => p.copy(features = pca.transform(p.features))) val numIterations = 100 val model = LinearRegressionWithSGD.train(training, numIterations) val model_pca = LinearRegressionWithSGD.train(training_pca, numIterations) val valuesAndPreds = test.map { point => val score = model.predict(point.features) (score, point.label) } val valuesAndPreds_pca = test_pca.map { point => val score = model_pca.predict(point.features) (score, point.label) } val MSE = valuesAndPreds.map { case (v, p) => math.pow((v - p), 2) }.mean() val MSE_pca = valuesAndPreds_pca.map { case (v, p) => math.pow((v - p), 2) }.mean() println("Mean Squared Error = " + MSE) println("PCA Mean Squared Error = " + MSE_pca) // $example off$ sc.stop() } } // scalastyle:on println
Example 190
Source File: DenseKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.log4j.{Level, Logger} import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.Vectors object DenseKMeans { object InitializationMode extends Enumeration { type InitializationMode = Value val Random, Parallel = Value } import InitializationMode._ case class Params( input: String = null, k: Int = -1, numIterations: Int = 10, initializationMode: InitializationMode = Parallel) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DenseKMeans") { head("DenseKMeans: an example k-means app for dense data.") opt[Int]('k', "k") .required() .text(s"number of clusters, required") .action((x, c) => c.copy(k = x)) opt[Int]("numIterations") .text(s"number of iterations, default: ${defaultParams.numIterations}") .action((x, c) => c.copy(numIterations = x)) opt[String]("initMode") .text(s"initialization mode (${InitializationMode.values.mkString(",")}), " + s"default: ${defaultParams.initializationMode}") .action((x, c) => c.copy(initializationMode = InitializationMode.withName(x))) arg[String]("<input>") .text("input paths to examples") .required() .action((x, c) => c.copy(input = x)) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"DenseKMeans with $params") val sc = new SparkContext(conf) Logger.getRootLogger.setLevel(Level.WARN) val examples = sc.textFile(params.input).map { line => Vectors.dense(line.split(' ').map(_.toDouble)) }.cache() val numExamples = examples.count() println(s"numExamples = $numExamples.") val initMode = params.initializationMode match { case Random => KMeans.RANDOM case Parallel => KMeans.K_MEANS_PARALLEL } val model = new KMeans() .setInitializationMode(initMode) .setK(params.k) .setMaxIterations(params.numIterations) .run(examples) val cost = model.computeCost(examples) println(s"Total cost = $cost.") sc.stop() } } // scalastyle:on println
Example 191
Source File: CosineSimilarity.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{MatrixEntry, RowMatrix} object CosineSimilarity { case class Params(inputFile: String = null, threshold: Double = 0.1) extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("CosineSimilarity") { head("CosineSimilarity: an example app.") opt[Double]("threshold") .required() .text(s"threshold similarity: to tradeoff computation vs quality estimate") .action((x, c) => c.copy(threshold = x)) arg[String]("<inputFile>") .required() .text(s"input file, one row per line, space-separated") .action((x, c) => c.copy(inputFile = x)) note( """ |For example, the following command runs this app on a dataset: | | ./bin/spark-submit --class org.apache.spark.examples.mllib.CosineSimilarity \ | examplesjar.jar \ | --threshold 0.1 data/mllib/sample_svm_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName("CosineSimilarity") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(params.inputFile).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) }.cache() val mat = new RowMatrix(rows) // Compute similar columns perfectly, with brute force. val exact = mat.columnSimilarities() // Compute similar columns with estimation using DIMSUM val approx = mat.columnSimilarities(params.threshold) val exactEntries = exact.entries.map { case MatrixEntry(i, j, u) => ((i, j), u) } val approxEntries = approx.entries.map { case MatrixEntry(i, j, v) => ((i, j), v) } val MAE = exactEntries.leftOuterJoin(approxEntries).values.map { case (u, Some(v)) => math.abs(u - v) case (u, None) => math.abs(u) }.mean() println(s"Average absolute error in estimate is: $MAE") sc.stop() } } // scalastyle:on println
Example 192
Source File: ElementwiseProductExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.ElementwiseProduct import org.apache.spark.mllib.linalg.Vectors // $example off$ object ElementwiseProductExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("ElementwiseProductExample") val sc = new SparkContext(conf) // $example on$ // Create some vector data; also works for sparse vectors val data = sc.parallelize(Array(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(4.0, 5.0, 6.0))) val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct(transformingVector) // Batch transform and per-row transform give the same results: val transformedData = transformer.transform(data) val transformedData2 = data.map(x => transformer.transform(x)) // $example off$ println("transformedData: ") transformedData.foreach(x => println(x)) println("transformedData2: ") transformedData2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 193
Source File: SVDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.linalg.Matrix import org.apache.spark.mllib.linalg.SingularValueDecomposition import org.apache.spark.mllib.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.RowMatrix // $example off$ object SVDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("SVDExample") val sc = new SparkContext(conf) // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)) val dataRDD = sc.parallelize(data, 2) val mat: RowMatrix = new RowMatrix(dataRDD) // Compute the top 5 singular values and corresponding singular vectors. val svd: SingularValueDecomposition[RowMatrix, Matrix] = mat.computeSVD(5, computeU = true) val U: RowMatrix = svd.U // The U factor is a RowMatrix. val s: Vector = svd.s // The singular values are stored in a local dense vector. val V: Matrix = svd.V // The V factor is a local dense matrix. // $example off$ val collect = U.rows.collect() println("U factor is:") collect.foreach { vector => println(vector) } println(s"Singular values are: $s") println(s"V factor is:\n$V") } } // scalastyle:on println
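The SVD example prints U, s and V separately; multiplying them back together yields a rank-k approximation of the original matrix, a common sanity check on the decomposition. A minimal sketch reusing the same three-row toy data, with k = 2 chosen purely for illustration:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Matrices, Vectors}
import org.apache.spark.mllib.linalg.distributed.RowMatrix

object SVDReconstructionSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SVDReconstructionSketch").setMaster("local[2]"))

    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0))
    val mat = new RowMatrix(sc.parallelize(data, 2))

    // Keep the top 2 singular values/vectors
    val svd = mat.computeSVD(2, computeU = true)

    // Rank-2 approximation: U * diag(s) * V^T, still distributed as a RowMatrix
    val approx = svd.U.multiply(Matrices.diag(svd.s)).multiply(svd.V.transpose)
    approx.rows.collect().foreach(println)

    sc.stop()
  }
}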
Example 194
Source File: TallSkinnySVD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.distributed.RowMatrix import org.apache.spark.mllib.linalg.Vectors object TallSkinnySVD { def main(args: Array[String]) { if (args.length != 1) { System.err.println("Usage: TallSkinnySVD <input>") System.exit(1) } val conf = new SparkConf().setAppName("TallSkinnySVD") val sc = new SparkContext(conf) // Load and parse the data file. val rows = sc.textFile(args(0)).map { line => val values = line.split(' ').map(_.toDouble) Vectors.dense(values) } val mat = new RowMatrix(rows) // Compute SVD. val svd = mat.computeSVD(mat.numCols().toInt) println("Singular values are " + svd.s) sc.stop() } } // scalastyle:on println
Example 195
Source File: StandardScalerExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf import org.apache.spark.SparkContext // $example on$ import org.apache.spark.mllib.feature.{StandardScaler, StandardScalerModel} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.util.MLUtils // $example off$ object StandardScalerExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("StandardScalerExample") val sc = new SparkContext(conf) // $example on$ val data = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt") val scaler1 = new StandardScaler().fit(data.map(x => x.features)) val scaler2 = new StandardScaler(withMean = true, withStd = true).fit(data.map(x => x.features)) // scaler3 is an identical model to scaler2, and will produce identical transformations val scaler3 = new StandardScalerModel(scaler2.std, scaler2.mean) // data1 will be unit variance. val data1 = data.map(x => (x.label, scaler1.transform(x.features))) // data2 will be unit variance and zero mean. val data2 = data.map(x => (x.label, scaler2.transform(Vectors.dense(x.features.toArray)))) // $example off$ println("data1: ") data1.foreach(x => println(x)) println("data2: ") data2.foreach(x => println(x)) sc.stop() } } // scalastyle:on println
Example 196
Source File: KMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.{KMeans, KMeansModel} import org.apache.spark.mllib.linalg.Vectors // $example off$ object KMeansExample { def main(args: Array[String]) { val conf = new SparkConf().setAppName("KMeansExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/kmeans_data.txt") val parsedData = data.map(s => Vectors.dense(s.split(' ').map(_.toDouble))).cache() // Cluster the data into two classes using KMeans val numClusters = 2 val numIterations = 20 val clusters = KMeans.train(parsedData, numClusters, numIterations) // Evaluate clustering by computing Within Set Sum of Squared Errors val WSSSE = clusters.computeCost(parsedData) println("Within Set Sum of Squared Errors = " + WSSSE) // Save and load model clusters.save(sc, "target/org/apache/spark/KMeansExample/KMeansModel") val sameModel = KMeansModel.load(sc, "target/org/apache/spark/KMeansExample/KMeansModel") // $example off$ sc.stop() } } // scalastyle:on println
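Besides computing the WSSSE, the trained KMeansModel can score new observations with predict. A minimal sketch on synthetic two-dimensional points (all values are illustrative):

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.Vectors

object KMeansPredictSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("KMeansPredictSketch").setMaster("local[2]"))

    val points = sc.parallelize(Seq(
      Vectors.dense(0.0, 0.0), Vectors.dense(0.1, 0.1),
      Vectors.dense(9.0, 9.0), Vectors.dense(9.1, 9.1)))

    val model = KMeans.train(points, 2, 20)

    // Assign a new observation to the nearest learned centroid
    val cluster = model.predict(Vectors.dense(8.5, 9.3))
    println(s"Assigned to cluster $cluster")

    sc.stop()
  }
}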
Example 197
Source File: MultivariateSummarizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import scopt.OptionParser import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.mllib.util.MLUtils /* Note: this listing is an excerpt; the object declaration, the Params case class and the OptionParser setup that precede this usage note are elided. */ spark-examples-*.jar \ | --input data/mllib/sample_linear_regression_data.txt """.stripMargin) } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val conf = new SparkConf().setAppName(s"MultivariateSummarizer with $params") val sc = new SparkContext(conf) val examples = MLUtils.loadLibSVMFile(sc, params.input).cache() println(s"Summary of data file: ${params.input}") println(s"${examples.count()} data points") // Summarize labels val labelSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(Vectors.dense(lp.label)), (sum1, sum2) => sum1.merge(sum2)) // Summarize features val featureSummary = examples.aggregate(new MultivariateOnlineSummarizer())( (summary, lp) => summary.add(lp.features), (sum1, sum2) => sum1.merge(sum2)) println() println(s"Summary statistics") println(s"\tLabel\tFeatures") println(s"mean\t${labelSummary.mean(0)}\t${featureSummary.mean.toArray.mkString("\t")}") println(s"var\t${labelSummary.variance(0)}\t${featureSummary.variance.toArray.mkString("\t")}") println( s"nnz\t${labelSummary.numNonzeros(0)}\t${featureSummary.numNonzeros.toArray.mkString("\t")}") println(s"max\t${labelSummary.max(0)}\t${featureSummary.max.toArray.mkString("\t")}") println(s"min\t${labelSummary.min(0)}\t${featureSummary.min.toArray.mkString("\t")}") println() sc.stop() } } // scalastyle:on println
Example 198
Source File: LinearRegressionWithSGDExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.LinearRegressionModel import org.apache.spark.mllib.regression.LinearRegressionWithSGD // $example off$ @deprecated("Use ml.regression.LinearRegression or LBFGS", "2.0.0") object LinearRegressionWithSGDExample { def main(args: Array[String]): Unit = { val conf = new SparkConf().setAppName("LinearRegressionWithSGDExample") val sc = new SparkContext(conf) // $example on$ // Load and parse the data val data = sc.textFile("data/mllib/ridge-data/lpsa.data") val parsedData = data.map { line => val parts = line.split(',') LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble))) }.cache() // Building the model val numIterations = 100 val stepSize = 0.00000001 val model = LinearRegressionWithSGD.train(parsedData, numIterations, stepSize) // Evaluate model on training examples and compute training error val valuesAndPreds = parsedData.map { point => val prediction = model.predict(point.features) (point.label, prediction) } val MSE = valuesAndPreds.map{ case(v, p) => math.pow((v - p), 2) }.mean() println("training Mean Squared Error = " + MSE) // Save and load model model.save(sc, "target/tmp/scalaLinearRegressionWithSGDModel") val sameModel = LinearRegressionModel.load(sc, "target/tmp/scalaLinearRegressionWithSGDModel") // $example off$ sc.stop() } } // scalastyle:on println
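As the deprecation note indicates, the DataFrame-based ml.regression.LinearRegression is the suggested replacement for LinearRegressionWithSGD. A minimal sketch of the equivalent workflow (the input path and hyperparameters are illustrative):

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.regression.LinearRegression

object LinearRegressionDFSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("LinearRegressionDFSketch").getOrCreate()

    // libsvm input yields a DataFrame with "label" and "features" columns
    val training = spark.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")

    val lr = new LinearRegression()
      .setMaxIter(100)
      .setRegParam(0.01)

    val model = lr.fit(training)
    println(s"Coefficients: ${model.coefficients}, intercept: ${model.intercept}")
    println(s"RMSE: ${model.summary.rootMeanSquaredError}")

    spark.stop()
  }
}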
Example 199
Source File: StreamingLinearRegressionExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.mllib import org.apache.spark.SparkConf // $example on$ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.regression.StreamingLinearRegressionWithSGD // $example off$ import org.apache.spark.streaming._ object StreamingLinearRegressionExample { def main(args: Array[String]): Unit = { if (args.length != 2) { System.err.println("Usage: StreamingLinearRegressionExample <trainingDir> <testDir>") System.exit(1) } val conf = new SparkConf().setAppName("StreamingLinearRegressionExample") val ssc = new StreamingContext(conf, Seconds(1)) // $example on$ val trainingData = ssc.textFileStream(args(0)).map(LabeledPoint.parse).cache() val testData = ssc.textFileStream(args(1)).map(LabeledPoint.parse) val numFeatures = 3 val model = new StreamingLinearRegressionWithSGD() .setInitialWeights(Vectors.zeros(numFeatures)) model.trainOn(trainingData) model.predictOnValues(testData.map(lp => (lp.label, lp.features))).print() ssc.start() ssc.awaitTermination() // $example off$ ssc.stop() } } // scalastyle:on println
Example 200
Source File: BisectingKMeansExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.mllib // scalastyle:off println import org.apache.spark.{SparkConf, SparkContext} // $example on$ import org.apache.spark.mllib.clustering.BisectingKMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} // $example off$ object BisectingKMeansExample { def main(args: Array[String]) { val sparkConf = new SparkConf().setAppName("mllib.BisectingKMeansExample") val sc = new SparkContext(sparkConf) // $example on$ // Loads and parses data def parse(line: String): Vector = Vectors.dense(line.split(" ").map(_.toDouble)) val data = sc.textFile("data/mllib/kmeans_data.txt").map(parse).cache() // Clustering the data into 6 clusters by BisectingKMeans. val bkm = new BisectingKMeans().setK(6) val model = bkm.run(data) // Show the compute cost and the cluster centers println(s"Compute Cost: ${model.computeCost(data)}") model.clusterCenters.zipWithIndex.foreach { case (center, idx) => println(s"Cluster Center ${idx}: ${center}") } // $example off$ sc.stop() } } // scalastyle:on println