org.apache.spark.ml.linalg.Matrix Scala Examples

The following examples show how to use org.apache.spark.ml.linalg.Matrix. Each example is drawn from an open-source project; the source file, the project it comes from, and its license are noted above the code.
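
Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the two Matrices factory methods most of them rely on: Matrices.dense, which reads its values in column-major order, and Matrices.sparse, which takes a matrix in compressed sparse column (CSC) form.

import org.apache.spark.ml.linalg.{Matrices, Matrix}

object MatrixQuickStart {
  def main(args: Array[String]): Unit = {
    // Dense 2x2 matrix; values are read column-major,
    // so the first column is (1.0, 2.0) and the second is (3.0, 4.0).
    val dense: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))

    // Sparse 3x2 matrix in CSC form: column pointers, row indices, values.
    val sparse: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5.0, 6.0, 7.0))

    println(dense)
    println(sparse)
  }
}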
Example 1
Source File: MultivariateGaussian.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}



// Excerpt: only the helper that precomputes the distribution constants is shown below;
// the enclosing class declaration is restored for context (pdf/logpdf and other members omitted).
class MultivariateGaussian @Since("2.0.0") (
    @Since("2.0.0") val mean: Vector,
    @Since("2.0.0") val cov: Matrix) extends Serializable {

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
} 
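A minimal usage sketch for the class excerpted above (not part of the drizzle-spark source): the public constructor takes an ml Vector mean and an ml Matrix covariance, and pdf/logpdf evaluate the density.

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

object MultivariateGaussianUsage {
  def main(args: Array[String]): Unit = {
    // 2-dimensional Gaussian with zero mean, unit variances and no correlation.
    val mean = Vectors.dense(0.0, 0.0)
    val cov = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val gaussian = new MultivariateGaussian(mean, cov)

    println(gaussian.pdf(Vectors.dense(0.0, 0.0)))     // density at the mean
    println(gaussian.logpdf(Vectors.dense(1.0, 1.0)))  // log-density away from it
  }
}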
Example 2
Source File: LocalLogisticRegressionModel.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, Vector}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
} 
Example 3
Source File: LocalLogisticRegressionModel.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import java.lang.Boolean

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
} 
Example 4
Source File: LocalNaiveBayes.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors}

class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel)
  extends LocalProbabilisticClassificationModel[NaiveBayesModel] {}

object LocalNaiveBayes
  extends SimpleModelLoader[NaiveBayesModel]
  with TypedTransformerConverter[NaiveBayesModel] {

  override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = {
    val constructor = classOf[NaiveBayesModel].getDeclaredConstructor(
      classOf[String],
      classOf[Vector],
      classOf[Matrix]
    )
    constructor.setAccessible(true)
    val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]]
    val matrix         = DataUtils.constructMatrix(matrixMetadata)
    val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]]
    val piVec = DataUtils.constructVector(piParams)

    val nb = constructor
      .newInstance(metadata.uid, piVec, matrix)
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])

    nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue())
    nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String])
    nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String])

    nb
  }

  override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = {
    new LocalNaiveBayes(sparkTransformer)
  }
} 
Example 5
Source File: GenericTestSpec.scala    From spark-ml-serving   with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common.LocalData
import org.apache.spark.SparkConf
import org.apache.spark.ml.linalg.{Matrix, Vector}
import org.apache.spark.mllib.linalg.{Matrix => OldMatrix, Vector => OldVector}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.scalatest.{BeforeAndAfterAll, FunSpec}

trait GenericTestSpec extends FunSpec with BeforeAndAfterAll {
  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")

  val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

  def modelPath(modelName: String): String = s"./target/test_models/${session.version}/$modelName"

  def test(
    name: String,
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ) = {
    val path = modelPath(name.toLowerCase())
    var validation = LocalData.empty
    var localPipelineModel = Option.empty[LocalPipelineModel]

    it("should train") {
      val pipeline = new Pipeline().setStages(steps.toArray)
      val pipelineModel = pipeline.fit(data)
      validation = LocalData.fromDataFrame(pipelineModel.transform(data))
      pipelineModel.write.overwrite().save(path)
    }

    it("should load local version") {
      localPipelineModel = Some(LocalPipelineModel.load(path))
      assert(localPipelineModel.isDefined)
    }

    it("should transform LocalData") {
      val localData = LocalData.fromDataFrame(data)
      val model = localPipelineModel.get
      val result = model.transform(localData)
      columns.foreach { col =>
        val resCol = result
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Result column is absent"))
        val valCol = validation
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Validation column is absent"))
        resCol.data.zip(valCol.data).foreach {
          case (r: Seq[Number @unchecked], v: Seq[Number @unchecked]) if r.head.isInstanceOf[Number] && v.head.isInstanceOf[Number] =>
            r.zip(v).foreach {
              case (ri, vi) =>
                assert(ri.doubleValue() - vi.doubleValue() <= accuracy, s"$ri - $vi > $accuracy")
            }
          case (r: Number, v: Number) =>
            assert(r.doubleValue() - v.doubleValue() <= accuracy, s"$r - $v > $accuracy")
          case (r, n) =>
            assert(r === n)
        }
        result.column(col).foreach { resData =>
          resData.data.foreach { resRow =>
            if (resRow.isInstanceOf[Seq[_]]) {
              assert(resRow.isInstanceOf[List[_]], resRow)
            } else if (resRow.isInstanceOf[Vector] || resRow.isInstanceOf[OldVector] || resRow
              .isInstanceOf[Matrix] || resRow.isInstanceOf[OldMatrix]) {
              assert(false, s"SparkML type detected. Column: $col, value: $resRow")
            }
          }
        }
      }
    }
  }

  def modelTest(
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ): Unit = {
    lazy val name = steps.map(_.getClass.getSimpleName).foldLeft("") {
      case ("", b) => b
      case (a, b) => a + "-" + b
    }

    describe(name) {
      test(name, data, steps, columns, accuracy)
    }
  }
} 
Example 6
Source File: Describe.scala    From Scala-Machine-Learning-Projects   with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

object Describe {
  case class CustomerAccount(state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double, total_international_charge: Double,
    total_international_num_calls: Double, churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Desribe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read.
      option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()   
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")
    
    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
    
  }
} 
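The Describe excerpt above imports Correlation and Matrix but stops before using them. Below is a sketch of how a correlation matrix could be computed over a few of the numeric churn columns, continuing inside main; the VectorAssembler stage is an assumption for illustration, not part of the book's code.

    // Hypothetical continuation: assemble numeric columns and compute a Pearson correlation matrix.
    import org.apache.spark.ml.feature.VectorAssembler

    val assembler = new VectorAssembler()
      .setInputCols(Array("total_day_mins", "total_evening_mins", "total_night_mins", "total_international_mins"))
      .setOutputCol("features")

    val featureDF = assembler.transform(trainSet)
    val Row(corr: Matrix) = Correlation.corr(featureDF, "features").head
    println(s"Pearson correlation matrix:\n $corr")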
Example 7
Source File: NaiveBayesModel.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial}
import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices}
import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector}



@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala")
case class NaiveBayesModel(numFeatures: Int,
                           numClasses: Int,
                           pi: Vector,
                           theta: Matrix,
                           modelType: NaiveBayesModel.ModelType,
                           override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with Model {

  private def multinomialCalculation(raw: Vector) = {
    val prob = theta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    prob
  }

  private def bernoulliCalculation(raw: Vector) = {
    val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value)))
    val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
    val thetaMinusNegTheta = Matrices.map(theta, value =>
      value - math.log(1.0 - math.exp(value)))
    val negThetaSum = negTheta.multiply(ones)

    raw.foreachActive((_, value) =>
      require(value == 0.0 || value == 1.0,
        s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.")
    )
    val prob = thetaMinusNegTheta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    BLAS.axpy(1.0, negThetaSum, prob)
    prob
  }

  override def predictRaw(raw: Vector): Vector = {
    modelType match {
      case Multinomial =>
        multinomialCalculation(raw)
      case Bernoulli =>
        bernoulliCalculation(raw)
    }
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        var i = 0
        val size = dv.size
        val maxLog = dv.values.max
        while (i < size) {
          dv.values(i) = math.exp(dv.values(i) - maxLog)
          i += 1
        }
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }

  }
} 
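The heart of multinomialCalculation above is Matrix.multiply(Vector). Below is a standalone sketch of that scoring step using only Spark's ml.linalg types and made-up numbers, without the mleap wrappers.

import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vectors}

object NaiveBayesScoringSketch {
  def main(args: Array[String]): Unit = {
    // theta: 2 classes x 3 features of log conditional probabilities (column-major values).
    val theta = Matrices.dense(2, 3, Array(-0.1, -0.4, -0.2, -0.5, -0.3, -0.6))
    val pi = Vectors.dense(-0.7, -0.9)      // log class priors
    val raw = Vectors.dense(1.0, 0.0, 2.0)  // feature counts

    // Matrix.multiply(Vector) yields a DenseVector of per-class scores; add the priors element-wise.
    val scores: DenseVector = theta.multiply(raw)
    val logLikelihood = scores.values.zip(pi.toArray).map { case (s, p) => s + p }
    println(logLikelihood.mkString(", "))
  }
}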
Example 8
Source File: VectorConverters.scala    From mleap   with Apache License 2.0
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions


trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
      values = matrix.values,
      dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }


  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}
object VectorConverters extends VectorConverters 
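A minimal usage sketch of the matrix conversions above, assuming the implicits are brought into scope with a wildcard import of the VectorConverters companion object.

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Matrices, Matrix}

object MatrixTensorRoundTrip {
  def main(args: Array[String]): Unit = {
    // Implicit conversions: Spark dense matrix -> mleap tensor -> Spark matrix.
    val sparkMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val tensor: Tensor[Double] = sparkMatrix    // via sparkMatrixToMleapTensor
    val roundTripped: Matrix = tensor           // via mleapTensorToSparkMatrix
    println(roundTripped)
  }
}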
Example 9
Source File: SparkMatrix.scala    From Machine-Learning-with-Spark-Second-Edition   with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

object SparkMatrix {

  def main(args: Array[String]) {

    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0),
      (0, 1, 2.0),
      (1, 1, 3.0),
      (1, 2, 4.0),
      (2, 2, 5.0),
      (2, 3, 6.0),
      (3, 0, 7.0),
      (3, 3, 8.0),
      (4, 1, 9.0)), 3).map { case (i, j, value) =>
      MatrixEntry(i, j, value)
    }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()

  }

} 
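Two details of the example above that are easy to miss, shown as a small sketch: Matrices.dense reads its values in column-major order, and Matrices.sparse takes compressed sparse column (CSC) input, i.e. column pointers, then row indices, then values.

import org.apache.spark.ml.linalg.Matrices

object MatrixLayoutSketch {
  def main(args: Array[String]): Unit = {
    // Column-major: the first numRows values fill column 0, the next numRows fill column 1.
    val m = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    assert(m(0, 0) == 1.0 && m(1, 0) == 2.0)  // column 0
    assert(m(0, 1) == 3.0 && m(1, 1) == 4.0)  // column 1

    // CSC: colPtrs = Array(0, 1, 3) means column 0 holds one value and column 1 holds two.
    val s = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5.0, 6.0, 7.0))
    assert(s(0, 0) == 5.0 && s(2, 1) == 6.0 && s(1, 1) == 7.0)
  }
}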
Example 10
Source File: Generators.scala    From frameless   with Apache License 2.0
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.{Arbitrary, Gen}

object Generators {

  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    Gen.oneOf(Gen.const(FeatureSubsetStrategy.Auto),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
      Gen.const(LossStrategy.SquaredError)
  }

  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }

} 
Example 11
Source File: MatrixUtils.scala    From pravda-ml   with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, VectorUDT}


object MatrixUtils {

  def vectorUDT = new VectorUDT()

  def transformDense(matrix: DenseMatrix, transformer: (Int, Int, Double) => Double): DenseMatrix = {
    matrix.foreachActive((i, j, v) => {
      matrix(i, j) = transformer(i, j, v)
    })
    matrix
  }

  def applyNonZeros(source: Matrix, target: DenseMatrix, transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    source.foreachActive((i, j, v) => {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, v, target.values(index))
    })
    target
  }

  def applyAll(source: Matrix, target: DenseMatrix, transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    for (j <- 0 until source.numCols; i <- 0 until source.numRows) {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, source(i, j), target.values(index))
    }
    target
  }
} 
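A minimal usage sketch of transformDense (not part of the pravda-ml source). The helper mutates the dense matrix in place, which is why MatrixUtils sits under the org.apache.spark.ml package, where DenseMatrix's package-private update and index are accessible.

import org.apache.spark.ml.linalg.DenseMatrix
import org.apache.spark.ml.odkl.MatrixUtils

object MatrixUtilsUsage {
  def main(args: Array[String]): Unit = {
    // Double every entry of a dense matrix in place; the same instance is returned.
    val m = new DenseMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val doubled = MatrixUtils.transformDense(m, (_, _, v) => v * 2.0)
    println(doubled)
  }
}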
Example 12
Source File: CorrelationExample.scala    From Spark-2.3.1   with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession


object CorrelationExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("CorrelationExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
    )


    val df = data.map(Tuple1.apply).toDF("features")
    val Row(coeff1: Matrix) = Correlation.corr(df, "features").head
    println(s"Pearson correlation matrix:\n $coeff1")

    val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head
    println(s"Spearman correlation matrix:\n $coeff2")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println 
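Once the correlation result has been unpacked from the Row, it is a plain ml.linalg.Matrix whose entries can be read directly. A small sketch, continuing from coeff1 inside main above:

    // coeff1 is a 4x4 matrix; read a single entry or iterate over its rows.
    val firstVsLast = coeff1(0, 3)  // correlation between feature 0 and feature 3
    coeff1.rowIter.zipWithIndex.foreach { case (row, i) =>
      println(s"row $i: ${row.toArray.mkString(", ")}")
    }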
Example 13
Source File: CorrelationSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.stat

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}


class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  val xData = Array(1.0, 0.0, -2.0)
  val yData = Array(4.0, 5.0, 3.0)
  val zeros = new Array[Double](3)
  val data = Seq(
    Vectors.dense(1.0, 0.0, 0.0, -2.0),
    Vectors.dense(4.0, 5.0, 0.0, 3.0),
    Vectors.dense(6.0, 7.0, 0.0, 8.0),
    Vectors.dense(9.0, 0.0, 0.0, 1.0)
  )

  private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  private def extract(df: DataFrame): BDM[Double] = {
    val Array(Row(mat: Matrix)) = df.collect()
    mat.asBreeze.toDenseMatrix
  }


  test("corr(X) default, pearson") {
    val defaultMat = Correlation.corr(X, "features")
    val pearsonMat = Correlation.corr(X, "features", "pearson")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.00000000, 0.05564149, Double.NaN, 0.4004714),
      (0.05564149, 1.00000000, Double.NaN, 0.9135959),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.40047142, 0.91359586, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4)
    assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4)
  }

  test("corr(X) spearman") {
    val spearmanMat = Correlation.corr(X, "features", "spearman")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.0000000,  0.1054093,  Double.NaN, 0.4000000),
      (0.1054093,  1.0000000,  Double.NaN, 0.9486833),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.4000000,  0.9486833,  Double.NaN, 1.0000000)))
    // scalastyle:on
    assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4)
  }

}