org.apache.spark.ml.linalg.Matrices Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Matrices.
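Most of the examples below use the two factory methods on Matrices: Matrices.dense, which reads its values array in column-major order, and Matrices.sparse, which takes a compressed sparse column (CSC) layout. The short sketch below is not from any of the projects listed; the dimensions and values are illustrative only.

import org.apache.spark.ml.linalg.{Matrices, Matrix}

object MatricesQuickTour extends App {
  // 2 x 3 dense matrix; the values array is read column by column (column-major):
  // [[1.0, 3.0, 5.0],
  //  [2.0, 4.0, 6.0]]
  val dense: Matrix = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))

  // 3 x 2 sparse matrix in CSC form: colPtrs delimits each column's slice of
  // rowIndices/values, so column 0 holds 9.0 at row 0, and column 1 holds
  // 8.0 at row 1 and 7.0 at row 2.
  val sparse: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(9.0, 8.0, 7.0))

  println(dense)
  println(sparse.toDense)
}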
Example 1
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {
  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i =>
    Row(labels(i), ages(i), heights(i), weights(i))
  })
  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
        0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
        -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3,
      isMultinomial = true))).fit(dataset)
}
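For reference, Matrices.dense fills the 3 x 3 coefficient matrix above column by column, and Spark's coefficientMatrix is laid out as numClasses x numFeatures, so the first three doubles are the per-class coefficients of "age", the next three of "height", and the last three of "weight". A small sketch (not part of the original test) showing the column-major fill:

import org.apache.spark.ml.linalg.Matrices

// Column j holds values(j * numRows until (j + 1) * numRows).
val m = Matrices.dense(3, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0))
println(m)        // rows: (1.0 4.0 7.0), (2.0 5.0 8.0), (3.0 6.0 9.0)
println(m(0, 1))  // 4.0 -- element at row 0, column 1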
Example 2
Source File: CorrelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  val xData = Array(1.0, 0.0, -2.0)
  val yData = Array(4.0, 5.0, 3.0)
  val zeros = new Array[Double](3)
  val data = Seq(
    Vectors.dense(1.0, 0.0, 0.0, -2.0),
    Vectors.dense(4.0, 5.0, 0.0, 3.0),
    Vectors.dense(6.0, 7.0, 0.0, 8.0),
    Vectors.dense(9.0, 0.0, 0.0, 1.0)
  )

  private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  private def extract(df: DataFrame): BDM[Double] = {
    val Array(Row(mat: Matrix)) = df.collect()
    mat.asBreeze.toDenseMatrix
  }

  test("corr(X) default, pearson") {
    val defaultMat = Correlation.corr(X, "features")
    val pearsonMat = Correlation.corr(X, "features", "pearson")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.00000000, 0.05564149, Double.NaN, 0.4004714),
      (0.05564149, 1.00000000, Double.NaN, 0.9135959),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.40047142, 0.91359586, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4)
    assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4)
  }

  test("corr(X) spearman") {
    val spearmanMat = Correlation.corr(X, "features", "spearman")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.0000000, 0.1054093, Double.NaN, 0.4000000),
      (0.1054093, 1.0000000, Double.NaN, 0.9486833),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.4000000, 0.9486833, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4)
  }
}
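Outside a test suite, the same API can be called on any DataFrame with a Vector column: Correlation.corr returns a one-row DataFrame whose single cell is the correlation Matrix. A minimal sketch, assuming an existing SparkSession named spark:

import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 0.0, -2.0)),
  Tuple1(Vectors.dense(4.0, 5.0, 3.0)),
  Tuple1(Vectors.dense(6.0, 7.0, 8.0))
)).toDF("features")

// Pearson by default; pass "spearman" as the third argument for rank correlation.
val Row(corr: Matrix) = Correlation.corr(df, "features").head
println(corr)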
Example 3
Source File: MLSerDeSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}

class MLSerDeSuite extends SparkFunSuite {

  MLSerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = MLSerDe.loads(MLSerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = MLSerDe.loads(MLSerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = MLSerDe.loads(MLSerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = MLSerDe.loads(MLSerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = MLSerDe.loads(MLSerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }
}
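The SparseMatrix constructor exercised in the pickling test takes (numRows, numCols, colPtrs, rowIndices, values) in CSC form, where colPtrs(j) until colPtrs(j + 1) delimits column j's entries. A short sketch (not from the suite) that makes the layout visible:

import org.apache.spark.ml.linalg.SparseMatrix

// colPtrs = [0, 1, 3]: column 0 owns entries 0 until 1, column 1 owns entries 1 until 3,
// so (row 1, col 0) = 0.9, (row 0, col 1) = 1.2, (row 2, col 1) = 3.4.
val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
println(sm.toDense)
// 0.0  1.2
// 0.9  0.0
// 0.0  3.4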
Example 4
Source File: MultivariateGaussianSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import org.apache.spark.ml.SparkMLFunSuite
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.util.TestingUtils._

class MultivariateGaussianSuite extends SparkMLFunSuite {

  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }
}
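The expected values in the univariate test follow directly from the normal density: for N(0, 1), pdf(0) = 1/sqrt(2*pi) ≈ 0.39894, and doubling the standard deviation halves the peak to ≈ 0.19947. A one-line sanity check, independent of the class under test:

// 1 / sqrt(2 * pi) ≈ 0.3989422804014327
println(1.0 / math.sqrt(2.0 * math.Pi))
// N(0, 4) at x = 0: 1 / (2 * sqrt(2 * pi)) ≈ 0.19947
println(1.0 / (2.0 * math.sqrt(2.0 * math.Pi)))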
Example 5
Source File: HasNetlibBlas.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]): Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y: Array[Double]): Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy(x: Array[Double], y: Array[Double]): Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {
  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
}
Example 6
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

  def setDim(value: Long): this.type = set(dim, value)

  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) {
        // If the dimension is not set, look up the AttributeGroup in the metadata, as it comes from OdklCountVectorizer
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    // the matrix of random vectors used to construct the hash
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f => if (f > 0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }
}
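The hasher relies on Matrices.sprandn(numRows, numCols, density, rng), which builds a sparse matrix whose non-zero entries (roughly a density fraction of all cells) are drawn from the standard normal distribution. A standalone sketch of the same idea; the dimensions, density and seed below are arbitrary:

import java.util.Random
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vectors}

val projection = Matrices.sprandn(8, 100, 0.1, new Random(42L)).asInstanceOf[SparseMatrix]
println(s"stored entries: ${projection.numActives} of ${8 * 100}")

// Each input vector is projected to 8 values; their signs give an 8-bit hash,
// mirroring the fold in the transformer above.
val projected = projection.multiply(Vectors.dense(Array.fill(100)(1.0)))
val hash = projected.values.map(v => if (v > 0) 1L else 0L)
  .zipWithIndex.foldLeft(0L) { case (acc, (bit, i)) => acc | (bit << i) }
println(hash)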
Example 7
Source File: Generators.scala From frameless with Apache License 2.0
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.{Arbitrary, Gen}

object Generators {

  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    Gen.oneOf(Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
    Gen.const(LossStrategy.SquaredError)
  }

  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }
}
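These Arbitrary instances are meant to be pulled into ScalaCheck properties, so any property over Matrix values can be generated automatically. A hedged usage sketch (the property itself is only illustrative):

import org.apache.spark.ml.linalg.Matrix
import org.scalacheck.Prop.forAll
import frameless.ml.Generators.arbMatrix

val prop = forAll { (m: Matrix) =>
  // toArray is the column-major contents, so its length is rows * cols.
  m.toArray.length == m.numRows * m.numCols
}
prop.check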
Example 8
Source File: MLSerDeSuite.scala From multi-tenancy-spark with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 9
Source File: MultivariateGaussianSuite.scala From multi-tenancy-spark with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.
Example 10
Source File: LRSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0
package org.apache.spark.ml.feature.selection.embedded

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase}
import org.apache.spark.ml.linalg.Matrices

class LRSelectorSuite extends FeatureSelectionTestBase {
  // Order of feature importances must be: f4 > f3 > f2 > f1
  private val lrWeights = Matrices.dense(3, 4, Array(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, -0.8, -0.8, -0.8, 0.9, 0.9, 0.9))

  test("Test LRSelector: numTopFeatures") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName).setCoefficientMatrix(lrWeights)
      .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: percentile") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: randomCutOff") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength", "sWidth", "sLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("LRSelector read/write") {
    val nb = new LRSelector
    testEstimatorAndModelReadWrite[LRSelector, LRSelectorModel](nb, dataset,
      FeatureSelectorTestBase.allParamSettings.+("coefficientMatrix" -> lrWeights),
      FeatureSelectorTestBase.checkModelData)
  }
}
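The 3 x 4 lrWeights matrix is again filled column by column, so each column holds the three per-class weights of one feature: f1 -> (0.1, 0.1, 0.1), f2 -> (0.2, ...), f3 -> (-0.8, ...), f4 -> (0.9, ...). Ranking the columns by absolute magnitude gives exactly the f4 > f3 > f2 > f1 order the comment requires. A quick way to print the per-feature columns (illustrative only):

import org.apache.spark.ml.linalg.Matrices

val lrWeights = Matrices.dense(3, 4, Array(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, -0.8, -0.8, -0.8, 0.9, 0.9, 0.9))

// colIter walks the matrix column by column, i.e. feature by feature.
lrWeights.colIter.zipWithIndex.foreach { case (col, j) =>
  println(s"feature f${j + 1}: ${col.toArray.mkString(", ")}")
}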
Example 11
Source File: MLSerDeSuite.scala From sparkoscope with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 12
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.
Example 13
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

object SparkMatrix {

  def main(args: Array[String]) {
    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0),
      (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3)
      .map { case (i, j, value) => MatrixEntry(i, j, value) }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()
  }
}
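For the two sparse matrices above, the CSC arguments decode as follows: sMatrixOne stores 5 at (0,0), 7 at (1,1) and 6 at (2,1), while sMatrixTwo stores 5 at (0,0), 6 at (1,1) and 7 at (2,1). A short check, not part of the original program:

import org.apache.spark.ml.linalg.{Matrices, SparseMatrix}

val sOne = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7)).asInstanceOf[SparseMatrix]
val sTwo = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7)).asInstanceOf[SparseMatrix]

println(sOne.toDense) // rows: (5.0 0.0), (0.0 7.0), (0.0 6.0)
println(sTwo.toDense) // rows: (5.0 0.0), (0.0 6.0), (0.0 7.0)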
Example 14
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
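The pair returned above is the root pseudo-inverse used in the quadratic form plus the log of the normalising constant, and it is what makes the "multivariate degenerate" expectation in the MultivariateGaussianSuite above come out to 0.11254: for sigma = [[1, 1], [1, 1]] the eigenvalues are (0, 2), the pseudo-determinant is 2, and at x = mu the quadratic form vanishes. A short numeric check:

// pdf(mu) for the rank-deficient sigma [[1, 1], [1, 1]]:
// exp(-0.5 * (k * ln(2 * pi) + ln(pseudo-det))) with k = 2 and pseudo-det = 2.
println(math.exp(-0.5 * (2 * math.log(2 * math.Pi) + math.log(2.0)))) // ≈ 0.11254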
Example 15
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class NaiveBayesClassifierOp extends SimpleSparkOp[NaiveBayesModel] {
  override val Model: OpModel[SparkBundleContext, NaiveBayesModel] = new OpModel[SparkBundleContext, NaiveBayesModel] {
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val thresholds = if (obj.isSet(obj.thresholds)) {
        Some(obj.getThresholds)
      } else None
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.getModelType)).
        withValue("thresholds", thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val nb = new NaiveBayesModel(uid = "",
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray))
      val modelType = model.value("model_type").getString
      model.getValue("thresholds").map(t => nb.setThresholds(t.getDoubleList.toArray))
      nb.set(nb.modelType, modelType)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: NaiveBayesModel): NaiveBayesModel = {
    val r = new NaiveBayesModel(uid = uid, pi = model.pi, theta = model.theta)
    if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) }
    if (model.isDefined(model.modelType)) { r.set(r.modelType, model.getModelType) }
    r
  }

  override def sparkInputs(obj: NaiveBayesModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: NaiveBayesModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
Example 16
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] = new OpModel[SparkBundleContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if (obj.numClasses > 2) {
        val cm = obj.coefficientMatrix
        val thresholds = if (obj.isSet(obj.thresholds)) {
          Some(obj.getThresholds)
        } else None
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)).
          withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
          withValue("intercept", Value.double(obj.intercept)).
          withValue("threshold", Value.double(obj.getThreshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong
      val r = if (numClasses > 2) {
        val cmTensor = model.value("coefficient_matrix").getTensor[Double]
        val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray)
        val lr = new LogisticRegressionModel(uid = "",
          coefficientMatrix = coefficientMatrix,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          numClasses = numClasses.toInt,
          isMultinomial = true)
        model.getValue("thresholds").
          map(t => lr.setThresholds(t.getDoubleList.toArray)).
          getOrElse(lr)
      } else {
        val lr = new LogisticRegressionModel(uid = "",
          coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble)
        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
          .map(value => value.getDouble)
          .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
        lr.setThreshold(threshold)
      }
      r
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = {
    val numClasses = model.numClasses
    val r = if (numClasses > 2) {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = true)
      if (model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) }
      lr
    } else {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = false)
      if (model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) }
      lr
    }
    r
  }

  override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
Example 17
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.GaussianMixtureModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

class GaussianMixtureOp extends SimpleSparkOp[GaussianMixtureModel] {
  override val Model: OpModel[SparkBundleContext, GaussianMixtureModel] = new OpModel[SparkBundleContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val (rows, cols) = obj.gaussians.headOption.
        map(g => (g.cov.numRows, g.cov.numCols)).
        getOrElse((-1, -1))
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip
      model.withValue("means", Value.tensorList(means.map(_.toArray).map(Tensor.denseVector))).
        withValue("covs", Value.tensorList(covs.map(m => DenseTensor(m.toArray, Seq(m.numRows, m.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map(values =>
        Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray))
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray

      new GaussianMixtureModel(uid = "", gaussians = gaussians, weights = weights)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: GaussianMixtureModel): GaussianMixtureModel = {
    new GaussianMixtureModel(uid = uid, weights = model.weights, gaussians = model.gaussians)
  }

  override def sparkInputs(obj: GaussianMixtureModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: GaussianMixtureModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol, "probability" -> obj.probabilityCol)
  }
}
Example 18
Source File: LogisticRegressionModelSpec.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.scalatest.FunSpec

class LogisticRegressionModelSpec extends FunSpec {

  describe("BinaryLogisticRegression") {
    val weights = Vectors.dense(1.0, 2.0, 4.0)
    val intercept = 0.7

    describe("issue210: Logistic function not being applied") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("applies the logistic function for prediction") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Wrong Binary LogisticRegression predictions") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("compare binary logisticRegression prediction with the transform api predictions") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 1.0)
      }

      it("compare binary logisticRegression prediction with rawToPrediction() results") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 1.0 threshold") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 1.0)

      it("binary logisticRegression prediction equals zero for 1.0 threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 0.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 0.0 threshold") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.0)

      it("binary logisticRegression prediction equals 1 for zero threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("input/output schema") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }

  describe("ProbabilisticLogisticsRegressionModel") {
    val weights = Matrices.dense(3, 3, Array(1, 2, 3, 1, 2, 3, 1, 2, 3))
    val intercept = Vectors.dense(1, 2, 3)
    val lr = ProbabilisticLogisticsRegressionModel(weights, intercept, None)

    describe("input/output schema") {
      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(3)),
          StructField("probability", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }
}
Example 19
Source File: PcaModelSpec.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vectors}
import org.scalatest.FunSpec

class PcaModelSpec extends FunSpec {

  describe("pca model") {
    val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1))
    val pca = PcaModel(pc)

    it("uses the principal components matrix to transform a vector to a lower-dimensional vector") {
      val input = Vectors.dense(Array[Double](2, 1, 0))
      assert(pca(input).toArray sameElements Array[Double](1, -3))
    }

    it("has the right input schema") {
      assert(pca.inputSchema.fields == Seq(StructField("input", TensorType.Double())))
    }

    it("has the right output schema") {
      assert(pca.outputSchema.fields == Seq(StructField("output", TensorType.Double())))
    }
  }
}
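The expected output (1, -3) can be reproduced by hand: the 3 x 2 principal-components matrix is filled column-major, so its columns are (1, -1, 2) and (0, -3, 1), and the projection of x = (2, 1, 0) is pc.transpose multiplied by x. A quick check with the matrix API (this assumes PcaModel applies the usual transposed-projection):

import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1))
// pc.transpose is 2 x 3; multiplying by the input vector gives the reduced vector.
println(pc.transpose.multiply(Vectors.dense(2.0, 1.0, 0.0))) // [1.0, -3.0]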
Example 20
Source File: VectorConverters.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions

trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
        values = matrix.values,
        dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1),
        cols.toArray, rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector: BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }

  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}

object VectorConverters extends VectorConverters
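When explicit (row, column) pairs are needed from a SparseMatrix, the CSC arrays expand as sketched below: column j owns the entries between colPtrs(j) and colPtrs(j + 1). This is only a reference sketch for reading the converters above; the tensor index layouts themselves are defined by mleap.

import org.apache.spark.ml.linalg.SparseMatrix

// Expand CSC storage into (row, col, value) triples.
def cscEntries(m: SparseMatrix): Seq[(Int, Int, Double)] =
  (0 until m.numCols).flatMap { j =>
    (m.colPtrs(j) until m.colPtrs(j + 1)).map { idx =>
      (m.rowIndices(idx), j, m.values(idx))
    }
  }

val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
println(cscEntries(sm)) // Vector((1,0,0.9), (0,1,1.2), (2,1,3.4))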
Example 21
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.Model
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.runtime.transformer.classification.NaiveBayesClassifier
import ml.combust.mleap.core.classification.NaiveBayesModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class NaiveBayesClassifierOp extends MleapOp[NaiveBayesClassifier, NaiveBayesModel] {
  override val Model: OpModel[MleapContext, NaiveBayesModel] = new OpModel[MleapContext, NaiveBayesModel] {
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)(implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.modelType.toString)).
        withValue("thresholds", obj.thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)(implicit context: BundleContext[MleapContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val modelType = NaiveBayesModel.forName(model.value("model_type").getString)
      val numClasses = model.value("num_classes").getLong.toInt
      val thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)
      require(thresholds.isEmpty || thresholds.get.length == numClasses,
        "NaiveBayesModel loaded with non-matching numClasses and thresholds.length. " +
          s" numClasses=$numClasses, but thresholds has length ${thresholds.get.length}")
      new NaiveBayesModel(numFeatures = model.value("num_features").getLong.toInt,
        numClasses = numClasses,
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray),
        modelType = modelType,
        thresholds = thresholds)
    }
  }

  override def model(node: NaiveBayesClassifier): NaiveBayesModel = node.model
}
Example 22
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel, ProbabilisticLogisticsRegressionModel}
import ml.combust.mleap.runtime.transformer.classification.LogisticRegression
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class LogisticRegressionOp extends MleapOp[LogisticRegression, LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[MleapContext, LogisticRegressionModel] = new OpModel[MleapContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if (obj.isMultinomial) {
        val mm = obj.multinomialModel
        val cm = mm.coefficientMatrix
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(mm.interceptVector.toArray)).
          withValue("thresholds", mm.thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.binaryModel.coefficients.toArray)).
          withValue("intercept", Value.double(obj.binaryModel.intercept)).
          withValue("threshold", Value.double(obj.binaryModel.threshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong
      val lm = if (numClasses > 2) {
        val tensor = model.value("coefficient_matrix").getTensor[Double]
        val cm = Matrices.dense(numRows = tensor.dimensions.head, numCols = tensor.dimensions(1), tensor.toArray)
        ProbabilisticLogisticsRegressionModel(coefficientMatrix = cm,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray))
      } else {
        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
          .map(value => value.getDouble)
          .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
        BinaryLogisticRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble,
          threshold = threshold)
      }
      LogisticRegressionModel(lm)
    }
  }

  override def model(node: LogisticRegression): LogisticRegressionModel = node.model
}
Example 23
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.clustering.GaussianMixtureModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.clustering.GaussianMixture
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

class GaussianMixtureOp extends MleapOp[GaussianMixture, GaussianMixtureModel] {
  override val Model: OpModel[MleapContext, GaussianMixtureModel] = new OpModel[MleapContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip
      model.withValue("means", Value.tensorList(means.map(m => Tensor.denseVector(m.toArray)))).
        withValue("covs", Value.tensorList(covs.map(c => DenseTensor(c.toArray, Seq(c.numRows, c.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map { values =>
        Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray)
      }
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray

      GaussianMixtureModel(gaussians, weights)
    }
  }

  override def model(node: GaussianMixture): GaussianMixtureModel = node.model
}
Example 24
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices}

class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix]
        val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList)
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] {

  override def build(metadata: Metadata, data: LocalData): PCAModel = {
    val constructor = classOf[PCAModel].getDeclaredConstructor(
      classOf[String],
      classOf[DenseMatrix],
      classOf[DenseVector]
    )
    constructor.setAccessible(true)
    val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]]
    val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix]
    data.column("explainedVariance") match {
      case Some(ev) =>
        // NOTE: Spark >= 2
        val evParams = ev.data.head.asInstanceOf[Map[String, Any]]
        val explainedVariance = DataUtils.constructVector(evParams).toDense
        constructor
          .newInstance(metadata.uid, pcMat, explainedVariance)
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      case None =>
        // NOTE: Spark < 2
        constructor
          .newInstance(
            metadata.uid,
            pcMat,
            Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
          )
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    }
  }

  override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer)
}
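OldMatrices.fromML bridges the new ml.linalg matrix to the older mllib type so that transpose.multiply can be applied to mllib vectors, which is what transform does for every input column value. A small sketch of that bridge with made-up numbers:

import org.apache.spark.ml.linalg.{Matrices => NewMatrices}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices, Vectors => OldVectors}

// Convert an ml.linalg principal-components matrix to mllib and project a vector,
// mirroring LocalPCAModel.transform.
val pcNew = NewMatrices.dense(3, 2, Array(1.0, -1.0, 2.0, 0.0, -3.0, 1.0))
val pcOld = OldMatrices.fromML(pcNew).asInstanceOf[OldDenseMatrix]
println(pcOld.transpose.multiply(OldVectors.dense(2.0, 1.0, 0.0))) // [1.0, -3.0]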
Example 25
Source File: MLSerDeSuite.scala From drizzle-spark with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 26
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.