org.apache.spark.ml.linalg.Matrices Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Matrices.
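Most of the examples below use the two factory methods on Matrices: Matrices.dense, which reads its values array in column-major order, and Matrices.sparse, which takes a compressed sparse column (CSC) layout. The short sketch below is not from any of the projects listed; the dimensions and values are illustrative only.

import org.apache.spark.ml.linalg.{Matrices, Matrix}

object MatricesQuickTour extends App {
  // 2 x 3 dense matrix; the values array is read column by column (column-major):
  // [[1.0, 3.0, 5.0],
  //  [2.0, 4.0, 6.0]]
  val dense: Matrix = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))

  // 3 x 2 sparse matrix in CSC form: colPtrs delimits each column's slice of
  // rowIndices/values, so column 0 holds 9.0 at row 0, and column 1 holds
  // 8.0 at row 1 and 7.0 at row 2.
  val sparse: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(9.0, 8.0, 7.0))

  println(dense)
  println(sparse.toDense)
}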
Example 1
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0
package org.apache.spark.ml.parity.classification

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.parity.SparkParityBase
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType}

class MultinomialLogisticRegressionParitySpec extends SparkParityBase {
  val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0)
  val ages = Seq(15, 30, 40, 50, 15, 80)
  val heights = Seq(175, 190, 155, 160, 170, 180)
  val weights = Seq(67, 100, 57, 56, 56, 88)

  val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i =>
    Row(labels(i), ages(i), heights(i), weights(i))
  })
  val schema = new StructType().add("label", DoubleType, nullable = false)
    .add("age", IntegerType, nullable = false)
    .add("height", IntegerType, nullable = false)
    .add("weight", IntegerType, nullable = false)

  override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema)

  override val sparkTransformer: Transformer = new Pipeline().setStages(Array(
    new VectorAssembler().
      setInputCols(Array("age", "height", "weight")).
      setOutputCol("features"),
    new LogisticRegressionModel(uid = "logr",
      coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998,
        0.3129112131192873, -0.21959056436528473, -0.09332064875400257,
        -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)),
      interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703),
      numClasses = 3,
      isMultinomial = true))).fit(dataset)
}
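For reference, Matrices.dense fills the 3 x 3 coefficient matrix above column by column, and Spark's coefficientMatrix is laid out as numClasses x numFeatures, so the first three doubles are the per-class coefficients of "age", the next three of "height", and the last three of "weight". A small sketch (not part of the original test) showing the column-major fill:

import org.apache.spark.ml.linalg.Matrices

// Column j holds values(j * numRows until (j + 1) * numRows).
val m = Matrices.dense(3, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0))
println(m)        // rows: (1.0 4.0 7.0), (2.0 5.0 8.0), (3.0 6.0 9.0)
println(m(0, 1))  // 4.0 -- element at row 0, column 1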
Example 2
Source File: CorrelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  val xData = Array(1.0, 0.0, -2.0)
  val yData = Array(4.0, 5.0, 3.0)
  val zeros = new Array[Double](3)
  val data = Seq(
    Vectors.dense(1.0, 0.0, 0.0, -2.0),
    Vectors.dense(4.0, 5.0, 0.0, 3.0),
    Vectors.dense(6.0, 7.0, 0.0, 8.0),
    Vectors.dense(9.0, 0.0, 0.0, 1.0)
  )

  private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  private def extract(df: DataFrame): BDM[Double] = {
    val Array(Row(mat: Matrix)) = df.collect()
    mat.asBreeze.toDenseMatrix
  }

  test("corr(X) default, pearson") {
    val defaultMat = Correlation.corr(X, "features")
    val pearsonMat = Correlation.corr(X, "features", "pearson")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.00000000, 0.05564149, Double.NaN, 0.4004714),
      (0.05564149, 1.00000000, Double.NaN, 0.9135959),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.40047142, 0.91359586, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4)
    assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4)
  }

  test("corr(X) spearman") {
    val spearmanMat = Correlation.corr(X, "features", "spearman")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.0000000, 0.1054093, Double.NaN, 0.4000000),
      (0.1054093, 1.0000000, Double.NaN, 0.9486833),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.4000000, 0.9486833, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4)
  }
}
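Outside a test suite, the same API can be called on any DataFrame with a Vector column: Correlation.corr returns a one-row DataFrame whose single cell is the correlation Matrix. A minimal sketch, assuming an existing SparkSession named spark:

import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

val df = spark.createDataFrame(Seq(
  Tuple1(Vectors.dense(1.0, 0.0, -2.0)),
  Tuple1(Vectors.dense(4.0, 5.0, 3.0)),
  Tuple1(Vectors.dense(6.0, 7.0, 8.0))
)).toDF("features")

// Pearson by default; pass "spearman" as the third argument for rank correlation.
val Row(corr: Matrix) = Correlation.corr(df, "features").head
println(corr)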
Example 3
Source File: MLSerDeSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.python

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors}

class MLSerDeSuite extends SparkFunSuite {

  MLSerDe.initialize()

  test("pickle vector") {
    val vectors = Seq(
      Vectors.dense(Array.empty[Double]),
      Vectors.dense(0.0),
      Vectors.dense(0.0, -2.0),
      Vectors.sparse(0, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(1, Array.empty[Int], Array.empty[Double]),
      Vectors.sparse(2, Array(1), Array(-2.0)))
    vectors.foreach { v =>
      val u = MLSerDe.loads(MLSerDe.dumps(v))
      assert(u.getClass === v.getClass)
      assert(u === v)
    }
  }

  test("pickle double") {
    for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) {
      val deser = MLSerDe.loads(MLSerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double]
      // We use `equals` here for comparison because we cannot use `==` for NaN
      assert(x.equals(deser))
    }
  }

  test("pickle matrix") {
    val values = Array[Double](0, 1.2, 3, 4.56, 7, 8)
    val matrix = Matrices.dense(2, 3, values)
    val nm = MLSerDe.loads(MLSerDe.dumps(matrix)).asInstanceOf[DenseMatrix]
    assert(matrix === nm)

    // Test conversion for empty matrix
    val empty = Array.empty[Double]
    val emptyMatrix = Matrices.dense(0, 0, empty)
    val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix]
    assert(emptyMatrix == ne)

    val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
    val nsm = MLSerDe.loads(MLSerDe.dumps(sm)).asInstanceOf[SparseMatrix]
    assert(sm.toArray === nsm.toArray)

    val smt = new SparseMatrix(
      3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9),
      isTransposed = true)
    val nsmt = MLSerDe.loads(MLSerDe.dumps(smt)).asInstanceOf[SparseMatrix]
    assert(smt.toArray === nsmt.toArray)
  }
}
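The SparseMatrix constructor exercised in the pickling test takes (numRows, numCols, colPtrs, rowIndices, values) in CSC form, where colPtrs(j) until colPtrs(j + 1) delimits column j's entries. A short sketch (not from the suite) that makes the layout visible:

import org.apache.spark.ml.linalg.SparseMatrix

// colPtrs = [0, 1, 3]: column 0 owns entries 0 until 1, column 1 owns entries 1 until 3,
// so (row 1, col 0) = 0.9, (row 0, col 1) = 1.2, (row 2, col 1) = 3.4.
val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
println(sm.toDense)
// 0.0  1.2
// 0.9  0.0
// 0.0  3.4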
Example 4
Source File: MultivariateGaussianSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import org.apache.spark.ml.SparkMLFunSuite
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.util.TestingUtils._

class MultivariateGaussianSuite extends SparkMLFunSuite {

  test("univariate") {
    val x1 = Vectors.dense(0.0)
    val x2 = Vectors.dense(1.5)

    val mu = Vectors.dense(0.0)
    val sigma1 = Matrices.dense(1, 1, Array(1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5)

    val sigma2 = Matrices.dense(1, 1, Array(4.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5)
  }

  test("multivariate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
    val dist1 = new MultivariateGaussian(mu, sigma1)
    assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5)
    assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5)

    val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0))
    val dist2 = new MultivariateGaussian(mu, sigma2)
    assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5)
    assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5)
  }

  test("multivariate degenerate") {
    val x1 = Vectors.dense(0.0, 0.0)
    val x2 = Vectors.dense(1.0, 1.0)

    val mu = Vectors.dense(0.0, 0.0)
    val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0))
    val dist = new MultivariateGaussian(mu, sigma)
    assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5)
    assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5)
  }

  test("SPARK-11302") {
    val x = Vectors.dense(629, 640, 1.7188, 618.19)
    val mu = Vectors.dense(
      1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697)
    val sigma = Matrices.dense(4, 4, Array(
      166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053,
      169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484,
      12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373,
      164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207))
    val dist = new MultivariateGaussian(mu, sigma)
    // Agrees with R's dmvnorm: 7.154782e-05
    assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9)
  }
}
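The expected values in the univariate test follow directly from the normal density: for N(0, 1), pdf(0) = 1/sqrt(2*pi) ≈ 0.39894, and doubling the standard deviation halves the peak to ≈ 0.19947. A one-line sanity check, independent of the class under test:

// 1 / sqrt(2 * pi) ≈ 0.3989422804014327
println(1.0 / math.sqrt(2.0 * math.Pi))
// N(0, 4) at x = 0: 1 / (2 * sqrt(2 * pi)) ≈ 0.19947
println(1.0 / (2.0 * math.sqrt(2.0 * math.Pi)))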
Example 5
Source File: HasNetlibBlas.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]): Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y: Array[Double]): Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy(x: Array[Double], y: Array[Double]): Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {
  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
}
Example 6
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts

import java.util.Random

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.attribute.AttributeGroup
import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed}
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.{Identifiable, SchemaUtils}
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector}
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.types.{LongType, StructType}

  def setDim(value: Long): this.type = set(dim, value)

  def this() = this(Identifiable.randomUID("randomProjectionsHasher"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val dimensity = {
      if (!isSet(dim)) {
        // If the dimension is not set, look up the AttributeGroup in the metadata, as it comes from OdklCountVectorizer
        val vectorsIndex = dataset.schema.fieldIndex($(inputCol))
        AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size
      } else {
        $(dim).toInt
      }
    }
    // the matrix of random vectors used to construct the hash
    val projectionMatrix = dataset.sqlContext.sparkContext.broadcast(
      Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix])
    val binHashSparseVectorColumn = udf((vector: Vector) => {
      projectionMatrix.value.multiply(vector).values
        .map(f => if (f > 0) 1L else 0L)
        .view.zipWithIndex
        .foldLeft(0L) { case (acc, (v, i)) => acc | (v << i) }
    })
    dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol))))
  }

  override def copy(extra: ParamMap): Transformer = {
    defaultCopy(extra)
  }

  @DeveloperApi
  override def transformSchema(schema: StructType): StructType = {
    SchemaUtils.appendColumn(schema, $(outputCol), LongType)
  }
}
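The hasher relies on Matrices.sprandn(numRows, numCols, density, rng), which builds a sparse matrix whose non-zero entries (roughly a density fraction of all cells) are drawn from the standard normal distribution. A standalone sketch of the same idea; the dimensions, density and seed below are arbitrary:

import java.util.Random
import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vectors}

val projection = Matrices.sprandn(8, 100, 0.1, new Random(42L)).asInstanceOf[SparseMatrix]
println(s"stored entries: ${projection.numActives} of ${8 * 100}")

// Each input vector is projected to 8 values; their signs give an 8-bit hash,
// mirroring the fold in the transformer above.
val projected = projection.multiply(Vectors.dense(Array.fill(100)(1.0)))
val hash = projected.values.map(v => if (v > 0) 1L else 0L)
  .zipWithIndex.foldLeft(0L) { case (acc, (bit, i)) => acc | (bit << i) }
println(hash)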
Example 7
Source File: Generators.scala From frameless with Apache License 2.0
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.{Arbitrary, Gen}

object Generators {

  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    Gen.oneOf(Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
    Gen.const(LossStrategy.SquaredError)
  }

  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }
}
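These Arbitrary instances are meant to be pulled into ScalaCheck properties, so any property over Matrix values can be generated automatically. A hedged usage sketch (the property itself is only illustrative):

import org.apache.spark.ml.linalg.Matrix
import org.scalacheck.Prop.forAll
import frameless.ml.Generators.arbMatrix

val prop = forAll { (m: Matrix) =>
  // toArray is the column-major contents, so its length is rows * cols.
  m.toArray.length == m.numRows * m.numCols
}
prop.check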
Example 8
Source File: MLSerDeSuite.scala From multi-tenancy-spark with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 9
Source File: MultivariateGaussianSuite.scala From multi-tenancy-spark with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.
Example 10
Source File: LRSelectorSuite.scala From spark-FeatureSelection with Apache License 2.0
package org.apache.spark.ml.feature.selection.embedded

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.feature.selection.{FeatureSelectionTestBase, FeatureSelectorTestBase}
import org.apache.spark.ml.linalg.Matrices

class LRSelectorSuite extends FeatureSelectionTestBase {
  // Order of feature importances must be: f4 > f3 > f2 > f1
  private val lrWeights = Matrices.dense(3, 4, Array(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, -0.8, -0.8, -0.8, 0.9, 0.9, 0.9))

  test("Test LRSelector: numTopFeatures") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName).setCoefficientMatrix(lrWeights)
      .setOutputCol("filtered").setSelectorType("numTopFeatures").setNumTopFeatures(2)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: percentile") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("percentile").setPercentile(0.51).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("Test LRSelector: randomCutOff") {
    val selector = new LRSelector().setFeaturesCol(featuresColName).setLabelCol(labelColName)
      .setOutputCol("filtered").setSelectorType("randomCutOff").setRandomCutOff(1.0).setCoefficientMatrix(lrWeights)

    val importantColNames = Array("pWidth", "pLength", "sWidth", "sLength")
    val df = new VectorAssembler().setInputCols(importantColNames).setOutputCol("ImportantFeatures").transform(dataset)

    FeatureSelectorTestBase.testSelector[LRSelector, LRSelectorModel](selector, df, importantColNames, "ImportantFeatures")
  }

  test("LRSelector read/write") {
    val nb = new LRSelector
    testEstimatorAndModelReadWrite[LRSelector, LRSelectorModel](nb, dataset,
      FeatureSelectorTestBase.allParamSettings.+("coefficientMatrix" -> lrWeights),
      FeatureSelectorTestBase.checkModelData)
  }
}
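The 3 x 4 lrWeights matrix is again filled column by column, so each column holds the three per-class weights of one feature: f1 -> (0.1, 0.1, 0.1), f2 -> (0.2, ...), f3 -> (-0.8, ...), f4 -> (0.9, ...). Ranking the columns by absolute magnitude gives exactly the f4 > f3 > f2 > f1 order the comment requires. A quick way to print the per-feature columns (illustrative only):

import org.apache.spark.ml.linalg.Matrices

val lrWeights = Matrices.dense(3, 4, Array(0.1, 0.1, 0.1, 0.2, 0.2, 0.2, -0.8, -0.8, -0.8, 0.9, 0.9, 0.9))

// colIter walks the matrix column by column, i.e. feature by feature.
lrWeights.colIter.zipWithIndex.foreach { case (col, j) =>
  println(s"feature f${j + 1}: ${col.toArray.mkString(", ")}")
}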
Example 11
Source File: MLSerDeSuite.scala From sparkoscope with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 12
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.
Example 13
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix
import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

object SparkMatrix {

  def main(args: Array[String]) {
    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)

    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))

    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0),
      (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3)
      .map { case (i, j, value) => MatrixEntry(i, j, value) }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()
  }
}
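For the two sparse matrices above, the CSC arguments decode as follows: sMatrixOne stores 5 at (0,0), 7 at (1,1) and 6 at (2,1), while sMatrixTwo stores 5 at (0,0), 6 at (1,1) and 7 at (2,1). A short check, not part of the original program:

import org.apache.spark.ml.linalg.{Matrices, SparseMatrix}

val sOne = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7)).asInstanceOf[SparseMatrix]
val sTwo = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7)).asInstanceOf[SparseMatrix]

println(sOne.toDense) // rows: (5.0 0.0), (0.0 7.0), (0.0 6.0)
println(sTwo.toDense) // rows: (5.0 0.0), (0.0 6.0), (0.0 7.0)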
Example 14
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
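The pair returned above is the root pseudo-inverse used in the quadratic form plus the log of the normalising constant, and it is what makes the "multivariate degenerate" expectation in the MultivariateGaussianSuite above come out to 0.11254: for sigma = [[1, 1], [1, 1]] the eigenvalues are (0, 2), the pseudo-determinant is 2, and at x = mu the quadratic form vanishes. A short numeric check:

// pdf(mu) for the rank-deficient sigma [[1, 1], [1, 1]]:
// exp(-0.5 * (k * ln(2 * pi) + ln(pseudo-det))) with k = 2 and pseudo-det = 2.
println(math.exp(-0.5 * (2 * math.log(2 * math.Pi) + math.log(2.0)))) // ≈ 0.11254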
Example 15
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class NaiveBayesClassifierOp extends SimpleSparkOp[NaiveBayesModel] {
  override val Model: OpModel[SparkBundleContext, NaiveBayesModel] = new OpModel[SparkBundleContext, NaiveBayesModel] {
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val thresholds = if (obj.isSet(obj.thresholds)) {
        Some(obj.getThresholds)
      } else None
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.getModelType)).
        withValue("thresholds", thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val nb = new NaiveBayesModel(uid = "",
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray))
      val modelType = model.value("model_type").getString
      model.getValue("thresholds").map(t => nb.setThresholds(t.getDoubleList.toArray))
      nb.set(nb.modelType, modelType)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: NaiveBayesModel): NaiveBayesModel = {
    val r = new NaiveBayesModel(uid = uid, pi = model.pi, theta = model.theta)
    if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) }
    if (model.isDefined(model.modelType)) { r.set(r.modelType, model.getModelType) }
    r
  }

  override def sparkInputs(obj: NaiveBayesModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: NaiveBayesModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
Example 16
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] = new OpModel[SparkBundleContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if (obj.numClasses > 2) {
        val cm = obj.coefficientMatrix
        val thresholds = if (obj.isSet(obj.thresholds)) {
          Some(obj.getThresholds)
        } else None
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)).
          withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.coefficients.toArray)).
          withValue("intercept", Value.double(obj.intercept)).
          withValue("threshold", Value.double(obj.getThreshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong
      val r = if (numClasses > 2) {
        val cmTensor = model.value("coefficient_matrix").getTensor[Double]
        val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray)
        val lr = new LogisticRegressionModel(uid = "",
          coefficientMatrix = coefficientMatrix,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          numClasses = numClasses.toInt,
          isMultinomial = true)
        model.getValue("thresholds").
          map(t => lr.setThresholds(t.getDoubleList.toArray)).
          getOrElse(lr)
      } else {
        val lr = new LogisticRegressionModel(uid = "",
          coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble)
        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
          .map(value => value.getDouble)
          .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
        lr.setThreshold(threshold)
      }
      r
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = {
    val numClasses = model.numClasses
    val r = if (numClasses > 2) {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = true)
      if (model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) }
      lr
    } else {
      val lr = new LogisticRegressionModel(uid = uid,
        coefficientMatrix = model.coefficientMatrix,
        interceptVector = model.interceptVector,
        numClasses = numClasses,
        isMultinomial = false)
      if (model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) }
      lr
    }
    r
  }

  override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = {
    Seq("raw_prediction" -> obj.rawPredictionCol,
      "probability" -> obj.probabilityCol,
      "prediction" -> obj.predictionCol)
  }
}
Example 17
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.GaussianMixtureModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

class GaussianMixtureOp extends SimpleSparkOp[GaussianMixtureModel] {
  override val Model: OpModel[SparkBundleContext, GaussianMixtureModel] = new OpModel[SparkBundleContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      val (rows, cols) = obj.gaussians.headOption.
        map(g => (g.cov.numRows, g.cov.numCols)).
        getOrElse((-1, -1))
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip
      model.withValue("means", Value.tensorList(means.map(_.toArray).map(Tensor.denseVector))).
        withValue("covs", Value.tensorList(covs.map(m => DenseTensor(m.toArray, Seq(m.numRows, m.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map(values =>
        Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray))
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray

      new GaussianMixtureModel(uid = "", gaussians = gaussians, weights = weights)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: GaussianMixtureModel): GaussianMixtureModel = {
    new GaussianMixtureModel(uid = uid, weights = model.weights, gaussians = model.gaussians)
  }

  override def sparkInputs(obj: GaussianMixtureModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: GaussianMixtureModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol, "probability" -> obj.probabilityCol)
  }
}
Example 18
Source File: LogisticRegressionModelSpec.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.scalatest.FunSpec

class LogisticRegressionModelSpec extends FunSpec {

  describe("BinaryLogisticRegression") {
    val weights = Vectors.dense(1.0, 2.0, 4.0)
    val intercept = 0.7

    describe("issue210: Logistic function not being applied") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("applies the logistic function for prediction") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Wrong Binary LogisticRegression predictions") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("compare binary logisticRegression prediction with the transform api predictions") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 1.0)
      }

      it("compare binary logisticRegression prediction with rawToPrediction() results") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 1.0 threshold") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 1.0)

      it("binary logisticRegression prediction equals zero for 1.0 threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features))))
        assert(lr.predict(features) == 0.0)
      }
    }

    describe("issue386:Binary LogisticRegression predictions with 0.0 threshold") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.0)

      it("binary logisticRegression prediction equals 1 for zero threshold") {
        val features = Vectors.dense(-1.0, 1.0, -0.5)
        assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features)))
        assert(lr.predict(features) == 1.0)
      }
    }

    describe("input/output schema") {
      val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4)

      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(2)),
          StructField("probability", TensorType.Double(2)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }

  describe("ProbabilisticLogisticsRegressionModel") {
    val weights = Matrices.dense(3, 3, Array(1, 2, 3, 1, 2, 3, 1, 2, 3))
    val intercept = Vectors.dense(1, 2, 3)
    val lr = ProbabilisticLogisticsRegressionModel(weights, intercept, None)

    describe("input/output schema") {
      it("has the right input schema") {
        assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3))))
      }

      it("has the right output schema") {
        assert(lr.outputSchema.fields == Seq(
          StructField("raw_prediction", TensorType.Double(3)),
          StructField("probability", TensorType.Double(3)),
          StructField("prediction", ScalarType.Double.nonNullable)
        ))
      }
    }
  }
}
Example 19
Source File: PcaModelSpec.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.types.{StructField, TensorType}
import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vectors}
import org.scalatest.FunSpec

class PcaModelSpec extends FunSpec {

  describe("pca model") {
    val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1))
    val pca = PcaModel(pc)

    it("uses the principal components matrix to transform a vector to a lower-dimensional vector") {
      val input = Vectors.dense(Array[Double](2, 1, 0))
      assert(pca(input).toArray sameElements Array[Double](1, -3))
    }

    it("has the right input schema") {
      assert(pca.inputSchema.fields == Seq(StructField("input", TensorType.Double())))
    }

    it("has the right output schema") {
      assert(pca.outputSchema.fields == Seq(StructField("output", TensorType.Double())))
    }
  }
}
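The expected output (1, -3) can be reproduced by hand: the 3 x 2 principal-components matrix is filled column-major, so its columns are (1, -1, 2) and (0, -3, 1), and the projection of x = (2, 1, 0) is pc.transpose multiplied by x. A quick check with the matrix API (this assumes PcaModel applies the usual transposed-projection):

import org.apache.spark.ml.linalg.{DenseMatrix, Vectors}

val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1))
// pc.transpose is 2 x 3; multiplying by the input vector gives the reduced vector.
println(pc.transpose.multiply(Vectors.dense(2.0, 1.0, 0.0))) // [1.0, -3.0]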
Example 20
Source File: VectorConverters.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions

trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
        values = matrix.values,
        dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1),
        cols.toArray, rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector: BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }

  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}

object VectorConverters extends VectorConverters
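When explicit (row, column) pairs are needed from a SparseMatrix, the CSC arrays expand as sketched below: column j owns the entries between colPtrs(j) and colPtrs(j + 1). This is only a reference sketch for reading the converters above; the tensor index layouts themselves are defined by mleap.

import org.apache.spark.ml.linalg.SparseMatrix

// Expand CSC storage into (row, col, value) triples.
def cscEntries(m: SparseMatrix): Seq[(Int, Int, Double)] =
  (0 until m.numCols).flatMap { j =>
    (m.colPtrs(j) until m.colPtrs(j + 1)).map { idx =>
      (m.rowIndices(idx), j, m.values(idx))
    }
  }

val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4))
println(cscEntries(sm)) // Vector((1,0,0.9), (0,1,1.2), (2,1,3.4))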
Example 21
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl.Model
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.runtime.transformer.classification.NaiveBayesClassifier
import ml.combust.mleap.core.classification.NaiveBayesModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class NaiveBayesClassifierOp extends MleapOp[NaiveBayesClassifier, NaiveBayesModel] {
  override val Model: OpModel[MleapContext, NaiveBayesModel] = new OpModel[MleapContext, NaiveBayesModel] {
    override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel]

    override def opName: String = Bundle.BuiltinOps.classification.naive_bayes

    override def store(model: Model, obj: NaiveBayesModel)(implicit context: BundleContext[MleapContext]): Model = {
      model.withValue("num_features", Value.long(obj.numFeatures)).
        withValue("num_classes", Value.long(obj.numClasses)).
        withValue("pi", Value.vector(obj.pi.toArray)).
        withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))).
        withValue("model_type", Value.string(obj.modelType.toString)).
        withValue("thresholds", obj.thresholds.map(Value.doubleList(_)))
    }

    override def load(model: Model)(implicit context: BundleContext[MleapContext]): NaiveBayesModel = {
      val theta = model.value("theta").getTensor[Double]
      val modelType = NaiveBayesModel.forName(model.value("model_type").getString)
      val numClasses = model.value("num_classes").getLong.toInt
      val thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)
      require(thresholds.isEmpty || thresholds.get.length == numClasses,
        "NaiveBayesModel loaded with non-matching numClasses and thresholds.length. " +
          s" numClasses=$numClasses, but thresholds has length ${thresholds.get.length}")
      new NaiveBayesModel(numFeatures = model.value("num_features").getLong.toInt,
        numClasses = numClasses,
        pi = Vectors.dense(model.value("pi").getTensor[Double].toArray),
        theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray),
        modelType = modelType,
        thresholds = thresholds)
    }
  }

  override def model(node: NaiveBayesClassifier): NaiveBayesModel = node.model
}
Example 22
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.classification

import ml.combust.bundle.BundleContext
import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel, ProbabilisticLogisticsRegressionModel}
import ml.combust.mleap.runtime.transformer.classification.LogisticRegression
import ml.combust.bundle.op.OpModel
import ml.combust.bundle.dsl._
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.linalg.{Matrices, Vectors}

class LogisticRegressionOp extends MleapOp[LogisticRegression, LogisticRegressionModel] {

  private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5

  override val Model: OpModel[MleapContext, LogisticRegressionModel] = new OpModel[MleapContext, LogisticRegressionModel] {
    override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel]

    override def opName: String = Bundle.BuiltinOps.classification.logistic_regression

    override def store(model: Model, obj: LogisticRegressionModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val m = model.withValue("num_classes", Value.long(obj.numClasses))
      if (obj.isMultinomial) {
        val mm = obj.multinomialModel
        val cm = mm.coefficientMatrix
        m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))).
          withValue("intercept_vector", Value.vector(mm.interceptVector.toArray)).
          withValue("thresholds", mm.thresholds.map(_.toSeq).map(Value.doubleList))
      } else {
        m.withValue("coefficients", Value.vector(obj.binaryModel.coefficients.toArray)).
          withValue("intercept", Value.double(obj.binaryModel.intercept)).
          withValue("threshold", Value.double(obj.binaryModel.threshold))
      }
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): LogisticRegressionModel = {
      val numClasses = model.value("num_classes").getLong
      val lm = if (numClasses > 2) {
        val tensor = model.value("coefficient_matrix").getTensor[Double]
        val cm = Matrices.dense(numRows = tensor.dimensions.head, numCols = tensor.dimensions(1), tensor.toArray)
        ProbabilisticLogisticsRegressionModel(coefficientMatrix = cm,
          interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray),
          thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray))
      } else {
        // default threshold is 0.5 for both Spark and Scikit-learn
        val threshold = model.getValue("threshold")
          .map(value => value.getDouble)
          .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD)
        BinaryLogisticRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray),
          intercept = model.value("intercept").getDouble,
          threshold = threshold)
      }
      LogisticRegressionModel(lm)
    }
  }

  override def model(node: LogisticRegression): LogisticRegressionModel = node.model
}
Example 23
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0
package ml.combust.mleap.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.OpModel
import ml.combust.mleap.bundle.ops.MleapOp
import ml.combust.mleap.core.clustering.GaussianMixtureModel
import ml.combust.mleap.runtime.MleapContext
import ml.combust.mleap.runtime.transformer.clustering.GaussianMixture
import ml.combust.mleap.tensor.{DenseTensor, Tensor}
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

class GaussianMixtureOp extends MleapOp[GaussianMixture, GaussianMixtureModel] {
  override val Model: OpModel[MleapContext, GaussianMixtureModel] = new OpModel[MleapContext, GaussianMixtureModel] {
    override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel]

    override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture

    override def store(model: Model, obj: GaussianMixtureModel)
                      (implicit context: BundleContext[MleapContext]): Model = {
      val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip
      model.withValue("means", Value.tensorList(means.map(m => Tensor.denseVector(m.toArray)))).
        withValue("covs", Value.tensorList(covs.map(c => DenseTensor(c.toArray, Seq(c.numRows, c.numCols))))).
        withValue("weights", Value.doubleList(obj.weights.toSeq))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[MleapContext]): GaussianMixtureModel = {
      val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray))
      val covs = model.value("covs").getTensorList[Double].map { values =>
        Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray)
      }
      val gaussians = means.zip(covs).map {
        case (mean, cov) => new MultivariateGaussian(mean, cov)
      }.toArray
      val weights = model.value("weights").getDoubleList.toArray

      GaussianMixtureModel(gaussians, weights)
    }
  }

  override def model(node: GaussianMixture): GaussianMixtureModel = node.model
}
Example 24
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices}

class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] {

  override def transform(localData: LocalData): LocalData = {
    localData.column(sparkTransformer.getInputCol) match {
      case Some(column) =>
        val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix]
        val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList)
        localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData))
      case None => localData
    }
  }
}

object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] {

  override def build(metadata: Metadata, data: LocalData): PCAModel = {
    val constructor = classOf[PCAModel].getDeclaredConstructor(
      classOf[String],
      classOf[DenseMatrix],
      classOf[DenseVector]
    )
    constructor.setAccessible(true)
    val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]]
    val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix]
    data.column("explainedVariance") match {
      case Some(ev) =>
        // NOTE: Spark >= 2
        val evParams = ev.data.head.asInstanceOf[Map[String, Any]]
        val explainedVariance = DataUtils.constructVector(evParams).toDense
        constructor
          .newInstance(metadata.uid, pcMat, explainedVariance)
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
      case None =>
        // NOTE: Spark < 2
        constructor
          .newInstance(
            metadata.uid,
            pcMat,
            Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector]
          )
          .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String])
          .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String])
    }
  }

  override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer)
}
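OldMatrices.fromML bridges the new ml.linalg matrix to the older mllib type so that transpose.multiply can be applied to mllib vectors, which is what transform does for every input column value. A small sketch of that bridge with made-up numbers:

import org.apache.spark.ml.linalg.{Matrices => NewMatrices}
import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices, Vectors => OldVectors}

// Convert an ml.linalg principal-components matrix to mllib and project a vector,
// mirroring LocalPCAModel.transform.
val pcNew = NewMatrices.dense(3, 2, Array(1.0, -1.0, 2.0, 0.0, -3.0, 1.0))
val pcOld = OldMatrices.fromML(pcNew).asInstanceOf[OldDenseMatrix]
println(pcOld.transpose.multiply(OldVectors.dense(2.0, 1.0, 0.0))) // [1.0, -3.0]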
Example 25
Source File: MLSerDeSuite.scala From drizzle-spark with Apache License 2.0
The source is identical to Example 3 above, so the code is not repeated here.
Example 26
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0
The source is identical to Example 4 above, so the code is not repeated here.