org.apache.spark.ml.linalg.Matrix Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Matrix.
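As quick orientation before the examples: a Matrix is normally obtained through the Matrices factory object rather than constructed directly. A minimal sketch of the two factory methods (values chosen arbitrarily):

import org.apache.spark.ml.linalg.{Matrices, Matrix}

// Dense 2x3 matrix; values are given in column-major order.
val dense: Matrix = Matrices.dense(2, 3, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))

// Sparse 3x2 matrix in CSC form: column pointers, row indices, values.
val sparse: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(9.0, 8.0, 7.0))

println(dense.numRows + " x " + dense.numCols)
println(sparse.toDense)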
Example 1
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

// Excerpt: only calculateCovarianceConstants is shown; the enclosing class
// declaration is restored from the Spark sources so the braces balance.
class MultivariateGaussian(val mean: Vector, val cov: Matrix) extends Serializable {

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
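The constants computed above feed the density evaluation methods of the class. A minimal usage sketch, assuming only the public pdf/logpdf API (the mean and covariance values are invented):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

// 2-dimensional Gaussian with unit variances and 0.3 covariance (column-major values).
val gaussian = new MultivariateGaussian(
  Vectors.dense(0.0, 0.0),
  Matrices.dense(2, 2, Array(1.0, 0.3, 0.3, 1.0)))

// Both calls rely on the root-pseudo-inverse prepared in calculateCovarianceConstants.
println(gaussian.pdf(Vectors.dense(0.5, -0.5)))
println(gaussian.logpdf(Vectors.dense(0.5, -0.5)))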
Example 2
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, Vector}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
}
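The essential trick here is reflective access to a constructor that Spark keeps private. A stripped-down sketch of just that step, with invented coefficient values (per the current Spark sources the fourth argument is numClasses, which this serializer stores under the "numFeatures" column):

import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

val ctor = classOf[LogisticRegressionModel].getDeclaredConstructor(
  classOf[String], classOf[Matrix], classOf[Vector], classOf[Int], java.lang.Boolean.TYPE)
ctor.setAccessible(true)

// Binomial model: one coefficient row over two features, one intercept.
val model = ctor.newInstance(
  "logreg-local",
  Matrices.dense(1, 2, Array(0.5, -0.25)),
  Vectors.dense(0.1),
  Int.box(2),                  // numClasses
  java.lang.Boolean.FALSE)     // isMultinomial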
Example 3
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import java.lang.Boolean

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.LogisticRegressionModel
import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors}

class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel)
  extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {}

object LocalLogisticRegressionModel
  extends SimpleModelLoader[LogisticRegressionModel]
  with TypedTransformerConverter[LogisticRegressionModel] {

  override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = {
    val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor(
      classOf[String],
      classOf[Matrix],
      classOf[Vector],
      classOf[Int],
      java.lang.Boolean.TYPE
    )
    constructor.setAccessible(true)
    val coefficientMatrixParams =
      data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]]
    val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams)
    val interceptVectorParams =
      data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]]
    val interceptVector = DataUtils.constructVector(interceptVectorParams)
    constructor
      .newInstance(
        metadata.uid,
        coefficientMatrix,
        interceptVector,
        data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer],
        data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean]
      )
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])
      .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double])
  }

  override implicit def toLocal(
    transformer: LogisticRegressionModel
  ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer)
}
Example 4
Source File: LocalNaiveBayes.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.classification

import io.hydrosphere.spark_ml_serving.TypedTransformerConverter
import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel
import io.hydrosphere.spark_ml_serving.common._
import io.hydrosphere.spark_ml_serving.common.utils.DataUtils
import org.apache.spark.ml.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors}

class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel)
  extends LocalProbabilisticClassificationModel[NaiveBayesModel] {}

object LocalNaiveBayes
  extends SimpleModelLoader[NaiveBayesModel]
  with TypedTransformerConverter[NaiveBayesModel] {

  override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = {
    val constructor = classOf[NaiveBayesModel].getDeclaredConstructor(
      classOf[String],
      classOf[Vector],
      classOf[Matrix]
    )
    constructor.setAccessible(true)
    val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]]
    val matrix = DataUtils.constructMatrix(matrixMetadata)
    val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]]
    val piVec = DataUtils.constructVector(piParams)
    val nb = constructor
      .newInstance(metadata.uid, piVec, matrix)
      .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String])
      .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String])
      .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String])
      .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String])

    nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue())
    nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String])
    nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String])
    nb
  }

  override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = {
    new LocalNaiveBayes(sparkTransformer)
  }
}
Example 5
Source File: GenericTestSpec.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving

import io.hydrosphere.spark_ml_serving.common.LocalData
import org.apache.spark.SparkConf
import org.apache.spark.ml.linalg.{Matrix, Vector}
import org.apache.spark.mllib.linalg.{Matrix => OldMatrix, Vector => OldVector}
import org.apache.spark.ml.{Pipeline, PipelineStage}
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.scalatest.{BeforeAndAfterAll, FunSpec}

trait GenericTestSpec extends FunSpec with BeforeAndAfterAll {
  val conf = new SparkConf()
    .setMaster("local[2]")
    .setAppName("test")
    .set("spark.ui.enabled", "false")

  val session: SparkSession = SparkSession.builder().config(conf).getOrCreate()

  def modelPath(modelName: String): String =
    s"./target/test_models/${session.version}/$modelName"

  def test(
    name: String,
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ) = {
    val path = modelPath(name.toLowerCase())
    var validation = LocalData.empty
    var localPipelineModel = Option.empty[LocalPipelineModel]

    it("should train") {
      val pipeline = new Pipeline().setStages(steps.toArray)
      val pipelineModel = pipeline.fit(data)
      validation = LocalData.fromDataFrame(pipelineModel.transform(data))
      pipelineModel.write.overwrite().save(path)
    }

    it("should load local version") {
      localPipelineModel = Some(LocalPipelineModel.load(path))
      assert(localPipelineModel.isDefined)
    }

    it("should transform LocalData") {
      val localData = LocalData.fromDataFrame(data)
      val model = localPipelineModel.get
      val result = model.transform(localData)
      columns.foreach { col =>
        val resCol = result
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Result column is absent"))
        val valCol = validation
          .column(col)
          .getOrElse(throw new IllegalArgumentException("Validation column is absent"))
        resCol.data.zip(valCol.data).foreach {
          // Fixed guard: the original checked r.head twice instead of r.head and v.head.
          case (r: Seq[Number @unchecked], v: Seq[Number @unchecked])
              if r.head.isInstanceOf[Number] && v.head.isInstanceOf[Number] =>
            r.zip(v).foreach {
              case (ri, vi) =>
                assert(ri.doubleValue() - vi.doubleValue() <= accuracy, s"$ri - $vi > $accuracy")
            }
          case (r: Number, v: Number) =>
            assert(r.doubleValue() - v.doubleValue() <= accuracy, s"$r - $v > $accuracy")
          case (r, n) =>
            assert(r === n)
        }

        result.column(col).foreach { resData =>
          resData.data.foreach { resRow =>
            if (resRow.isInstanceOf[Seq[_]]) {
              assert(resRow.isInstanceOf[List[_]], resRow)
            } else if (resRow.isInstanceOf[Vector] || resRow.isInstanceOf[OldVector] ||
                       resRow.isInstanceOf[Matrix] || resRow.isInstanceOf[OldMatrix]) {
              assert(false, s"SparkML type detected. Column: $col, value: $resRow")
            }
          }
        }
      }
    }
  }

  def modelTest(
    data: => DataFrame,
    steps: => Seq[PipelineStage],
    columns: => Seq[String],
    accuracy: Double = 0.01
  ): Unit = {
    lazy val name = steps.map(_.getClass.getSimpleName).foldLeft("") {
      case ("", b) => b
      case (a, b)  => a + "-" + b
    }

    describe(name) {
      test(name, data, steps, columns, accuracy)
    }
  }
}
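A hedged sketch of a concrete spec built on this harness; the StringIndexer stage and column names are invented for illustration:

import io.hydrosphere.spark_ml_serving.GenericTestSpec
import org.apache.spark.ml.feature.StringIndexer

class StringIndexerSpec extends GenericTestSpec {
  modelTest(
    data = session.createDataFrame(Seq((0, "a"), (1, "b"), (2, "a"))).toDF("id", "label"),
    steps = Seq(new StringIndexer().setInputCol("label").setOutputCol("labelIndex")),
    columns = Seq("labelIndex")
  )
}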
Example 6
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License
package com.packt.ScalaML.ChrunPrediction

import org.apache.spark._
import org.apache.spark.sql.{Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel}
import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics

object Describe {
  case class CustomerAccount(
    state_code: String, account_length: Integer, area_code: String,
    international_plan: String, voice_mail_plan: String, num_voice_mail: Double,
    total_day_mins: Double, total_day_calls: Double, total_day_charge: Double,
    total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double,
    total_night_mins: Double, total_night_calls: Double, total_night_charge: Double,
    total_international_mins: Double, total_international_calls: Double,
    total_international_charge: Double, total_international_num_calls: Double,
    churn: String)

  val schema = StructType(Array(
    StructField("state_code", StringType, true),
    StructField("account_length", IntegerType, true),
    StructField("area_code", StringType, true),
    StructField("international_plan", StringType, true),
    StructField("voice_mail_plan", StringType, true),
    StructField("num_voice_mail", DoubleType, true),
    StructField("total_day_mins", DoubleType, true),
    StructField("total_day_calls", DoubleType, true),
    StructField("total_day_charge", DoubleType, true),
    StructField("total_evening_mins", DoubleType, true),
    StructField("total_evening_calls", DoubleType, true),
    StructField("total_evening_charge", DoubleType, true),
    StructField("total_night_mins", DoubleType, true),
    StructField("total_night_calls", DoubleType, true),
    StructField("total_night_charge", DoubleType, true),
    StructField("total_international_mins", DoubleType, true),
    StructField("total_international_calls", DoubleType, true),
    StructField("total_international_charge", DoubleType, true),
    StructField("total_international_num_calls", DoubleType, true),
    StructField("churn", StringType, true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .master("local[*]")
      .config("spark.sql.warehouse.dir", "E:/Exp/")
      .appName("Describe")
      .getOrCreate()

    spark.conf.set("spark.debug.maxToStringFields", 10000)
    val DEFAULT_MAX_TO_STRING_FIELDS = 2500
    if (SparkEnv.get != null) {
      SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS)
    } else {
      DEFAULT_MAX_TO_STRING_FIELDS
    }
    import spark.implicits._

    val trainSet: Dataset[CustomerAccount] = spark.read
      .option("inferSchema", "false")
      .format("com.databricks.spark.csv")
      .schema(schema)
      .load("data/churn-bigml-80.csv")
      .as[CustomerAccount]

    val statsDF = trainSet.describe()
    statsDF.show()

    trainSet.createOrReplaceTempView("UserAccount")
    spark.catalog.cacheTable("UserAccount")

    spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show()
    spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show()
    trainSet.groupBy("churn").count.show()
    spark.sqlContext.sql("SELECT churn, SUM(total_international_num_calls) FROM UserAccount GROUP BY churn")
  }
}
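The Correlation and Matrix imports above suggest a correlation step that this excerpt does not reach. A hedged sketch of the usual pattern, continuing from trainSet inside main, with the assembled columns chosen arbitrarily:

import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Matrix
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row

val assembler = new VectorAssembler()
  .setInputCols(Array("total_day_mins", "total_evening_mins", "total_night_mins"))
  .setOutputCol("features")
val assembled = assembler.transform(trainSet)

// Correlation.corr returns a one-row DataFrame holding the matrix.
val Row(corr: Matrix) = Correlation.corr(assembled, "features").head
println(s"Pearson correlation matrix:\n $corr")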
Example 7
Source File: NaiveBayesModel.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial}
import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices}
import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector}

@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala")
case class NaiveBayesModel(numFeatures: Int,
                           numClasses: Int,
                           pi: Vector,
                           theta: Matrix,
                           modelType: NaiveBayesModel.ModelType,
                           override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with Model {

  private def multinomialCalculation(raw: Vector) = {
    val prob = theta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    prob
  }

  private def bernoulliCalculation(raw: Vector) = {
    val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value)))
    val ones = new DenseVector(Array.fill(theta.numCols) { 1.0 })
    val thetaMinusNegTheta = Matrices.map(theta, value => value - math.log(1.0 - math.exp(value)))
    val negThetaSum = negTheta.multiply(ones)
    raw.foreachActive((_, value) =>
      require(value == 0.0 || value == 1.0,
        s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.")
    )
    val prob = thetaMinusNegTheta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    BLAS.axpy(1.0, negThetaSum, prob)
    prob
  }

  override def predictRaw(raw: Vector): Vector = {
    modelType match {
      case Multinomial => multinomialCalculation(raw)
      case Bernoulli => bernoulliCalculation(raw)
    }
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        var i = 0
        val size = dv.size
        val maxLog = dv.values.max
        while (i < size) {
          dv.values(i) = math.exp(dv.values(i) - maxLog)
          i += 1
        }
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
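Because the case class and predictRaw are shown in full, a small usage sketch is possible; the parameters below are toy values in log space, as the model expects:

import ml.combust.mleap.core.classification.NaiveBayesModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}

// Toy multinomial model: 2 classes, 3 features.
val model = NaiveBayesModel(
  numFeatures = 3,
  numClasses = 2,
  pi = Vectors.dense(math.log(0.6), math.log(0.4)),
  theta = Matrices.dense(2, 3, Array(0.2, 0.4, 0.3, 0.3, 0.5, 0.3).map(math.log)),
  modelType = NaiveBayesModel.Multinomial)

// theta.multiply(raw) + pi, exactly as multinomialCalculation does.
val raw = model.predictRaw(Vectors.dense(1.0, 0.0, 2.0))
println(raw)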
Example 8
Source File: VectorConverters.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions

trait VectorConverters {
  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector =>
      DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector =>
      SparseTensor(
        indices = vector.indices.map(i => Seq(i)),
        values = vector.values,
        dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(
        tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(
        indices = indices,
        values = matrix.values,
        dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(
        tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(
        tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector: BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }

  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}

object VectorConverters extends VectorConverters
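A round-trip sketch using these implicits; it assumes mleap-tensor is on the classpath and relies on both Spark matrices and dense tensors keeping values in column-major order:

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Matrices, Matrix}

val m: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
val t: Tensor[Double] = m      // sparkMatrixToMleapTensor
val back: Matrix = t           // mleapTensorToSparkMatrix
assert(back.toArray.sameElements(m.toArray))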
Example 9
Source File: SparkMatrix.scala From Machine-Learning-with-Spark-Second-Edition with MIT License
package linalg.matrix

import org.apache.spark.ml.linalg.{Matrices, Matrix}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, IndexedRow, IndexedRowMatrix, MatrixEntry, RowMatrix}
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object SparkMatrix {

  def main(args: Array[String]) {
    val dMatrix: Matrix = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    println("dMatrix: \n" + dMatrix)

    val sMatrixOne: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5, 6, 7))
    println("sMatrixOne: \n" + sMatrixOne)

    val sMatrixTwo: Matrix = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 1, 2), Array(5, 6, 7))
    println("sMatrixTwo: \n" + sMatrixTwo)

    val spConfig = (new SparkConf).setMaster("local").setAppName("SparkApp")
    val sc = new SparkContext(spConfig)
    val denseData = Seq(
      Vectors.dense(0.0, 1.0, 2.1),
      Vectors.dense(3.0, 2.0, 4.0),
      Vectors.dense(5.0, 7.0, 8.0),
      Vectors.dense(9.0, 0.0, 1.1)
    )
    val sparseData = Seq(
      Vectors.sparse(3, Seq((1, 1.0), (2, 2.1))),
      Vectors.sparse(3, Seq((0, 3.0), (1, 2.0), (2, 4.0))),
      Vectors.sparse(3, Seq((0, 5.0), (1, 7.0), (2, 8.0))),
      Vectors.sparse(3, Seq((0, 9.0), (2, 1.0)))
    )

    val denseMat = new RowMatrix(sc.parallelize(denseData, 2))
    val sparseMat = new RowMatrix(sc.parallelize(sparseData, 2))
    println("Dense Matrix - Num of Rows :" + denseMat.numRows())
    println("Dense Matrix - Num of Cols:" + denseMat.numCols())
    println("Sparse Matrix - Num of Rows :" + sparseMat.numRows())
    println("Sparse Matrix - Num of Cols:" + sparseMat.numCols())

    val data = Seq(
      (0L, Vectors.dense(0.0, 1.0, 2.0)),
      (1L, Vectors.dense(3.0, 4.0, 5.0)),
      (3L, Vectors.dense(9.0, 0.0, 1.0))
    ).map(x => IndexedRow(x._1, x._2))
    val indexedRows: RDD[IndexedRow] = sc.parallelize(data, 2)
    val indexedRowsMat = new IndexedRowMatrix(indexedRows)
    println("Indexed Row Matrix - No of Rows: " + indexedRowsMat.numRows())
    println("Indexed Row Matrix - No of Cols: " + indexedRowsMat.numCols())

    val entries = sc.parallelize(Seq(
      (0, 0, 1.0), (0, 1, 2.0), (1, 1, 3.0), (1, 2, 4.0), (2, 2, 5.0),
      (2, 3, 6.0), (3, 0, 7.0), (3, 3, 8.0), (4, 1, 9.0)), 3).map {
      case (i, j, value) => MatrixEntry(i, j, value)
    }
    val coordinateMat = new CoordinateMatrix(entries)
    println("Coordinate Matrix - No of Rows: " + coordinateMat.numRows())
    println("Coordinate Matrix - No of Cols: " + coordinateMat.numCols())

    sc.stop()
  }
}
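One detail worth spelling out from the sparse constructors above: Matrices.sparse takes CSC (compressed sparse column) arrays — column pointers, then row indices, then values. A small check for sMatrixOne:

import org.apache.spark.ml.linalg.Matrices

// colPtrs Array(0, 1, 3): column 0 owns value index 0, column 1 owns indices 1 and 2.
// rowIndices Array(0, 2, 1) therefore place 5.0 at (0,0), 6.0 at (2,1) and 7.0 at (1,1).
val s = Matrices.sparse(3, 2, Array(0, 1, 3), Array(0, 2, 1), Array(5.0, 6.0, 7.0))
println(s.toDense)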
Example 10
Source File: Generators.scala From frameless with Apache License 2.0
package frameless
package ml

import frameless.ml.params.linears.{LossStrategy, Solver}
import frameless.ml.params.trees.FeatureSubsetStrategy
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}
import org.scalacheck.Arbitrary.arbDouble
import org.scalacheck.{Arbitrary, Gen}

object Generators {

  implicit val arbVector: Arbitrary[Vector] = Arbitrary {
    val genDenseVector = Gen.listOf(arbDouble.arbitrary).map(doubles => Vectors.dense(doubles.toArray))
    val genSparseVector = genDenseVector.map(_.toSparse)

    Gen.oneOf(genDenseVector, genSparseVector)
  }

  implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary {
    Gen.sized { size =>
      for {
        nbRows <- Gen.choose(0, size)
        nbCols <- Gen.choose(1, size)
        matrix <- {
          Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary)
            .map(values => Matrices.dense(nbRows, nbCols, values.toArray))
        }
      } yield matrix
    }
  }

  implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary {
    val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio)
    val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures)

    Gen.oneOf(
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.All),
      Gen.const(FeatureSubsetStrategy.Log2),
      Gen.const(FeatureSubsetStrategy.OneThird),
      Gen.const(FeatureSubsetStrategy.Sqrt),
      genRatio,
      genNumberOfFeatures
    )
  }

  implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary {
    Gen.const(LossStrategy.SquaredError)
  }

  implicit val arbSolver: Arbitrary[Solver] = Arbitrary {
    Gen.oneOf(
      Gen.const(Solver.LBFGS),
      Gen.const(Solver.Auto),
      Gen.const(Solver.Normal)
    )
  }
}
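A quick sketch of drawing a sample from these generators, e.g. when inspecting what a property test will receive (sample may return None, so foreach is used for inspection only):

import org.apache.spark.ml.linalg.Matrix
import org.scalacheck.Arbitrary
import frameless.ml.Generators._

Arbitrary.arbitrary[Matrix].sample.foreach { m =>
  println(s"sampled ${m.numRows} x ${m.numCols} matrix")
}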
Example 11
Source File: MatrixUtils.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import org.apache.spark.ml.linalg.{DenseMatrix, Matrix, VectorUDT}

object MatrixUtils {

  def vectorUDT = new VectorUDT()

  def transformDense(matrix: DenseMatrix, transformer: (Int, Int, Double) => Double): DenseMatrix = {
    matrix.foreachActive((i, j, v) => {
      matrix(i, j) = transformer(i, j, v)
    })
    matrix
  }

  def applyNonZeros(source: Matrix, target: DenseMatrix, transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    source.foreachActive((i, j, v) => {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, v, target.values(index))
    })
    target
  }

  def applyAll(source: Matrix, target: DenseMatrix, transformer: (Int, Int, Double, Double) => Double): DenseMatrix = {
    for (j <- 0 until source.numCols; i <- 0 until source.numRows) {
      val index = target.index(i, j)
      target.values(index) = transformer(i, j, source(i, j), target.values(index))
    }
    target
  }
}
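A usage sketch for transformDense; note that it mutates its argument in place through the package-private update method, which is why this object must live under org.apache.spark.ml:

import org.apache.spark.ml.linalg.DenseMatrix
import org.apache.spark.ml.odkl.MatrixUtils

// Column-major 2x2: (0,0)=1, (1,0)=2, (0,1)=3, (1,1)=4.
val m = new DenseMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0))

// Square every entry in place; the same instance is returned.
MatrixUtils.transformDense(m, (_, _, v) => v * v)
println(m) // 1.0  9.0 / 4.0  16.0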
Example 12
Source File: CorrelationExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.{Matrix, Vectors}
import org.apache.spark.ml.stat.Correlation
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object CorrelationExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("CorrelationExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      Vectors.sparse(4, Seq((0, 1.0), (3, -2.0))),
      Vectors.dense(4.0, 5.0, 0.0, 3.0),
      Vectors.dense(6.0, 7.0, 0.0, 8.0),
      Vectors.sparse(4, Seq((0, 9.0), (3, 1.0)))
    )

    val df = data.map(Tuple1.apply).toDF("features")
    val Row(coeff1: Matrix) = Correlation.corr(df, "features").head
    println(s"Pearson correlation matrix:\n $coeff1")

    val Row(coeff2: Matrix) = Correlation.corr(df, "features", "spearman").head
    println(s"Spearman correlation matrix:\n $coeff2")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
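Continuing from coeff1 in the example, individual coefficients can be read straight off the Matrix — entry (i, j) is the correlation between features i and j:

// Inside main, after coeff1 has been extracted:
println(s"corr(feature 0, feature 3) = ${coeff1(0, 3)}")
coeff1.rowIter.zipWithIndex.foreach { case (row, i) => println(s"row $i: $row") }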
Example 13
Source File: CorrelationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import breeze.linalg.{DenseMatrix => BDM}

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{DataFrame, Row}

class CorrelationSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  val xData = Array(1.0, 0.0, -2.0)
  val yData = Array(4.0, 5.0, 3.0)
  val zeros = new Array[Double](3)
  val data = Seq(
    Vectors.dense(1.0, 0.0, 0.0, -2.0),
    Vectors.dense(4.0, 5.0, 0.0, 3.0),
    Vectors.dense(6.0, 7.0, 0.0, 8.0),
    Vectors.dense(9.0, 0.0, 0.0, 1.0)
  )

  private def X = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

  private def extract(df: DataFrame): BDM[Double] = {
    val Array(Row(mat: Matrix)) = df.collect()
    mat.asBreeze.toDenseMatrix
  }

  test("corr(X) default, pearson") {
    val defaultMat = Correlation.corr(X, "features")
    val pearsonMat = Correlation.corr(X, "features", "pearson")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.00000000, 0.05564149, Double.NaN, 0.4004714),
      (0.05564149, 1.00000000, Double.NaN, 0.9135959),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.40047142, 0.91359586, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(defaultMat)) ~== expected absTol 1e-4)
    assert(Matrices.fromBreeze(extract(pearsonMat)) ~== expected absTol 1e-4)
  }

  test("corr(X) spearman") {
    val spearmanMat = Correlation.corr(X, "features", "spearman")
    // scalastyle:off
    val expected = Matrices.fromBreeze(BDM(
      (1.0000000, 0.1054093, Double.NaN, 0.4000000),
      (0.1054093, 1.0000000, Double.NaN, 0.9486833),
      (Double.NaN, Double.NaN, 1.00000000, Double.NaN),
      (0.4000000, 0.9486833, Double.NaN, 1.0000000)))
    // scalastyle:on

    assert(Matrices.fromBreeze(extract(spearmanMat)) ~== expected absTol 1e-4)
  }
}