org.apache.spark.ml.linalg.DenseVector Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.DenseVector.
You can go to the original project or source file by following the links above each example.
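Before the project-specific examples, here is a minimal, self-contained sketch of the basic DenseVector API the examples rely on (construction, element access, and conversion to a SparseVector). It is not taken from any of the projects below, and the object name is illustrative only.

import org.apache.spark.ml.linalg.{DenseVector, Vectors}

// Illustrative object name; not part of any project on this page.
object DenseVectorQuickStart {
  def main(args: Array[String]): Unit = {
    val v1 = new DenseVector(Array(1.0, 0.0, 3.0)) // direct construction
    val v2 = Vectors.dense(1.0, 0.0, 3.0)          // factory method, backed by a DenseVector

    println(v1.size)        // 3
    println(v1(2))          // 3.0 -- element access by index
    println(v1.toArray.sum) // 4.0 -- underlying values as Array[Double]
    println(v1.toSparse)    // (3,[0,2],[1.0,3.0]) -- conversion to SparseVector
    println(v1 == v2)       // true -- structural equality across ml.linalg vectors
  }
}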
Example 1
Source File: DatasetExtensions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

import scala.collection.mutable

// Enclosing object restored for compilation; other members of the original file are elided in this snippet.
object DatasetExtensions {

  def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = {
    var counter = 2
    var unusedColumnName = prefix
    while (columnNames.contains(unusedColumnName)) {
      unusedColumnName += "_" + counter
      counter += 1
    }
    unusedColumnName
  }

  def findUnusedColumnName(prefix: String, schema: StructType): String = {
    findUnusedColumnName(prefix)(schema.fieldNames.toSet)
  }

  def findUnusedColumnName(prefix: String, df: Dataset[_]): String = {
    findUnusedColumnName(prefix, df.schema)
  }
}
Example 2
Source File: DecisionTreeClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.{DecisionTree, Node}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

case class DecisionTreeClassifierModel(override val rootNode: Node,
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with DecisionTree with Serializable {

  override def predictRaw(features: Vector): Vector = {
    rootNode.predictImpl(features).impurities
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in DecisionTreeClassifierModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 3
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.regression.DecisionTreeRegressionModel
import ml.combust.mleap.core.tree.TreeEnsemble
import ml.combust.mleap.core.tree.loss.LogLoss
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

// NOTE: the enclosing GBTClassifierModel class declaration (defining trees, treeWeightsVector and loss)
// is elided in this snippet; the members below belong to it.

  def margin(features: Vector): Double = {
    val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray)
    BLAS.dot(treePredictions, treeWeightsVector)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        dv.values(0) = loss.computeProbability(dv.values(0))
        dv.values(1) = 1.0 - dv.values(0)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("GBTClassificationModel encountered SparseVector")
    }
  }

  override def predictRaw(features: Vector): Vector = {
    val prediction: Double = margin(features)
    Vectors.dense(Array(-prediction, prediction))
  }
} // closes the elided class declaration
Example 4
Source File: NaiveBayesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial}
import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices}
import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector}

@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala")
case class NaiveBayesModel(numFeatures: Int,
                           numClasses: Int,
                           pi: Vector,
                           theta: Matrix,
                           modelType: NaiveBayesModel.ModelType,
                           override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with Model {

  private def multinomialCalculation(raw: Vector) = {
    val prob = theta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    prob
  }

  private def bernoulliCalculation(raw: Vector) = {
    val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value)))
    val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
    val thetaMinusNegTheta = Matrices.map(theta, value => value - math.log(1.0 - math.exp(value)))
    val negThetaSum = negTheta.multiply(ones)

    raw.foreachActive((_, value) =>
      require(value == 0.0 || value == 1.0,
        s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.")
    )
    val prob = thetaMinusNegTheta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    BLAS.axpy(1.0, negThetaSum, prob)
    prob
  }

  override def predictRaw(raw: Vector): Vector = {
    modelType match {
      case Multinomial => multinomialCalculation(raw)
      case Bernoulli => bernoulliCalculation(raw)
    }
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        var i = 0
        val size = dv.size
        val maxLog = dv.values.max
        while (i < size) {
          dv.values(i) = math.exp(dv.values(i) - maxLog)
          i += 1
        }
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 5
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS

case class SupportVectorMachineModel(coefficients: Vector,
                                     intercept: Double,
                                     override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds))
  extends ProbabilisticClassificationModel with Serializable {

  private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept

  override val numClasses: Int = 2
  override val numFeatures: Int = coefficients.size

  override def predictRaw(features: Vector): Vector = {
    val m = margin(features)
    Vectors.dense(Array(-m, m))
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = raw
}
Example 6
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.TreeEnsemble
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel],
                                       override val treeWeights: Seq[Double],
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with TreeEnsemble with Serializable {

  override def predictRaw(raw: Vector): Vector = {
    val votes = Array.fill[Double](numClasses)(0.0)
    trees.view.foreach { tree =>
      val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray
      val total = classCounts.sum
      if (total != 0) {
        var i = 0
        while (i < numClasses) {
          votes(i) += classCounts(i) / total
          i += 1
        }
      }
    }
    Vectors.dense(votes)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 7
Source File: VectorConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions

trait VectorConverters {

  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
        values = matrix.values,
        dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector: BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }

  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}

object VectorConverters extends VectorConverters
Example 8
Source File: PcaOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector}

class PcaOp extends SimpleSparkOp[PCAModel] {
  override val Model: OpModel[SparkBundleContext, PCAModel] = new OpModel[SparkBundleContext, PCAModel] {
    override val klazz: Class[PCAModel] = classOf[PCAModel]

    override def opName: String = Bundle.BuiltinOps.feature.pca

    override def store(model: Model, obj: PCAModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("principal_components",
        Value.tensor[Double](DenseTensor(obj.pc.values, Seq(obj.pc.numRows, obj.pc.numCols))))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): PCAModel = {
      val values = model.value("principal_components").getTensor[Double]
      new PCAModel(uid = "",
        pc = new DenseMatrix(values.dimensions.head, values.dimensions(1), values.toArray),
        explainedVariance = new DenseVector(Array()))
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: PCAModel): PCAModel = {
    new PCAModel(uid = uid, pc = model.pc, explainedVariance = model.explainedVariance)
  }

  override def sparkInputs(obj: PCAModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: PCAModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 9
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.mllib.clustering
import org.apache.spark.mllib.linalg.Vectors

class KMeansOp extends SimpleSparkOp[KMeansModel] {
  override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] {
    override val klazz: Class[KMeansModel] = classOf[KMeansModel]

    override def opName: String = Bundle.BuiltinOps.clustering.k_means

    override def store(model: Model, obj: KMeansModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("cluster_centers",
        Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))).
        withValue("num_features", Value.long(obj.clusterCenters.head.size))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): KMeansModel = {
      val clusterCenters = model.value("cluster_centers").
        getTensorList[Double].toArray.
        map(t => Vectors.dense(t.toArray))
      val mllibModel = new clustering.KMeansModel(clusterCenters)

      new KMeansModel(uid = "", parentModel = mllibModel)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = {
    val clusterCenters = model.clusterCenters.map {
      case DenseVector(values) => Vectors.dense(values)
      case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values)
    }
    new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters))
  }

  override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
}
Example 10
Source File: StreamingMLUtils.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib

import scala.language.implicitConversions

import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils

object StreamingMLUtils {
  implicit def mlToMllibVector(v: Vector): OldVector = v match {
    case dv: DenseVector => OldVectors.dense(dv.toArray)
    case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values)
    case _ => throw new IllegalArgumentException
  }

  def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = {
    MLUtils.fastSquaredDistance(x, xNorm, y, yNorm)
  }
}
Example 11
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
}
Example 12
Source File: UberXGBoostModel.scala From uberdata with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models

import ml.dmlc.xgboost4j.java.Rabit
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.spark.{XGBoost, XGBoostModel}
import org.apache.spark.TaskContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

object UberXGBoostModel {
  def train(trainLabel: RDD[LabeledPoint],
            configMap: Map[String, Any],
            round: Int,
            nWorkers: Int): XGBoostModel = {
    val trainData = trainLabel.cache
    XGBoost.trainWithRDD(trainData, configMap, round, nWorkers,
      useExternalMemory = true, missing = Float.NaN)
  }

  def labelPredict(testSet: RDD[XGBLabeledPoint],
                   useExternalCache: Boolean,
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    testSet.mapPartitions { testData =>
      val (toPredict, toLabel) = testData.duplicate
      val dMatrix = new DMatrix(toPredict)
      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
      toLabel.map(_.label).zip(prediction)
    }
  }

  def labelPredict(testSet: RDD[DenseVector],
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    val rdd = testSet.cache
    broadcastBooster.value.predict(testSet, missingValue = Float.NaN).map(value => (value(0), value(1)))
    //    testSet.
    //    testSet.mapPartitions { testData =>
    //      val (toPredict, toLabel) = testData.duplicate
    //      val dMatrix = new DMatrix(toPredict)
    //
    //      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
    //      toLabel.map(_.label).zip(prediction)
    //    }
  }
}
Example 13
Source File: OptimizedCKNNFitting.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.types.injections

import com.microsoft.ml.spark.nn._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.Dataset
import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.sql.types._

trait OptimizedCKNNFitting extends ConditionalKNNParams {

  private def fitGeneric[V, L](dataset: Dataset[_]): ConditionalKNNModel = {
    val kvlTriples = dataset.toDF().select(getFeaturesCol, getValuesCol, getLabelCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        val label = row.getAs[L](getLabelCol)
        (bdv, value, label)
      }
    val ballTree = ConditionalBallTree(
      kvlTriples.map(_._1), kvlTriples.map(_._2), kvlTriples.map(_._3), getLeafSize)
    new ConditionalKNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setLabelCol(getLabelCol)
      .setConditionerCol(getConditionerCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): ConditionalKNNModel = {
    val vt = dataset.schema(getValuesCol).dataType
    val lt = dataset.schema(getLabelCol).dataType
    (vt, lt) match {
      case (avt: AtomicType, alt: AtomicType) => fitGeneric[avt.InternalType, alt.InternalType](dataset)
      case (avt: AtomicType, _) => fitGeneric[avt.InternalType, Any](dataset)
      case (_, alt: AtomicType) => fitGeneric[Any, alt.InternalType](dataset)
      case _ => fitGeneric[Any, Any](dataset)
    }
  }
}

trait OptimizedKNNFitting extends KNNParams {

  private def fitGeneric[V](dataset: Dataset[_]): KNNModel = {
    val kvlTuples = dataset.toDF().select(getFeaturesCol, getValuesCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        (bdv, value)
      }
    val ballTree = BallTree(
      kvlTuples.map(_._1), kvlTuples.map(_._2), getLeafSize)
    new KNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): KNNModel = {
    dataset.schema(getValuesCol).dataType match {
      case avt: AtomicType => fitGeneric[avt.InternalType](dataset)
      case _ => fitGeneric[Any](dataset)
    }
  }
}
Example 14
Source File: VectorFeaturizer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw.featurizer

import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable

// NOTE: the enclosing VectorFeaturizer class declaration (defining fieldIdx and mask) is elided in this snippet;
// the member below belongs to it.

  override def featurize(row: Row,
                         indices: mutable.ArrayBuilder[Int],
                         values: mutable.ArrayBuilder[Double]): Unit = {
    row.getAs[Vector](fieldIdx) match {
      case v: DenseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= 0 until v.size
        else
          indices ++= (0 until v.size).map { mask & _ }
        values ++= v.values
      case v: SparseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= v.indices
        else
          indices ++= v.indices.map { mask & _ }
        values ++= v.values
    }
    ()
  }
} // closes the elided class declaration
Example 15
Source File: ClassificationModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}

// NOTE: the enclosing trait declaration is elided in this snippet; the members below belong to it.

  val numClasses: Int
  val numFeatures: Int

  def thresholds: Option[Array[Double]] = None

  def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features))

  def predictWithProbability(features: Vector): (Double, Double) = {
    val probabilities = predictProbabilities(features)
    val index = probabilityToPredictionIndex(probabilities)
    (index.toDouble, probabilities(index))
  }

  def predictProbabilities(features: Vector): Vector = {
    val raw = predictRaw(features)
    rawToProbabilityInPlace(raw)
    raw
  }

  def rawToProbability(raw: Vector): Vector = {
    val probabilities = raw.copy
    rawToProbabilityInPlace(probabilities)
  }

  def rawToPrediction(raw: Vector): Double = {
    thresholds match {
      case Some(t) => probabilityToPrediction(rawToProbability(raw))
      case None => raw.argmax
    }
  }

  def probabilityToPrediction(probability: Vector): Double = {
    probabilityToPredictionIndex(probability).toDouble
  }

  def probabilityToPredictionIndex(probability: Vector): Int = {
    thresholds match {
      case Some(ts) =>
        val scaledProbability: Array[Double] = probability.toArray.zip(ts).map {
          case (p, t) => if (t == 0.0) Double.PositiveInfinity else p / t
        }
        Vectors.dense(scaledProbability).argmax
      case None => probability.argmax
    }
  }

  def rawToProbabilityInPlace(raw: Vector): Vector

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses),
    "probability" -> TensorType.Double(numClasses),
    "prediction" -> ScalarType.Double.nonNullable).get
} // closes the elided trait declaration
Example 16
Source File: EnsembleByKeySuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class EnsembleByKeySuite extends TestBase with TransformerFuzzing[EnsembleByKey] {

  test("Should work on Dataframes doubles or vectors") {
    val scoreDF = session.createDataFrame(Seq(
      (0, "foo", 1.0, .1),
      (1, "bar", 4.0, -2.0),
      (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1")
    val df1 = t.transform(scoreDF2)
    df1.printSchema()
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(1))).toSet === Set((1, 2.0), (0, 1.0)))

    val t2 = new EnsembleByKey().setKeys("label1", "label2").setCols("score1", "score2", "v1")
    val df2 = t2.transform(scoreDF2)
    val res2 = df2.select("mean(score1)", "mean(v1)").collect()
      .map(r => (r.getDouble(0), r.getAs[DenseVector](1)))
    val true2 = Set(
      (2.0, new DenseVector(Array(2.0, -2.5))),
      (1.0, new DenseVector(Array(1.0, 0.1))))
    assert(res2.toSet === true2)
  }

  test("should support collapsing or not") {
    val scoreDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == scoreDF.count())
    df1.show()
  }

  lazy val testDF: DataFrame = {
    val initialTestDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    new VectorAssembler().setInputCols(Array("score1", "score2"))
      .setOutputCol("v1").transform(initialTestDF)
  }

  lazy val testModel: EnsembleByKey = new EnsembleByKey().setKey("label1").setCol("score1")
    .setCollapseGroup(false).setVectorDims(Map("v1" -> 2))

  test("should support passing the vector dims to avoid materialization") {
    val df1 = testModel.transform(testDF)
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == testDF.count())
    df1.show()
  }

  test("should overwrite a column if instructed") {
    val scoreDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)
    val t = new EnsembleByKey().setKey("label1").setCol("score1").setColName("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)
    assert(scoreDF2.columns.toSet === df1.columns.toSet)
  }

  test("should roundtrip serialize") {
    testSerialization()
  }

  def testObjects(): Seq[TestObject[EnsembleByKey]] = Seq(new TestObject(testModel, testDF))

  def reader: EnsembleByKey.type = EnsembleByKey
}
Example 17
Source File: CNTKTestUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.cntk

import java.io.File

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.image.UnrollImage
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql._
import com.microsoft.ml.spark.io.IOImplicits._

trait CNTKTestUtils extends TestBase {

  val filesRoot = BuildInfo.datasetDir.toString
  val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
  val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString

  val inputCol = "cntk_images"
  val outputCol = "out"
  val labelCol = "labels"

  val featureVectorLength = 3 * 32 * 32
  lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString

  def testModelDF(spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.sparkContext.parallelize(Seq(
      Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
        -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
      Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
        -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
      Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
        3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
      Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
        -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
      Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
        4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
      Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
        0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
  }

  def testImages(spark: SparkSession): DataFrame = {
    val images = spark.read.image.load(imagePath)

    val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)

    unroll.transform(images).select(inputCol)
  }

  def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
    import spark.implicits._
    if (outputDouble) {
      List
        .fill(rows)(List.fill(size)(0.0).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    } else {
      List
        .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    }
  }

  protected def compareToTestModel(result: DataFrame) = {
    //TODO improve checks
    assert(result.columns.toSet == Set(inputCol, outputCol))
    assert(result.count() == testModelDF(result.sparkSession).count())
    val max = result
      .select(outputCol)
      .collect()
      .map(row => row.getAs[DenseVector](0).toArray.max)
      .max
    assert(max < 10 & max > -10)
  }
}
Example 18
Source File: Word2VecSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class Word2VecSpec extends TestBase {

  def genTokenizedText(): DataFrame = {
    session.createDataFrame(Seq(
      (0, Array("I", "walked", "the", "dog", "down", "the", "street")),
      (1, Array("I", "walked", "with", "the", "dog")),
      (2, Array("I", "walked", "the", "pup"))
    )).toDF("label", "words")
  }

  def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0)

  test("operation on tokenized strings") {
    val df = genTokenizedText()
    val df2 = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df).transform(df)
    val lines = df2.getDVCol("features")
    assert(lines.forall(_.size == 2))
  }

  test("return vectors") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val vectors = model.getVectors.getDVCol("vector")
    assert(vectors(0).size == 2)
  }

  test("return synonyms") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word")
    assert(synonyms.length === 2)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of parameters") {
    def base(): Word2Vec = genW2V().setInputCol("words")
    def assertIllegalArgument[T](f: T => Any, args: T*): Unit = args.foreach { n =>
      interceptWithoutLogging[IllegalArgumentException] { f(n) }
    }
    assertIllegalArgument[Int](base.setMinCount, -1, -10)
    assertIllegalArgument[Int](base.setMaxIter, -1, -10)
    assertIllegalArgument[Int](base.setVectorSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setWindowSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10)
    assertIllegalArgument[Int](base.setNumPartitions, 0, -1, -10)
    assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0)
  }

  test("return a vector of zeros when it encounters an OOV word") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df)
    val df2 = session.createDataFrame(Seq(
      (0, Array("ketchup")))).toDF("label", "words")
    val results = model.transform(df2)
    val lines = results.getDVCol("features")
    val trueLines = List(new DenseVector(Array(0.0, 0.0)))
    assert(lines === trueLines)
  }

  test("be able to set vector size") {
    val df = genTokenizedText()
    val vectorSizes = List(1, 10, 100)
    vectorSizes.foreach { n =>
      val results = genW2V().setVectorSize(n)
        .setInputCol("words").setOutputCol("features").fit(df).transform(df)
        .getDVCol("features")
      assert(results(0).size === n)
    }
  }
}
Example 19
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
}
Example 20
Source File: RichVector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.spark

import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable.ArrayBuffer

// NOTE: the enclosing object declaration (and its other members) is elided in this snippet.

  def combine(vectors: Seq[Vector]): Vector = {
    val indices = ArrayBuffer.empty[Int]
    val values = ArrayBuffer.empty[Double]

    val size = vectors.foldLeft(0)((size, vector) => {
      vector.foreachActive {
        case (i, v) =>
          if (v != 0.0) {
            indices += size + i
            values += v
          }
      }
      size + vector.size
    })
    Vectors.sparse(size, indices.toArray, values.toArray).compressed
  }

  implicit class RichSparseVector(val v: SparseVector) extends AnyVal {
    def updated(index: Int, indexVal: Int, value: Double): SparseVector = {
      require(v.indices(index) == indexVal,
        s"Invalid index: indices($index)==${v.indices(index)}, expected: $indexVal")
      v.values(index) = value
      v
    }
  }
} // closes the elided object declaration
Example 21
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.{Estimator, Transformer}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class IDFTest extends FlatSpec with TestSparkContext {

  val data = Seq(
    Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
    Vectors.dense(0.0, 1.0, 2.0, 3.0),
    Vectors.sparse(4, Array(1), Array(1.0))
  )

  lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector))

  Spec[IDF] should "compute inverted document frequency" in {
    val idf = f1.idf()
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((data.length + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  it should "compute inverted document frequency when minDocFreq is 1" in {
    val idf = f1.idf(minDocFreq = 1)
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) }
        Vectors.sparse(data.size, res)
    }
  }
}
Example 22
Source File: HasNetlibBlas.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]): Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y: Array[Double]): Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy(x: Array[Double], y: Array[Double]): Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {
  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
}
Example 23
Source File: NormalizerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row}

class NormalizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Vector] = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )
  }

  def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = {
    assert((lhs, rhs) match {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  def assertValues(lhs: Vector, rhs: Vector): Unit = {
    assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized")
    val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected")

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("Normalization with setter") {
    val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected")
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1)

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("read/write") {
    val t = new Normalizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setP(3.0)
    testDefaultReadWrite(t)
  }
}
Example 24
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType),
    StructField(featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}
Example 25
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))
    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}
Example 26
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
      StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}
Example 27
Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert(sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
}
Example 28
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag

import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class KNNPropSpec extends PropSpec with SharedSparkContext {

  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(vs =>
        VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10))
}
Example 29
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.PCAModel import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors} import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices} class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix] val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] { override def build(metadata: Metadata, data: LocalData): PCAModel = { val constructor = classOf[PCAModel].getDeclaredConstructor( classOf[String], classOf[DenseMatrix], classOf[DenseVector] ) constructor.setAccessible(true) val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]] val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix] data.column("explainedVariance") match { case Some(ev) => // NOTE: Spark >= 2 val evParams = ev.data.head.asInstanceOf[Map[String, Any]] val explainedVariance = DataUtils.constructVector(evParams).toDense constructor .newInstance(metadata.uid, pcMat, explainedVariance) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) case None => // NOTE: Spark < 2 constructor .newInstance( metadata.uid, pcMat, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] ) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } } override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer) }
Example 30
Source File: LocalMaxAbsScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel) extends LocalTransformer[MaxAbsScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val maxAbsUnzero = Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x)) val newData = column.data.map(r => { val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d") } val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray) DataUtils.fromBreeze(brz).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMaxAbsScalerModel extends SimpleModelLoader[MaxAbsScalerModel] with TypedTransformerConverter[MaxAbsScalerModel] { override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = { val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]] val maxAbs = DataUtils.constructVector(maxAbsParams) val constructor = classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector]) constructor.setAccessible(true) constructor .newInstance(metadata.uid, maxAbs) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: MaxAbsScalerModel ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer) }
Example 31
Source File: LocalMinMaxScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MinMaxScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector} class LocalMinMaxScalerModel(override val sparkTransformer: MinMaxScalerModel) extends LocalTransformer[MinMaxScalerModel] { override def transform(localData: LocalData): LocalData = { val originalRange = (DataUtils.asBreeze(sparkTransformer.originalMax.toArray) - DataUtils.asBreeze( sparkTransformer.originalMin.toArray )).toArray val minArray = sparkTransformer.originalMin.toArray val min = sparkTransformer.getMin val max = sparkTransformer.getMax localData.column(sparkTransformer.getInputCol) match { case Some(column) => val newData = column.data.map(r => { val scale = max - min val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMinMaxScaler: $d") } val values = vec.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values.update(i, raw * scale + min) } i += 1 } values.toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMinMaxScalerModel extends SimpleModelLoader[MinMaxScalerModel] with TypedTransformerConverter[MinMaxScalerModel] { override def build(metadata: Metadata, data: LocalData): MinMaxScalerModel = { val originalMinList = data .column("originalMin") .get .data .head .asInstanceOf[Map[String, Any]] val originalMin = DataUtils.constructVector(originalMinList) val originalMaxList = data .column("originalMax") .get .data .head .asInstanceOf[Map[String, Any]] val originalMax = DataUtils.constructVector(originalMaxList) val constructor = classOf[MinMaxScalerModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance(metadata.uid, originalMin, originalMax) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setMin(metadata.paramMap("min").toString.toDouble) .setMax(metadata.paramMap("max").toString.toDouble) } override implicit def toLocal( transformer: MinMaxScalerModel ) = new LocalMinMaxScalerModel(transformer) }
Example 32
Source File: get_features_from_peinfo.scala From gsoc_relationship with Apache License 2.0 | 5 votes |
import com.datastax.spark.connector._ import play.api.libs.json.Json import play.api.libs.json._ import java.io.{ByteArrayOutputStream, ByteArrayInputStream} import java.util.zip.{GZIPOutputStream, GZIPInputStream} import Array.concat import org.apache.spark.sql.types._ import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.linalg._ import org.apache.spark.sql.Row import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.DenseVector import PreProcessingConfig._ case class peinfo_results_by_service_name_class(service_name: String, sha256: String) case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte]) case class peinfo_join_results_class(sha256: String, service_name: String, results: String) case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double]) case class peinfo_binaray_final_array_rdd_class(sha256:String, array_results :Array[Double]) case class peinfo_final_array_rdd_class(sha256:String, array_results: Array[Double]) def unzip(x: Array[Byte]) : String = { val inputStream = new GZIPInputStream(new ByteArrayInputStream(x)) val output = scala.io.Source.fromInputStream(inputStream).mkString return output } def findAllIntinpeinfo( peinfo_json_results : JsLookupResult, time: Double): Array[Double]= { val entropy = peinfo_json_results \\ "entropy" ; val virt_address = peinfo_json_results \\ "virt_address"; val virt_size = peinfo_json_results \\ "virt_size"; val size = peinfo_json_results \\ "size"; var i= 0; var List = Array.iterate(0.0,17)(a=>a*0) for (k <- ( peinfo_json_results \\ "section_name")){ k.as[String] match { case ".text\u0000\u0000\u0000" => { List(0)=entropy(i).as[Double]; List(1)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(2)=virt_size(i).as[Double]; List(3)=size(i).as[Double] } case ".data\u0000\u0000\u0000" => { List(4)=entropy(i).as[Double]; List(5)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(6)=virt_size(i).as[Double]; List(7)=size(i).as[Double] } case ".rsrc\u0000\u0000\u0000" => { List(8)=entropy(i).as[Double]; List(9)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(10)=virt_size(i).as[Double]; List(11)=size(i).as[Double] } case ".rdata\u0000\u0000" => { List(12)=entropy(i).as[Double]; List(13)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(14)=virt_size(i).as[Double]; List(15)=size(i).as[Double] } case other => {} } i = i + 1 } List(16)= time return List.toArray } val peinfo_results_by_service_name_meta = sc.cassandraTable[peinfo_results_by_service_name_class](keyspace,service_name_table).where("service_name=?","peinfo") val peinfo_results_by_service_name_rdd = peinfo_results_by_service_name_meta.keyBy(x=> (x.sha256,x.service_name)) val peinfo_results_by_sha256_meta = sc.cassandraTable[peinfo_results_by_sha256_class](keyspace,sha256_table) val peinfo_results_by_sha256_rdd = peinfo_results_by_sha256_meta.keyBy(x => (x.sha256,x.service_name)) val peinfo_join_results = peinfo_results_by_service_name_rdd.join(peinfo_results_by_sha256_rdd).map(x=> (new peinfo_join_results_class(x._1._1,x._1._2, unzip(x._2._2.results)))).distinct().cache() val peinfo_int_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "pe_sections"),{if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0 else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]})).filter(x=> 
!x._2.isInstanceOf[JsUndefined]).map(x=>new peinfo_int_final_array_rdd_class(x._1,findAllIntinpeinfo(x._2,x._3))) val peinfo_dllfunction_list= peinfo_join_results.map(x=>Json.parse(x.results) \ "imports").filter(x=> !x.isInstanceOf[JsUndefined]).flatMap(x=>x.as[List[Map[String, String]]].map(x=>(x("dll")+"."+x("function")))).toDF("func_name").groupBy("func_name").count.sort(desc("count")).filter("count > 10000").rdd.map(r => r.getString(0)).collect().toList implicit def bool2int(b:Boolean) = if (b) 1 else 0 def findAllBininpeinfo_dllfunction(peinfo_dllfunction : Seq[String]) : Array[Double] ={ val forlist = for (family <- peinfo_dllfunction_list) yield { (peinfo_dllfunction.contains(family):Int).toDouble } return (forlist).toArray } val List502 = Array.iterate(0.0,502)(a=>0.0) val peinfo_binaray_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "imports"))).map(x=>new peinfo_binaray_final_array_rdd_class(x._1,{if (x._2.isInstanceOf[JsUndefined]) List502 else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x=>(x("dll")+"."+x("function"))))})) val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x=>(x.sha256,x.array_results)) val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x=>(x.sha256,x.array_results)) val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join.join(peinfo_binaray_final_array_rdd_before_join).map(x=> (x._1,concat(x._2._1,x._2._2))) val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x=>new peinfo_final_array_rdd_class(x._1,x._2)) val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo",VectorType) val peinfo_vector_rdd = peinfo_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results))) val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1,p._2)) val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema) val peinfo_scaler = new MinMaxScaler() .setInputCol("peinfo") .setOutputCol("scaled_peinfo") val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe) val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe) val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256","scaled_peinfo").rdd.map(row=>(row.getAs[String]("sha256"),row.getAs[DenseVector]("scaled_peinfo"))).map(x=>new peinfo_final_array_rdd_class(x._1,x._2.toArray)) peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file)
Example 33
Source File: FeatureCrossOp.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.cross import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import scala.collection.mutable.ArrayBuffer object FeatureCrossOp { def flatCartesian(vector: Vector): Vector = { val curDim = vector.size vector match { case sv: SparseVector => val indices = new ArrayBuffer[Int]() val values = new ArrayBuffer[Double]() sv.indices.foreach { idx1 => sv.indices.foreach { idx2 => indices += curDim * idx1 + idx2 values += sv(idx1) * sv(idx2) } } val sorted = indices.zip(values).sortBy(_._1) val sortedIndices = sorted.map(_._1) val sortedValues = sorted.map(_._2) new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray) case dv: DenseVector => val values: Array[Double] = new Array(dv.size * dv.size) (0 until dv.size).foreach { idx1 => (0 until dv.size).foreach { idx2 => values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2) } } new DenseVector(values) } } def main(args: Array[String]): Unit = { val v = new DenseVector(Array(1, 2, 3)) val cv = flatCartesian(v) println(cv.toDense.values.mkString(",")) } }
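As a quick illustration of the dense branch above, a standalone sketch (the object name and sample values are made up for illustration):

import org.apache.spark.ml.linalg.DenseVector
import com.tencent.angel.spark.automl.feature.cross.FeatureCrossOp

object FeatureCrossOpExample {
  def main(args: Array[String]): Unit = {
    // For a dense input, flatCartesian writes v(i) * v(j) at position size * i + j,
    // i.e. the outer product of the vector with itself, flattened row by row.
    val crossed = FeatureCrossOp.flatCartesian(new DenseVector(Array(1.0, 2.0, 3.0)))
    println(crossed.toDense.values.mkString(","))
    // 1.0,2.0,3.0,2.0,4.0,6.0,3.0,6.0,9.0
  }
}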
Example 34
Source File: FeatureUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.{Dataset, Row} import scala.language.postfixOps object FeatureUtils { def maxDim(dataset: Dataset[Row], col: String = "features"): Int = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val dim = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.last case dv: DenseVector => dv.size } }.max Iterator(dim) }.max + 1 } def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val mergeIndices = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.toList } }.reduce(_ union _ distinct) Iterator(mergeIndices) }.reduce((a, b) => (a union b).distinct).toArray } }
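A rough usage sketch for maxDim, assuming a local SparkSession; the session setup and sample data are illustrative, and note that countNonZero as written only matches SparseVector rows:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import com.tencent.angel.spark.automl.feature.FeatureUtils

object FeatureUtilsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("FeatureUtilsExample").getOrCreate()
    import spark.implicits._

    // A "features" column mixing a dense and a sparse vector of dimension 3.
    val df = Seq(
      Vectors.dense(1.0, 0.0, 2.0),
      Vectors.sparse(3, Array(1), Array(4.0))
    ).map(Tuple1.apply).toDF("features")

    // Per the definition above: max over rows of (dense size | last sparse index), plus one.
    println(FeatureUtils.maxDim(df)) // 4

    spark.stop()
  }
}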
Example 35
Source File: DataUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.utils import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} object DataUtils { def parse(ss: SparkSession, schema: StructType, X: Array[Vector], Y: Array[Double]): DataFrame = { require(X.size == Y.size, "The size of configurations should be equal to the size of rewards.") ss.createDataFrame( Y.zip(X)).toDF("label", "features") } def parse(ss: SparkSession, schema: StructType, X: Vector): DataFrame = { parse(ss, schema, Array(X), Array(0)) } def toBreeze(values: Array[Double]): BDV[Double] = { new BDV[Double](values) } def toBreeze(vector: Vector): BDV[Double] = vector match { case sv: SparseVector => new BDV[Double](vector.toDense.values) case dv: DenseVector => new BDV[Double](dv.values) } def toBreeze(X: Array[Vector]): BDM[Double] = { val mat = BDM.zeros[Double](X.size, X(0).size) for (i <- 0 until X.size) { for (j <- 0 until X(0).size) { mat(i, j) = X(i)(j) } } mat } }
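A small sketch of the toBreeze overloads on dense and sparse inputs (names and values are illustrative):

import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.ml.linalg.{DenseVector, Vectors}
import com.tencent.angel.spark.automl.utils.DataUtils

object DataUtilsExample {
  def main(args: Array[String]): Unit = {
    val dense = new DenseVector(Array(1.0, 2.0, 3.0))
    val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

    val fromDense: BDV[Double] = DataUtils.toBreeze(dense)   // wraps the dense values directly
    val fromSparse: BDV[Double] = DataUtils.toBreeze(sparse) // densified first via toDense

    println(fromDense)  // DenseVector(1.0, 2.0, 3.0)
    println(fromSparse) // DenseVector(1.0, 0.0, 3.0)
  }
}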
Example 36
Source File: Evaluator.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.evaluation import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.DataFrame class Evaluator { def evaluate(predictions:DataFrame):Unit = { import predictions.sparkSession.implicits._ val scoreAndLabels = predictions.select("label", "probability").map { row => (row.apply(1).asInstanceOf[DenseVector](1), row.getAs[Int]("label").toDouble) } val metrics = new BinaryClassificationMetrics(scoreAndLabels.rdd) println("AUC under PR = " + metrics.areaUnderPR()) println("AUC under ROC = " + metrics.areaUnderROC()) } }
Example 37
Source File: OuterProductNNCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class OuterProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate outer product between item embedding and user embedding
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessOuterProductSamples(samplesWithOuterProduct)
    val preparedSamples = prePipelineModel.transform(samplesWithOuterProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithOuterProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)
    _pipelineModel.transform(samplesWithOuterProduct)
  }
}
Example 38
Source File: FactorizationMachineCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint}
import org.apache.spark.sql.DataFrame

class FactorizationMachineCtrModel extends BaseCtrModel {
  var _model: FMModel = _

  def train(samples: DataFrame): Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    val formatSamples = preparedSamples.rdd.map(row => {
      new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures")))
    })

    _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15,
      miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)
    _model.predict(preparedSamples)
  }
}
Example 39
Source File: InnerProductNNCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{LogisticRegression, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class InnerProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)
    val preparedSamples = prePipelineModel.transform(samplesWithInnerProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
}
Example 40
Source File: NeuralNetworkCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class NeuralNetworkCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    val prePipelineModel = FeatureEngineering.preProcessSamples(samples)
    val preparedSamples = prePipelineModel.transform(samples)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
}
Example 41
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.{SparkSession, DataFrame} import org.apache.spark.sql.types.{ StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType } import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.PropSpec import com.holdenkarau.spark.testing.{ SharedSparkContext, DataframeGenerator, Column } abstract class FeaturePropSpec extends PropSpec with SharedSparkContext with DefaultReadWriteTest { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector] )) lazy val spark = SparkSession.builder().getOrCreate() def schema = StructType( List( StructField("integer", IntegerType), StructField("double", DoubleType), StructField("boolean", BooleanType), StructField("string", StringType) )) def integerGen = new Column("integer", Gen.choose(-100, 100)) def doubleGen = new Column("double", Gen.choose(-100.0, 100.0)) def stringGen = new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO")) def dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields( spark.sqlContext, schema)(integerGen, doubleGen, stringGen) def hasDistinctValues(df: DataFrame, columns: String*): Boolean = { columns.foldLeft(true) { (acc, col) => acc && df.select(col).distinct.count() > 1 } } }
Example 42
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 43
Source File: XgbConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.xgboost.runtime import biz.k11i.xgboost.util.FVec import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import ml.combust.mleap.xgboost.runtime.struct.FVecFactory import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} trait XgbConverters { implicit class VectorOps(vector: Vector) { def asXGB: DMatrix = { vector match { case SparseVector(_, indices, values) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat)))) case DenseVector(values) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat)))) } } def asXGBPredictor: FVec = { vector match { case sparseVector: SparseVector => FVecFactory.fromSparseVector(sparseVector) case denseVector: DenseVector => FVecFactory.fromDenseVector(denseVector) } } } implicit class DoubleTensorOps(tensor: Tensor[Double]) { def asXGB: DMatrix = { tensor match { case SparseTensor(indices, values, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat)))) case DenseTensor(_, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat)))) } } def asXGBPredictor: FVec = { tensor match { case sparseTensor: SparseTensor[Double] => FVecFactory.fromSparseTensor(sparseTensor) case denseTensor: DenseTensor[Double] => FVecFactory.fromDenseTensor(denseTensor) } } } } object XgbConverters extends XgbConverters
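A hedged sketch of the implicit conversions applied to a DenseVector; it assumes the xgboost4j and xgboost-predictor dependencies used by this module are on the classpath, and the printed values follow from the code above:

import org.apache.spark.ml.linalg.DenseVector
import ml.combust.mleap.xgboost.runtime.XgbConverters._

object XgbConvertersExample {
  def main(args: Array[String]): Unit = {
    val features = new DenseVector(Array(0.5, 0.0, 1.5))

    // Single-row DMatrix for the xgboost4j Booster API.
    val dmatrix = features.asXGB
    println(dmatrix.rowNum) // 1

    // FVec for the xgboost-predictor API; values are stored as floats.
    val fvec = features.asXGBPredictor
    println(fvec.fvalue(2)) // 1.5
  }
}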
Example 44
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
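A minimal sketch of slicing a dense input with the model above (indices and values are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.VectorSlicerModel

object VectorSlicerModelExample {
  def main(args: Array[String]): Unit = {
    // Keep positions 0 and 2 of a 3-dimensional input.
    val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 3)
    println(slicer(Vectors.dense(10.0, 20.0, 30.0))) // [10.0,30.0]
  }
}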
Example 45
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
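A short sketch of the element-wise scaling on dense and sparse inputs (the scaling vector and inputs are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ElementwiseProductModel

object ElementwiseProductModelExample {
  def main(args: Array[String]): Unit = {
    val model = ElementwiseProductModel(Vectors.dense(2.0, 0.5, 1.0))

    println(model(Vectors.dense(1.0, 4.0, 3.0)))            // [2.0,2.0,3.0]
    println(model(Vectors.sparse(3, Array(1), Array(4.0)))) // (3,[1],[2.0])
  }
}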
Example 46
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
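A minimal sketch of the rescaling on a dense input; the maxAbs vector is illustrative, a zero entry is replaced by 1 before dividing, and results are clamped to [-1, 1]:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.MaxAbsScalerModel

object MaxAbsScalerModelExample {
  def main(args: Array[String]): Unit = {
    val model = MaxAbsScalerModel(maxAbs = Vectors.dense(4.0, 0.0, 2.0))

    // 2.0/4.0 = 0.5, 5.0/1.0 clamped to 1.0, -1.0/2.0 = -0.5
    println(model(Vectors.dense(2.0, 5.0, -1.0))) // [0.5,1.0,-0.5]
  }
}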
Example 47
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable

@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala")
case class ChiSqSelectorModel(filterIndices: Seq[Int],
                              inputSize: Int) extends Model {
  def apply(features: Vector): Vector = {
    features match {
      case SparseVector(size, indices, values) =>
        val newSize = filterIndices.length
        val newValues = mutable.ArrayBuilder.make[Double]
        val newIndices = mutable.ArrayBuilder.make[Int]
        var i = 0
        var j = 0
        var indicesIdx = 0
        var filterIndicesIdx = 0
        while (i < indices.length && j < filterIndices.length) {
          indicesIdx = indices(i)
          filterIndicesIdx = filterIndices(j)
          if (indicesIdx == filterIndicesIdx) {
            newIndices += j
            newValues += values(i)
            j += 1
            i += 1
          } else {
            if (indicesIdx > filterIndicesIdx) {
              j += 1
            } else {
              i += 1
            }
          }
        }
        // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
        Vectors.sparse(newSize, newIndices.result(), newValues.result())
      case DenseVector(_) =>
        val values = features.toArray
        Vectors.dense(filterIndices.map(i => values(i)).toArray)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get
}
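A small sketch of the selector on dense and sparse inputs (indices and values are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ChiSqSelectorModel

object ChiSqSelectorModelExample {
  def main(args: Array[String]): Unit = {
    // Keep only features 1 and 3 of a 4-dimensional input.
    val selector = ChiSqSelectorModel(filterIndices = Seq(1, 3), inputSize = 4)

    println(selector(Vectors.dense(10.0, 20.0, 30.0, 40.0)))           // [20.0,40.0]
    println(selector(Vectors.sparse(4, Array(1, 2), Array(5.0, 7.0)))) // (2,[0],[5.0])
  }
}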
Example 48
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.VectorUtil._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}

  def apply(vector: Vector): Vector = {
    val scale = maxValue - minValue

    // 0 in sparse vector will probably be rescaled to non-zero
    val values = vector.copy.toArray
    val size = values.length
    var i = 0
    while (i < size) {
      if (!values(i).isNaN) {
        val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
        values(i) = raw * scale + minValue
      }
      i += 1
    }
    Vectors.dense(values)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get
}
Example 49
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
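A rough sketch with a two-word vocabulary and 2-dimensional embeddings (all values are illustrative); note that the default kernel divides the summed word vectors by the full sentence length, so unknown tokens dilute the average:

import ml.combust.mleap.core.feature.{WordToVectorKernel, WordToVectorModel}

object WordToVectorModelExample {
  def main(args: Array[String]): Unit = {
    // Embeddings are laid out contiguously in wordVectors: "spark" -> (1.0, 2.0), "mleap" -> (3.0, 4.0).
    val model = WordToVectorModel(
      wordIndex = Map("spark" -> 0, "mleap" -> 1),
      wordVectors = Array(1.0, 2.0, 3.0, 4.0),
      kernel = WordToVectorKernel.Default)

    // Sum of the two known embeddings (4.0, 6.0), scaled by 1 / sentence length (3).
    println(model(Seq("spark", "mleap", "unknown")))
  }
}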
Example 50
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

  def apply(features: Vector): Vector = {
    val norm = Vectors.norm(features, pNorm)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      features match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      features
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get
}
Example 51
Source File: VectorIndexerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.util.NoSuchElementException import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala") case class VectorIndexerModel(numFeatures: Int, categoryMaps: Map[Int, Map[Double, Int]], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model { val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted val localVectorMap = categoryMaps val localNumFeatures = numFeatures val localHandleInvalid = handleInvalid def apply(features: Vector): Vector = predict(features) def predict(features: Vector): Vector = { assert(features.size == localNumFeatures, "VectorIndexerModel expected vector of length" + s" $numFeatures but found length ${features.size}") features match { case dv: DenseVector => var hasInvalid = false val tmpv = dv.copy localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) => try { tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv(featureIndex)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(featureIndex) = categoryMap.size case HandleInvalid.Skip => hasInvalid = true } } } if (hasInvalid) null else tmpv case sv: SparseVector => // We use the fact that categorical value 0 is always mapped to index 0. var hasInvalid = false val tmpv = sv.copy var catFeatureIdx = 0 // index into sortedCatFeatureIndices var k = 0 // index into non-zero elements of sparse vector while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) { val featureIndex = sortedCatFeatureIndices(catFeatureIdx) if (featureIndex < tmpv.indices(k)) { catFeatureIdx += 1 } else if (featureIndex > tmpv.indices(k)) { k += 1 } else { try { tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv.values(k)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(k) = localVectorMap(featureIndex).size case HandleInvalid.Skip => hasInvalid = true } } catFeatureIdx += 1 k += 1 } } if (hasInvalid) null else tmpv } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(localNumFeatures)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(localNumFeatures)).get }
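A minimal sketch of category re-indexing on dense inputs; it assumes HandleInvalid lives in the same ml.combust.mleap.core.feature package, as the unqualified references above suggest, and the maps and values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.{HandleInvalid, VectorIndexerModel}

object VectorIndexerModelExample {
  def main(args: Array[String]): Unit = {
    // Feature 1 is categorical with 10.0 -> 0 and 20.0 -> 1; feature 0 is passed through unchanged.
    val model = VectorIndexerModel(
      numFeatures = 2,
      categoryMaps = Map(1 -> Map(10.0 -> 0, 20.0 -> 1)),
      handleInvalid = HandleInvalid.Keep)

    println(model(Vectors.dense(3.5, 20.0))) // [3.5,1.0]
    println(model(Vectors.dense(3.5, 99.0))) // [3.5,2.0] -- unseen category mapped to categoryMap.size
  }
}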
Example 52
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

  def apply(vector: Vector): Vector = {
    if (mean.nonEmpty) {
      val shift = mean.get.toArray
      val values = vector match {
        // specially handle DenseVector because its toArray does not clone already
        case d: DenseVector => d.values.clone()
        case v: SparseVector => v.toArray
      }
      val size = values.length
      if (std.nonEmpty) {
        val stdDev = std.get
        var i = 0
        while (i < size) {
          values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0
          i += 1
        }
      } else {
        var i = 0
        while (i < size) {
          values(i) -= shift(i)
          i += 1
        }
      }
      Vectors.dense(values)
    } else if (std.nonEmpty) {
      val stdDev = std.get
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0)
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, indices, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0)
            i += 1
          }
          Vectors.sparse(size, indices, values)
      }
    } else {
      throw new IllegalStateException("need to scale with mean and/or with stdev")
    }
  }

  override def inputSchema: StructType = {
    StructType("input" -> TensorType.Double(size)).get
  }

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get
}
Example 53
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
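A minimal sketch applying illustrative IDF weights to dense and sparse term-frequency vectors:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.IDFModel

object IDFModelExample {
  def main(args: Array[String]): Unit = {
    val model = IDFModel(idf = Vectors.dense(0.5, 2.0, 1.0))

    println(model(Vectors.dense(2.0, 1.0, 3.0)))                    // [1.0,2.0,3.0]
    println(model(Vectors.sparse(3, Array(0, 2), Array(2.0, 3.0)))) // (3,[0,2],[1.0,3.0])
  }
}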
Example 54
Source File: GaussianMixtureModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.Utils._ import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian object GaussianMixtureModel { @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala") def computeProbabilities(features: DenseVector, dists: Array[MultivariateGaussian], weights: Array[Double]): Array[Double] = { val p = weights.zip(dists).map { case (weight, dist) => EPSILON + weight * dist.pdf(features) } val pSum = p.sum var i = 0 while (i < weights.length) { p(i) /= pSum i += 1 } p } } case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian], weights: Array[Double]) extends Model { val numClusters = gaussians.length val numFeatures: Int = weights.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { predictionFromProbability(predictProbability(features)) } def predictWithProbability(features: Vector): (Int, Double) = { val probability = predictProbability(features) val index = probability.argmax (index, probability(index)) } def predictionFromProbability(probabilities: Vector): Int = { probabilities.argmax } def predictProbability(features: Vector): Vector = { val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights) Vectors.dense(probs) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable, "probability" -> TensorType.Double(numClusters)).get }
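A rough sketch with two illustrative one-dimensional components; it assumes org.apache.spark.ml.stat.distribution.MultivariateGaussian can be constructed from a mean vector and a covariance matrix, as its use in the case class above suggests:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

object GaussianMixtureModelExample {
  def main(args: Array[String]): Unit = {
    // Two unit-variance Gaussians centred at 0.0 and 5.0, with equal weights.
    val gaussians = Array(
      new MultivariateGaussian(Vectors.dense(0.0), Matrices.dense(1, 1, Array(1.0))),
      new MultivariateGaussian(Vectors.dense(5.0), Matrices.dense(1, 1, Array(1.0))))
    val model = GaussianMixtureModel(gaussians, weights = Array(0.5, 0.5))

    println(model.predict(Vectors.dense(4.5)))                // 1
    println(model.predictWithProbability(Vectors.dense(4.5))) // (1, probability close to 1.0)
  }
}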