org.apache.spark.ml.linalg.Vectors Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Vectors.
The project and source file each example was taken from are noted in its heading.
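Before the full examples, here is a minimal sketch of the Vectors factory methods themselves; the object and value names (VectorsBasics, dense, sparse) are illustrative and not taken from any of the projects below.

import org.apache.spark.ml.linalg.{Vector, Vectors}

object VectorsBasics {
  def main(args: Array[String]): Unit = {
    // Dense vector: every entry is stored explicitly.
    val dense: Vector = Vectors.dense(1.0, 0.0, 3.0)

    // Sparse vector: the size plus (index, value) pairs for the non-zero entries.
    val sparse: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))

    // Equality is by value, so the two representations above compare equal.
    assert(dense == sparse)

    // A few utilities defined on Vectors / Vector.
    println(Vectors.norm(dense, 2.0))      // L2 norm
    println(Vectors.sqdist(dense, sparse)) // squared Euclidean distance (0.0 here)
    println(dense.toArray.mkString(", "))  // 1.0, 0.0, 3.0
  }
}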
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: DCT.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 3
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
Example 4
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.util.TestingUtils._ class MultivariateGaussianSuite extends SparkMLFunSuite { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 5
Source File: AFTSurvivalRegressionExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression
// $example off$
import org.apache.spark.sql.SparkSession

object AFTSurvivalRegressionExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("AFTSurvivalRegressionExample")
      .getOrCreate()

    // $example on$
    val training = spark.createDataFrame(Seq(
      (1.218, 1.0, Vectors.dense(1.560, -0.605)),
      (2.949, 0.0, Vectors.dense(0.346, 2.158)),
      (3.627, 0.0, Vectors.dense(1.380, 0.231)),
      (0.273, 1.0, Vectors.dense(0.520, 1.151)),
      (4.199, 0.0, Vectors.dense(0.795, -0.226))
    )).toDF("label", "censor", "features")
    val quantileProbabilities = Array(0.3, 0.6)
    val aft = new AFTSurvivalRegression()
      .setQuantileProbabilities(quantileProbabilities)
      .setQuantilesCol("quantiles")

    val model = aft.fit(training)

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")
    println(s"Scale: ${model.scale}")
    model.transform(training).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: NormalizerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object NormalizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NormalizerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.5, -1.0)),
      (1, Vectors.dense(2.0, 1.0, 1.0)),
      (2, Vectors.dense(4.0, 10.0, 2.0))
    )).toDF("id", "features")

    // Normalize each Vector using $L^1$ norm.
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(dataFrame)
    println("Normalized using L^1 norm")
    l1NormData.show()

    // Normalize each Vector using $L^\infty$ norm.
    val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
    println("Normalized using L^inf norm")
    lInfNormData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: VectorSlicerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: ChiSqSelectorExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ChiSqSelectorExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ChiSqSelectorExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
      (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
      (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    )

    val df = spark.createDataset(data).toDF("id", "features", "clicked")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    val result = selector.fit(df).transform(df)

    println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
    result.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: DCTExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object DCTExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DCTExample")
      .getOrCreate()

    // $example on$
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0))

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    val dctDf = dct.transform(df)
    dctDf.select("featuresDCT").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 10
Source File: VectorAssemblerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 12
Source File: ElementwiseProductExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ElementwiseProductExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ElementwiseProductExample")
      .getOrCreate()

    // $example on$
    // Create some vector data; also works for sparse vectors
    val dataFrame = spark.createDataFrame(Seq(
      ("a", Vectors.dense(1.0, 2.0, 3.0)),
      ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")

    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector")

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 13
Source File: MinMaxScalerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: PolynomialExpansionExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PolynomialExpansionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PolynomialExpansionExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.dense(2.0, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(3.0, -1.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = polyExpansion.transform(df)
    polyDF.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 15
Source File: MaxAbsScalerExample.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MaxAbsScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MaxAbsScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -8.0)),
      (1, Vectors.dense(2.0, 1.0, -4.0)),
      (2, Vectors.dense(4.0, 10.0, 8.0))
    )).toDF("id", "features")

    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MaxAbsScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [-1, 1]
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
Example 16
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 17
Source File: MaxAbsScalerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 18
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { import testImplicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(8.0), Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(8.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val selector = new ChiSqSelector() .setSelectorType("kbest") .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } val preFilteredData2 = Seq( Vectors.dense(8.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), Vectors.dense(8.0, 9.0) ) val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } test("should support all NumericType labels and not support other types") { val css = new ChiSqSelector() MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector]( css, spark) { (expected, actual) => assert(expected.selectedFeatures === actual.selectedFeatures) } } }
Example 19
Source File: DCTSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 20
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ElementwiseProductSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("read/write") {
    val ep = new ElementwiseProduct()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setScalingVec(Vectors.dense(0.1, 0.2))
    testDefaultReadWrite(ep)
  }
}
Example 21
Source File: BinarizerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 22
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 23
Source File: BinaryClassificationEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class BinaryClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new BinaryClassificationEvaluator) } test("read/write") { val evaluator = new BinaryClassificationEvaluator() .setRawPredictionCol("myRawPrediction") .setLabelCol("myLabel") .setMetricName("areaUnderPR") testDefaultReadWrite(evaluator) } test("should accept both vector and double raw prediction col") { val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") val vectorDF = Seq( (0d, Vectors.dense(12, 2.5)), (1d, Vectors.dense(1, 3)), (0d, Vectors.dense(10, 2)) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(vectorDF) === 1.0) val doubleDF = Seq( (0d, 0d), (1d, 1d), (0d, 0d) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(doubleDF) === 1.0) val stringDF = Seq( (0d, "0d"), (1d, "1d"), (0d, "0d") ).toDF("label", "rawPrediction") val thrown = intercept[IllegalArgumentException] { evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + "equal to one of the following types: [DoubleType, ") assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") } test("should support all NumericType labels and not support other types") { val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction") MLTestingUtils.checkNumericTypes(evaluator, spark) } }
Example 24
Source File: MLSerDeSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.python import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} class MLSerDeSuite extends SparkFunSuite { MLSerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = MLSerDe.loads(MLSerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = MLSerDe.loads(MLSerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = MLSerDe.loads(MLSerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = MLSerDe.loads(MLSerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = MLSerDe.loads(MLSerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } }
Example 25
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 26
Source File: ProbabilisticClassifierSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(values: Double*): Double = { predict(Vectors.dense(values.toArray)) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.5, 0.2)) assert(testModel.friendlyPredict(1.0, 1.0) === 1.0) assert(testModel.friendlyPredict(1.0, 0.2) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(1.0, 2.0) === 1.0) } test("test tiebreak") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.4, 0.4)) assert(testModel.friendlyPredict(0.6, 0.6) === 0.0) } test("test one zero threshold") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.0, 0.1)) assert(testModel.friendlyPredict(1.0, 10.0) === 0.0) assert(testModel.friendlyPredict(0.0, 10.0) === 1.0) } test("bad thresholds") { intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0)) } intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
Example 27
Source File: ANNSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 2) // TODO: add a test for SGD trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20) trainer.setWeights(initialWeights).setStackSize(1) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 28
Source File: GradientSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientSuite extends SparkFunSuite with MLlibTestSparkContext { test("Gradient computation against numerical differentiation") { val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0)) // output must contain zeros and one 1 for SoftMax val target = new BDM[Double](2, 1, Array(0.0, 1.0)) val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false) val layersWithErrors = Seq( new SigmoidLayerWithSquaredError(), new SoftmaxLayerWithCrossEntropyLoss() ) // check all layers that provide loss computation // 1) compute loss and gradient given the model and initial weights // 2) modify weights with small number epsilon (per dimension i) // 3) compute new loss // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient for (layerWithError <- layersWithErrors) { topology.layers(topology.layers.length - 1) = layerWithError val model = topology.model(seed = 12L) val weights = model.weights.toArray val numWeights = weights.size val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0)) val loss = model.computeGradient(input, target, gradient, 1) val eps = 1e-4 var i = 0 val tol = 1e-4 while (i < numWeights) { val originalValue = weights(i) weights(i) += eps val newModel = topology.model(Vectors.dense(weights)) val newLoss = computeLoss(input, target, newModel) val derivativeEstimate = (newLoss - loss) / eps assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " + layerWithError.getClass) weights(i) = originalValue i += 1 } } } private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = { val outputs = model.forward(input) model.layerModels.last match { case layerWithLoss: LossFunction => layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols)) case _ => throw new UnsupportedOperationException("Top layer is required to have loss." + " Failed layer:" + model.layerModels.last.getClass) } } }
Example 29
Source File: LocalWord2VecModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.Word2VecModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel} class LocalWord2VecModel(override val sparkTransformer: Word2VecModel) extends LocalTransformer[Word2VecModel] { lazy val parent: OldWord2VecModel = { val field = sparkTransformer.getClass.getDeclaredField( "org$apache$spark$ml$feature$Word2VecModel$$wordVectors" ) field.setAccessible(true) field.get(sparkTransformer).asInstanceOf[OldWord2VecModel] } private def axpy(a: Double, x: Array[Double], y: Array[Double]) = { y.zipWithIndex.foreach { case (value, index) => y.update(index, x(index) * a + value) } } private def scal(a: Double, v: Array[Double]) = { v.zipWithIndex.foreach { case (value, index) => v.update(index, value * a) } } override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val data = column.data.map(_.asInstanceOf[List[String]]).map { vec => if (vec.isEmpty) { Array .fill(sparkTransformer.getVectorSize)(0.0) .toList } else { val vectors = parent.getVectors .mapValues(v => Vectors.dense(v.map(_.toDouble))) val sum = Array.fill(sparkTransformer.getVectorSize)(0.0) vec.foreach { word => vectors.get(word).foreach { vec => axpy(1.0, vec.toDense.values, sum) } } scal(1.0 / vec.length, sum) sum.toList } } val newColumn = LocalDataColumn(sparkTransformer.getOutputCol, data) localData.withColumn(newColumn) case None => localData } } } object LocalWord2VecModel extends SimpleModelLoader[Word2VecModel] with TypedTransformerConverter[Word2VecModel] { override def build(metadata: Metadata, data: LocalData): Word2VecModel = { val wordVectors = data.column("wordVectors").get.data.head.asInstanceOf[Seq[Float]].toArray val wordIndex = data.column("wordIndex").get.data.head.asInstanceOf[Map[String, Int]] val oldCtor = classOf[OldWord2VecModel].getConstructor(classOf[Map[String, Int]], classOf[Array[Float]]) oldCtor.setAccessible(true) val oldWord2VecModel = oldCtor.newInstance(wordIndex, wordVectors) val ctor = classOf[Word2VecModel].getConstructor(classOf[String], classOf[OldWord2VecModel]) ctor.setAccessible(true) val inst = ctor .newInstance(metadata.uid, oldWord2VecModel) .setInputCol(metadata.paramMap("inputCol").toString) .setOutputCol(metadata.paramMap("outputCol").toString) inst .set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue()) .set(inst.seed, metadata.paramMap("seed").toString.toLong) .set(inst.numPartitions, metadata.paramMap("numPartitions").asInstanceOf[Number].intValue()) .set(inst.stepSize, metadata.paramMap("stepSize").asInstanceOf[Double]) .set( inst.maxSentenceLength, metadata.paramMap("maxSentenceLength").asInstanceOf[Number].intValue() ) .set(inst.windowSize, metadata.paramMap("windowSize").asInstanceOf[Number].intValue()) .set(inst.vectorSize, metadata.paramMap("vectorSize").asInstanceOf[Number].intValue()) } override implicit def toLocal(transformer: Word2VecModel): LocalTransformer[Word2VecModel] = new LocalWord2VecModel(transformer) }
Example 30
Source File: LocalModelSpec22.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving import org.apache.spark.ml.classification._ import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg.Vectors class LocalModelSpec22 extends GenericTestSpec { modelTest( data = session.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0) )).toDF("id", "text", "label"), steps = Seq( new Tokenizer().setInputCol("text").setOutputCol("words"), new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features"), new LogisticRegression().setMaxIter(10).setRegParam(0.01) ), columns = Seq( "prediction" ) ) modelTest( data = session.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text"), steps = Seq( new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) ), columns = Seq( "result" ) ) modelTest( data = session.createDataFrame(Seq( (Vectors.dense(4.0, 0.2, 3.0, 4.0, 5.0), 1.0), (Vectors.dense(3.0, 0.3, 1.0, 4.1, 5.0), 1.0), (Vectors.dense(2.0, 0.5, 3.2, 4.0, 5.0), 1.0), (Vectors.dense(5.0, 0.7, 1.5, 4.0, 5.0), 1.0), (Vectors.dense(1.0, 0.1, 7.0, 4.0, 5.0), 0.0), (Vectors.dense(8.0, 0.3, 5.0, 1.0, 7.0), 0.0) )).toDF("features", "label"), steps = Seq( new LinearSVC() .setMaxIter(10) .setRegParam(0.1) ), columns = Seq( "prediction" ) ) modelTest( data = session.createDataFrame(Seq( (1.0, Double.NaN), (2.0, Double.NaN), (Double.NaN, 3.0), (4.0, 4.0), (5.0, 5.0) )).toDF("a", "b"), steps = Seq( new Imputer() .setInputCols(Array("a", "b")) .setOutputCols(Array("out_a", "out_b")) ), columns = Seq("out_a", "out_b") ) }
Example 32
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import java.lang.Boolean import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors} class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel) extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {} object LocalLogisticRegressionModel extends SimpleModelLoader[LogisticRegressionModel] with TypedTransformerConverter[LogisticRegressionModel] { override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = { val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor( classOf[String], classOf[Matrix], classOf[Vector], classOf[Int], java.lang.Boolean.TYPE ) constructor.setAccessible(true) val coefficientMatrixParams = data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]] val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams) val interceptVectorParams = data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]] val interceptVector = DataUtils.constructVector(interceptVectorParams) constructor .newInstance( metadata.uid, coefficientMatrix, interceptVector, data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer], data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean] ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double]) } override implicit def toLocal( transformer: LogisticRegressionModel ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer) }
Example 33
Source File: LocalCountVectorizerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.CountVectorizerModel import org.apache.spark.ml.linalg.Vectors import scala.collection.mutable class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel) extends LocalTransformer[CountVectorizerModel] { override def transform(localData: LocalData): LocalData = { import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ val dict = sparkTransformer.vocabulary.zipWithIndex.toMap val minTf = sparkTransformer.getMinTF localData.column(sparkTransformer.getInputCol) match { case Some(column) => val newCol = column.data.map(_.asInstanceOf[List[String]]).map { arr => val termCounts = mutable.HashMap.empty[Int, Double] var tokenCount = 0L arr.foreach { token => dict.get(token) foreach { index => val storedValue = termCounts.getOrElseUpdate(index, 0.0) termCounts.update(index, storedValue + 1.0) } tokenCount += 1 } val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf val eCounts = if (sparkTransformer.getBinary) { termCounts filter (_._2 >= eTF) map (_._1 -> 1.0) toSeq } else { termCounts filter (_._2 >= eTF) toSeq } Vectors.sparse(dict.size, eCounts.toList).toList } localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol)) case None => localData } } } object LocalCountVectorizerModel extends SimpleModelLoader[CountVectorizerModel] with TypedTransformerConverter[CountVectorizerModel] { override def build(metadata: Metadata, data: LocalData): CountVectorizerModel = { val vocabulary = data.column("vocabulary").get.data.head.asInstanceOf[Seq[String]].toArray val inst = new CountVectorizerModel(metadata.uid, vocabulary) inst .setInputCol(metadata.paramMap("inputCol").toString) .setOutputCol(metadata.paramMap("outputCol").toString) .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean]) .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble) .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble) .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue()) } override implicit def toLocal( transformer: CountVectorizerModel ) = new LocalCountVectorizerModel(transformer) }
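The heart of the transform above is plain term-frequency counting against a fixed vocabulary, before the minTF/binary filtering is applied. A reduced sketch of the counting step with `ml.linalg.Vectors`; the vocabulary below is purely illustrative.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import scala.collection.mutable

object CountVectorizeSketch {
  // Count tokens that appear in the vocabulary and emit a sparse TF vector.
  def termFrequencies(tokens: Seq[String], vocabulary: Array[String]): Vector = {
    val dict = vocabulary.zipWithIndex.toMap
    val counts = mutable.HashMap.empty[Int, Double]
    tokens.foreach { t =>
      dict.get(t).foreach(i => counts(i) = counts.getOrElse(i, 0.0) + 1.0)
    }
    Vectors.sparse(vocabulary.length, counts.toSeq)
  }

  def main(args: Array[String]): Unit = {
    val vocab = Array("spark", "hadoop", "mapreduce")
    // (3,[0,1],[2.0,1.0]): "spark" twice, "hadoop" once, "flink" is out of vocabulary and ignored
    println(termFrequencies(Seq("spark", "hadoop", "spark", "flink"), vocab))
  }
}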
Example 34
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.PCAModel import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors} import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices} class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix] val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] { override def build(metadata: Metadata, data: LocalData): PCAModel = { val constructor = classOf[PCAModel].getDeclaredConstructor( classOf[String], classOf[DenseMatrix], classOf[DenseVector] ) constructor.setAccessible(true) val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]] val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix] data.column("explainedVariance") match { case Some(ev) => // NOTE: Spark >= 2 val evParams = ev.data.head.asInstanceOf[Map[String, Any]] val explainedVariance = DataUtils.constructVector(evParams).toDense constructor .newInstance(metadata.uid, pcMat, explainedVariance) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) case None => // NOTE: Spark < 2 constructor .newInstance( metadata.uid, pcMat, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] ) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } } override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer) }
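The PCA transform above is a matrix-vector product with the transposed principal-components matrix; the mllib conversion is only there to reuse its `multiply`. A minimal sketch of the same projection kept entirely in the `ml.linalg` API, using an illustrative 3x2 components matrix:

import org.apache.spark.ml.linalg.{Matrices, Vectors}

object PcaProjectionSketch {
  def main(args: Array[String]): Unit = {
    // pc is stored column-major: a projection from 3 features to 2 components (values are illustrative only)
    val pc = Matrices.dense(3, 2, Array(0.5, 0.5, 0.0, 0.0, 0.5, 0.5))
    val x  = Vectors.dense(1.0, 2.0, 3.0)
    // Same operation as the transformer above: y = pc^T * x
    val projected = pc.transpose.multiply(x)
    println(projected) // [1.5,2.5]
  }
}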
Example 35
Source File: LocalPolynomialExpansion.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalPolynomialExpansion(override val sparkTransformer: PolynomialExpansion) extends LocalTransformer[PolynomialExpansion] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val method = classOf[PolynomialExpansion].getMethod("createTransformFunc") val newData = column.data.map(r => { val row = r.asInstanceOf[List[Any]].map(_.toString.toDouble).toArray val vector: Vector = Vectors.dense(row) method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](vector).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPolynomialExpansion extends SimpleModelLoader[PolynomialExpansion] with TypedTransformerConverter[PolynomialExpansion] { override def build(metadata: Metadata, data: LocalData): PolynomialExpansion = { new PolynomialExpansion(metadata.uid) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setDegree(metadata.paramMap("degree").asInstanceOf[Number].intValue()) } override implicit def toLocal( transformer: PolynomialExpansion ) = new LocalPolynomialExpansion(transformer) }
Example 36
Source File: LocalMaxAbsScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel) extends LocalTransformer[MaxAbsScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val maxAbsUnzero = Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x)) val newData = column.data.map(r => { val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d") } val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray) DataUtils.fromBreeze(brz).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMaxAbsScalerModel extends SimpleModelLoader[MaxAbsScalerModel] with TypedTransformerConverter[MaxAbsScalerModel] { override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = { val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]] val maxAbs = DataUtils.constructVector(maxAbsParams) val constructor = classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector]) constructor.setAccessible(true) constructor .newInstance(metadata.uid, maxAbs) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: MaxAbsScalerModel ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer) }
Example 37
Source File: LocalMultilayerPerceptronClassificationModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalMultilayerPerceptronClassificationModel( override val sparkTransformer: MultilayerPerceptronClassificationModel ) extends LocalPredictionModel[MultilayerPerceptronClassificationModel] {} object LocalMultilayerPerceptronClassificationModel extends SimpleModelLoader[MultilayerPerceptronClassificationModel] with TypedTransformerConverter[MultilayerPerceptronClassificationModel] { override def build( metadata: Metadata, data: LocalData ): MultilayerPerceptronClassificationModel = { val layers = data.column("layers").get.data.head.asInstanceOf[Seq[Int]].toArray val weightsParam = data.column("weights").get.data.head.asInstanceOf[Map[String, Any]] val weights = DataUtils.constructVector(weightsParam) val constructor = classOf[MultilayerPerceptronClassificationModel].getDeclaredConstructor( classOf[String], classOf[Array[Int]], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance( metadata.uid, layers, weights ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) } override implicit def toLocal( sparkTransformer: MultilayerPerceptronClassificationModel ): LocalMultilayerPerceptronClassificationModel = { new LocalMultilayerPerceptronClassificationModel(sparkTransformer) } }
Example 38
Source File: LocalNaiveBayes.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.NaiveBayesModel import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors} class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel) extends LocalProbabilisticClassificationModel[NaiveBayesModel] {} object LocalNaiveBayes extends SimpleModelLoader[NaiveBayesModel] with TypedTransformerConverter[NaiveBayesModel] { override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = { val constructor = classOf[NaiveBayesModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Matrix] ) constructor.setAccessible(true) val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]] val matrix = DataUtils.constructMatrix(matrixMetadata) val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]] val piVec = DataUtils.constructVector(piParams) val nb = constructor .newInstance(metadata.uid, piVec, matrix) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue()) nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String]) nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String]) nb } override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = { new LocalNaiveBayes(sparkTransformer) } }
Example 39
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.{Vector, Vectors}

class UCB(
    override val surrogate: Surrogate,
    val beta: Double = 100) extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)
    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    if (s == 0) {
      // if std is zero, we have observed x on all instances;
      // with an RF surrogate, std should never be exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val ucb = m + beta * s
      (ucb, Vectors.dense(new Array[Double](X.size)))
    }
  }
}
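The acquisition value computed above is the standard upper confidence bound, ucb = mean + beta * stddev, returned together with a zero gradient placeholder. A tiny sketch of just that arithmetic, with made-up surrogate outputs:

object UcbSketch {
  // Upper confidence bound for a single candidate point.
  def ucb(mean: Double, variance: Double, beta: Double): Double =
    mean + beta * math.sqrt(variance)

  def main(args: Array[String]): Unit = {
    // e.g. a surrogate predicting mean 0.7 with variance 0.04, and beta = 100 as in the class above
    println(ucb(0.7, 0.04, beta = 100.0)) // 20.7
  }
}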
Example 40
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.linalg.{Vector, Vectors}

class EI(
    override val surrogate: Surrogate,
    val par: Double) extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)
    // Use the best observation seen so far as the incumbent
    val eta: Double = surrogate.curBest._2
    //println(s"best seen result: $eta")
    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")
    if (s == 0) {
      // if std is zero, we have observed x on all instances;
      // with an RF surrogate, std should never be exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val z = (pred._1 - eta - par) / s
      val norm: NormalDistribution = new NormalDistribution
      val cdf: Double = norm.cumulativeProbability(z)
      val pdf: Double = norm.density(z)
      val ei = s * (z * cdf + pdf)
      //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
      (ei, Vectors.dense(new Array[Double](X.size)))
    }
  }
}
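The value returned above is expected improvement, EI = s * (z * CDF(z) + PDF(z)) with z = (mean - best - par) / s, where `best` is the incumbent observation. A standalone sketch of that computation with commons-math3, on illustrative inputs:

import org.apache.commons.math3.distribution.NormalDistribution

object EiSketch {
  // Expected improvement as computed in the class above:
  // z = (mean - best - par) / std, EI = std * (z * CDF(z) + PDF(z)); zero std yields no improvement.
  def ei(mean: Double, variance: Double, best: Double, par: Double): Double = {
    val s = math.sqrt(variance)
    if (s == 0.0) 0.0
    else {
      val z = (mean - best - par) / s
      val norm = new NormalDistribution()
      s * (z * norm.cumulativeProbability(z) + norm.density(z))
    }
  }

  def main(args: Array[String]): Unit = {
    // surrogate predicts mean 0.8 and variance 0.09; best seen value 0.7, no exploration margin
    println(ei(mean = 0.8, variance = 0.09, best = 0.7, par = 0.0))
  }
}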
Example 41
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction import org.apache.spark._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.max import org.apache.spark.ml.Pipeline import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql._ import org.apache.spark.sql.Dataset import org.apache.spark.ml.linalg.{ Matrix, Vectors } import org.apache.spark.ml.stat.Correlation import org.apache.spark.sql.Row object Describe { case class CustomerAccount(state_code: String, account_length: Integer, area_code: String, international_plan: String, voice_mail_plan: String, num_voice_mail: Double, total_day_mins: Double, total_day_calls: Double, total_day_charge: Double, total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double, total_night_mins: Double, total_night_calls: Double, total_night_charge: Double, total_international_mins: Double, total_international_calls: Double, total_international_charge: Double, total_international_num_calls: Double, churn: String) val schema = StructType(Array( StructField("state_code", StringType, true), StructField("account_length", IntegerType, true), StructField("area_code", StringType, true), StructField("international_plan", StringType, true), StructField("voice_mail_plan", StringType, true), StructField("num_voice_mail", DoubleType, true), StructField("total_day_mins", DoubleType, true), StructField("total_day_calls", DoubleType, true), StructField("total_day_charge", DoubleType, true), StructField("total_evening_mins", DoubleType, true), StructField("total_evening_calls", DoubleType, true), StructField("total_evening_charge", DoubleType, true), StructField("total_night_mins", DoubleType, true), StructField("total_night_calls", DoubleType, true), StructField("total_night_charge", DoubleType, true), StructField("total_international_mins", DoubleType, true), StructField("total_international_calls", DoubleType, true), StructField("total_international_charge", DoubleType, true), StructField("total_international_num_calls", DoubleType, true), StructField("churn", StringType, true))) def main(args: Array[String]) { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName("Desribe") .getOrCreate() spark.conf.set("spark.debug.maxToStringFields", 10000) val DEFAULT_MAX_TO_STRING_FIELDS = 2500 if (SparkEnv.get != null) { SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS) } else { DEFAULT_MAX_TO_STRING_FIELDS } import spark.implicits._ val trainSet: Dataset[CustomerAccount] = spark.read. 
option("inferSchema", "false") .format("com.databricks.spark.csv") .schema(schema) .load("data/churn-bigml-80.csv") .as[CustomerAccount] val statsDF = trainSet.describe() statsDF.show() trainSet.createOrReplaceTempView("UserAccount") spark.catalog.cacheTable("UserAccount") spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show() spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show() trainSet.groupBy("churn").count.show() spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn") } }
Example 42
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.Estimator import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.DecisionTreeRegressor import org.apache.spark.mllib.tree.DecisionTreeSuite import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.DataFrame private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val distribTree = setParams(new DecisionTreeRegressor(), testParams) val localTree = setParams(new LocalDecisionTreeRegressor(), testParams) val localModel = localTree.fit(train) val model = distribTree.fit(train) OptimizedTreeTests.checkEqual(model, localModel) } test("Local & distributed training produce the same tree on a toy dataset") { val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a slightly larger toy dataset") { val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce the same tree on a larger toy dataset") { val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce same tree on a dataset of categorical features") { val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances()) // Create a map of categorical feature index to arity; each feature has arity nclasses val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3) // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its // categorical features val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a dataset of continuous features") { val sqlContext = spark.sqlContext import sqlContext.implicits._ // Use maxDepth = 5 and default params val params = medDepthTreeSettings val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext, nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2) .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray))) .toDF().cache() testEquivalence(data, params) } test("Local & distributed training produce the same tree on a dataset of constant features") { // Generate constant, continuous data val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } }
Example 43
Source File: LocalTreeUnitSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tree._ import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext def deepTreeTest(depth: Int): Unit = { val deepTreeData = OptimizedTreeTests.deepTreeData(sc, depth) val df = spark.createDataFrame(deepTreeData) // Construct estimators; single-tree random forest & decision tree regressor. val localTree = new LocalDecisionTreeRegressor() .setFeaturesCol("features") // indexedFeatures .setLabelCol("label") .setMaxDepth(depth) .setMinInfoGain(0.0) // Fit model, check depth... val localModel = localTree.fit(df) assert(localModel.rootNode.subtreeDepth == depth) } // Test small depth tree deepTreeTest(10) // Test medium depth tree deepTreeTest(40) // Test high depth tree deepTreeTest(200) } }
Example 44
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.Estimator import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier} import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor} import org.apache.spark.mllib.tree.DecisionTreeSuite import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.DataFrame private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val oldTree = setParams(new DecisionTreeRegressor(), testParams) val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams) val newModel = newTree.fit(train) val oldModel = oldTree.fit(train) OptimizedTreeTests.checkEqual(oldModel, newModel) } private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val oldTree = setParams(new DecisionTreeClassifier(), testParams) val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams) val newModel = newTree.fit(train) val model = oldTree.fit(train) OptimizedTreeTests.checkEqual(model, newModel) } test("Local & distributed training produce the same tree on a toy dataset") { val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree with two feature values") { val data = sc.parallelize(Range(0, 8).map(x => { if (x > 3) { Instance(x, 1.0, Vectors.dense(0.0)) } else { Instance(x, 1.0, Vectors.dense(1.0)) }})) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a slightly larger toy dataset") { val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce the same tree on a larger toy dataset") { val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce same tree on a dataset of categorical features") { val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances()) // Create a map of categorical feature index to arity; each feature has arity nclasses val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3) // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its // categorical features val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a dataset of continuous features") { val sqlContext = spark.sqlContext import sqlContext.implicits._ // Use maxDepth = 5 and default params val params = medDepthTreeSettings val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext, nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2) .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray))) 
.toDF().cache() testEquivalence(data, params) } test("Local & distributed training produce the same tree on a dataset of constant features") { // Generate constant, continuous data val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } }
Example 45
Source File: VSoftmaxRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import scala.language.existentials class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ private val seed = 42 @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { super.beforeAll() multinomialDataset = { val nPoints = 50 val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) df.cache() println("softmax test data:") df.show(10, false) df } } test("test on multinomialDataset") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (standardization <- Seq(false, true)) { for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) { println() println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val trainer = new LogisticRegression() .setFamily("multinomial") .setStandardization(standardization) .setWeightCol("weight") .setRegParam(reg) .setFitIntercept(false) .setElasticNetParam(elasticNet) val model = trainer.fit(multinomialDataset) val vtrainer = new VSoftmaxRegression() .setColsPerBlock(2) .setRowsPerBlock(5) .setColPartitions(2) .setRowPartitions(3) .setWeightCol("weight") .setGeneratingFeatureMatrixBuffer(2) .setStandardization(standardization) .setRegParam(reg) .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(multinomialDataset) println(s"VSoftmaxRegression coefficientMatrix:\n" + s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" + s"ml.SoftmaxRegression coefficientMatrix:\n" + s"${model.coefficientMatrix}\n") assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps) } } } }
Example 46
Source File: DistributedVectorSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg.distributed import breeze.linalg.{DenseVector => BDV, norm => Bnorm} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.ml.util.VUtils import org.apache.spark.mllib.util.MLlibTestSparkContext class DistributedVectorSuite extends SparkFunSuite with MLlibTestSparkContext { var BV1: BDV[Double] = null var BV2: BDV[Double] = null var BV3: BDV[Double] = null var BV4: BDV[Double] = null var DV1: DistributedVector = null var DV2: DistributedVector = null var DV3: DistributedVector = null var DV4: DistributedVector = null override def beforeAll(): Unit = { super.beforeAll() val v1: Array[Double] = Seq(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0).toArray val v2: Array[Double] = Seq(-1.0, -2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0).toArray val v3: Array[Double] = Seq(-1.0, -2.0, -3.0, 5.0, -6.0, 7.0, 8.0, 9.0).toArray val v4: Array[Double] = Seq(0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 8.0, 0.0).toArray BV1 = new BDV(v1) BV2 = new BDV(v2) BV3 = new BDV(v3) BV4 = new BDV(v4) val sizePerPart = 3 val numPartitions = VUtils.getNumBlocks(sizePerPart, v1.length) DV1 = VUtils.splitArrIntoDV(sc, v1, sizePerPart, numPartitions).persist() DV2 = VUtils.splitArrIntoDV(sc, v2, sizePerPart, numPartitions).persist() DV3 = VUtils.splitArrIntoDV(sc, v3, sizePerPart, numPartitions).persist() DV4 = VUtils.splitArrIntoDV(sc, v4, sizePerPart, numPartitions).persist() } test("toLocal") { val localDV1 = DV1.toLocal assert(localDV1 ~== Vectors.fromBreeze(BV1) relTol 1e-8) val localDV4 = DV4.compressed.toLocal assert(localDV4 ~== Vectors.fromBreeze(BV4) relTol 1e-8) } test("add") { val local1 = DV1.add(2.0).persist().toLocal val local2 = DV1.add(DV2).persist().toLocal assert(local1 ~== Vectors.fromBreeze(BV1 + 2.0) relTol 1e-8) assert(local2 ~== Vectors.fromBreeze(BV1 + BV2) relTol 1e-8) } test("scale") { val local1 = DV1.scale(2.0).persist().toLocal assert(local1 ~== Vectors.fromBreeze(BV1 * 2.0) relTol 1e-8) } test("addScaledVector") { val res = DV1.addScaledVector(3.0, DV2).persist().toLocal assert(res ~== Vectors.fromBreeze(BV1 + (BV2 * 3.0)) relTol 1e-8) } test("dot") { val dotVal = DV1.dot(DV2) val bDotVal = BV1.dot(BV2) assert(dotVal ~== bDotVal relTol 1e-8) } test("norm") { assert(DV1.norm ~== Bnorm(BV1) relTol 1e-8) } test("combine") { val combined = DistributedVectors.combine( (10.0, DV1), (100.0, DV2), (18.0, DV3) ).persist().toLocal val bCombined = (BV1 * 10.0) + (BV2 * 100.0) + (BV3 * 18.0) assert(combined ~== Vectors.fromBreeze(bCombined) relTol 1e-8) } test("zeros") { var res1 = VUtils.zipRDDWithPartitionIDAndCollect( DistributedVectors.zeros(sc, 3, 2, 5).values) var res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0))) assert(res1 === res2) res1 = VUtils.zipRDDWithPartitionIDAndCollect( DistributedVectors.zeros(sc, 3, 2, 7, 1.5).values) res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0, 0.0, 1.5))) assert(res1 === res2) } }
Example 47
Source File: VLinearRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame class VLinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ var datasetWithWeight: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() datasetWithWeight = sc.parallelize(Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(29.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2).toDF() } test("test on datasetWithWeight") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (fitIntercept <- Seq(false, true)) { for (standardization <- Seq(false, true)) { for ((reg, elasticNet)<- Seq((0.0, 0.0), (2.3, 0.0), (2.3, 0.5))) { println() println(s"# test ${b2s(fitIntercept)} intercept, ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val vtrainer = new VLinearRegression() .setColsPerBlock(1) .setRowsPerBlock(1) .setGeneratingFeatureMatrixBuffer(2) .setFitIntercept(fitIntercept) .setStandardization(standardization) .setRegParam(reg) .setWeightCol("weight") .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(datasetWithWeight) // Note that in ml.LinearRegression, when datasets numInstanse is small // solver l-bfgs and solver normal will generate slightly different result when reg not zero // because there std calculation result have multiple difference numInstance/(numInstance - 1) // here test keep consistent with l-bfgs solver val trainer = new LinearRegression() .setSolver("l-bfgs") // by default it may use noraml solver so here force set it. .setFitIntercept(fitIntercept) .setStandardization(standardization) .setRegParam(reg) .setWeightCol("weight") .setElasticNetParam(elasticNet) val model = trainer.fit(datasetWithWeight) logInfo(s"LinearRegression total iterations: ${model.summary.totalIterations}") println(s"VLinearRegression coefficients: ${vmodel.coefficients.toDense}, intercept: ${vmodel.intercept}\n" + s"LinearRegression coefficients: ${model.coefficients.toDense}, intercept: ${model.intercept}") def filterSmallValue(v: Vector) = { Vectors.dense(v.toArray.map(x => if (math.abs(x) < 1e-6) 0.0 else x)) } assert(filterSmallValue(vmodel.coefficients) ~== filterSmallValue(model.coefficients) relTol 1e-3) assert(vmodel.intercept ~== model.intercept relTol 1e-3) } } } } }
Example 48
Source File: ReebDiagram.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.feature.{ReebDiagram, VectorAssembler} import org.apache.spark.sql.functions._ def computeReebDiagram( pathToTextFile: String, quantity: Int, linkThresholdRatio: Double, coreThresholdRatio: Double, topTreeRatio: Double) { def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-REEB-k${quantity}-l${linkThresholdRatio}-c${coreThresholdRatio}-i${topTreeRatio}.tsv" val points = sc.textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .zipWithIndex .map { case (row, i) => (i, row(0).toDouble, row(1).toDouble, 0) } .toDF("id", "x", "y", "cover_id") val cardinality = points.count val assembler = new VectorAssembler() .setInputCols(Array("x", "y")) .setOutputCol("feature") val features = assembler .transform(points) val reeb = new ReebDiagram() .setK(quantity) .setLinkThresholdRatio(linkThresholdRatio) .setCoreThresholdRatio(coreThresholdRatio) .setTopTreeSize((topTreeRatio * cardinality).toInt) .setTopTreeLeafSize(quantity) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("feature") .setOutputCol("cluster_id") val transformed = reeb .fit(features) .transform(features) val clusters = Map( transformed .select("cluster_id") .rdd .map(row => row.getLong(0)) .distinct .zipWithIndex .collect(): _*) val result = transformed .select("x", "y", "cluster_id") .rdd .map(row => (row.getDouble(0), row.getDouble(1), row.getLong(2))) .map { case (x, y, clusterId) => (x, y, clusters(clusterId) + 1)} .collect() save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => result.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }
Example 49
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.{SparkSession, DataFrame} import org.apache.spark.sql.types.{ StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType } import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.PropSpec import com.holdenkarau.spark.testing.{ SharedSparkContext, DataframeGenerator, Column } abstract class FeaturePropSpec extends PropSpec with SharedSparkContext with DefaultReadWriteTest { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector] )) lazy val spark = SparkSession.builder().getOrCreate() def schema = StructType( List( StructField("integer", IntegerType), StructField("double", DoubleType), StructField("boolean", BooleanType), StructField("string", StringType) )) def integerGen = new Column("integer", Gen.choose(-100, 100)) def doubleGen = new Column("double", Gen.choose(-100.0, 100.0)) def stringGen = new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO")) def dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields( spark.sqlContext, schema)(integerGen, doubleGen, stringGen) def hasDistinctValues(df: DataFrame, columns: String*): Boolean = { columns.foldLeft(true) { (acc, col) => acc && df.select(col).distinct.count() > 1 } } }
Example 50
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector} import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class ReebDiagramTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") val cover = new Cover() .setExploding(true) .setInputCols("double", "integer") .setOutputCol("cover_id") property("argument topTreeSize must be positive") { intercept[IllegalArgumentException] { val reeb = new ReebDiagram() // .setIdCol("id") // .setCoverCol("cover_id") // .setFeaturesCol("vector") // .setOutputCol("cluster_id") .setTopTreeSize(0) } } property("placeholder") { val reeb = new ReebDiagram() .setK(15) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("vector") .setOutputCol("cluster_id") forAll(dataframeGen.arbitrary) { df => val assembled = assembler.transform(df) whenever( assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) { val transformed = cover .fit(assembled) .transform(assembled) val result = reeb .setTopTreeSize(1) .fit(transformed) .transform(transformed) // result.show() } } } }
Example 51
Source File: CoverTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class CoverTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") property("argument numSplits must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setNumSplits(0) } } property("argument overlapRatio must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setOverlapRatio(0.0) } } property("cover estimator changes nothing with the original dataframe") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { val covered = cover .fit(transformed) .transform(transformed) .drop("cover_ids") .except(transformed) .count() should be(0) } } } property("generated cover covers all range of specified columns") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") val uncovered = udf { xs: Seq[Long] => xs.length == 0 } forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { cover .fit(transformed) .transform(transformed) .where(uncovered(col("cover_ids"))) .count() should be(0) } } } property("Cover is readable/writable") { val cover = new Cover() .setInputCols("double", "integer") .setOutputCol("cover_ids") testDefaultReadWrite(cover) } property("CoverModel is readable/writable") { val model = new CoverModel("myCoverModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCols("double", "integer") .setOutputCol("cover_ids") val newModel = testDefaultReadWrite(model) assert(newModel.min === model.min) assert(newModel.max === model.max) } }
Example 52
Source File: PartitionersTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class PartitionersTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTreesPartitioner can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val partitioner = new TopTreesPartitioner(TopTrees(IndexedSeq.empty[(Int, Tree)])) val vector = VectorEntry(0L, v) intercept[NoSuchElementException] { partitioner.getPartition((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val partitioner = new TopTreesPartitioner(TopTrees(indexedTrees)) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => partitioner.getPartition((index, entry)) } .toSet indices should contain theSameElementsAs (0 until partitioner.numPartitions) .toSet (0 until partitioner.numPartitions).toSet should contain theSameElementsAs indices intercept[IllegalArgumentException] { partitioner.getPartition(0) } } } }
Example 53
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import scala.reflect.ClassTag import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Gen.{choose, oneOf} import org.scalatest.PropSpec import org.apache.spark.ml.linalg.{ CosineDistance, EuclideanDistance, ManhattanDistance, JaccardDistance, HammingDistance } import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors} import com.holdenkarau.spark.testing.SharedSparkContext abstract class KNNPropSpec extends PropSpec with SharedSparkContext { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitrarySparseVector: Arbitrary[SparseVector] = Arbitrary { for (vec <- arbitrary[DenseVector]) yield vec.toSparse } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector], 1 -> arbitrary[SparseVector] )) private def arraysOfNM[T: ClassTag](numRows: Int, numCols: Int, gen: Gen[T]): Gen[Array[Array[T]]] = Gen.listOfN(numRows * numCols, gen).map { square => square.toArray.grouped(numCols).toArray } private def vectorsOfNM(numRows: Int, numCols: Int, gen: Gen[Double]): Gen[Array[DenseVector]] = for { arrays <- arraysOfNM(numRows, numCols, gen) } yield arrays.map(arr => new DenseVector(arr)) val treeGen = for { measure <- oneOf(CosineDistance, EuclideanDistance, ManhattanDistance, HammingDistance, JaccardDistance) numVectors <- choose(1, 100) vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0)) } yield vectors .scanLeft(Seq[Vector]())(_ :+ _) .tail .map( vs => VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10)) }
Example 54
Source File: IndicesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class IndicesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTrees can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val topTrees = TopTrees(IndexedSeq.empty[(Int, Tree)]) val vector = VectorEntry(0L, v) topTrees.get((coverId, vector)) shouldBe None topTrees.isDefinedAt((coverId, vector)) shouldBe false intercept[NoSuchElementException] { topTrees((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val topTrees = TopTrees(indexedTrees) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => topTrees((index, entry)) } .toSet indices should contain theSameElementsAs (0 until topTrees.numIndices) .toSet (0 until topTrees.numIndices).toSet should contain theSameElementsAs indices } } }
Example 55
Source File: TreesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class TreesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("VPTree can be constructed with empty data") { forAll { (v: Vector) => val tree = VPTree(IndexedSeq.empty[VectorWithId], EuclideanDistance, 0, 0) val vector = VectorEntry(0L, v) tree.iterator shouldBe empty tree.query(vector) shouldBe empty tree.numLeaves shouldBe 0 } } property("VPTree can be constructed with data not having any duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (-5 to 5).flatMap { i => (-5 to 5).map { j => VectorEntry(0L, Vectors.dense(i, j)) } } List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize => val tree = VPTree(data, EuclideanDistance, 1, 1, leafSize) tree.size shouldBe data.size tree.iterator.toIterable should contain theSameElementsAs data data.foreach(v => tree.query(v, 1).head._1 shouldBe v) tree .query(origin, 5) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, 0), Vectors.dense(1, 0), Vectors.dense(0, -1), Vectors.dense(0, 1), Vectors.dense(0, 0) ) tree .query(origin, 9) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, -1), Vectors.dense(-1, 0), Vectors.dense(-1, 1), Vectors.dense(0, -1), Vectors.dense(0, 0), Vectors.dense(0, 1), Vectors.dense(1, -1), Vectors.dense(1, 0), Vectors.dense(1, 1) ) tree.numLeaves shouldBe (tree.cardinality / leafSize.toDouble).ceil } } property("VPTree can be constructed with data having duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))) .map(VectorEntry(0L, _)) val tree = VPTree(data, EuclideanDistance, 6, 6) val knn = tree.query(origin, 5) tree.numLeaves shouldBe 2 knn.size shouldBe 5 knn.map(_._1.vector).toSet should contain theSameElementsAs Array( Vectors.dense(0.0, 1.0)) } }
Example 56
Source File: MleapNodeWrapper.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.tree.clustering import ml.bundle.ctree.Node import ml.combust.bundle.tree.cluster.NodeWrapper import ml.combust.mleap.core.clustering.ClusteringTreeNode import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm object MleapNodeWrapper extends NodeWrapper[ClusteringTreeNode] { override def node(n: ClusteringTreeNode): Node = { Node(index = n.index, norm = n.centerWithNorm.norm, values = n.centerWithNorm.vector.toArray.toSeq, numChildren = n.children.length) } override def children(n: ClusteringTreeNode): Array[ClusteringTreeNode] = n.children override def create(node: Node, children: Seq[ClusteringTreeNode]): ClusteringTreeNode = { ClusteringTreeNode(index = node.index, centerWithNorm = VectorWithNorm(Vectors.dense(node.values.toArray), node.norm), children = children.toArray) } }
Example 57
Source File: ElementwiseProductOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.ElementwiseProductModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors class ElementwiseProductOp extends MleapOp[ElementwiseProduct, ElementwiseProductModel] { override val Model: OpModel[MleapContext, ElementwiseProductModel] = new OpModel[MleapContext, ElementwiseProductModel] { override val klazz: Class[ElementwiseProductModel] = classOf[ElementwiseProductModel] override def opName: String = Bundle.BuiltinOps.feature.elementwise_product override def store(model: Model, obj: ElementwiseProductModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("scaling_vec", Value.vector(obj.scalingVec.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): ElementwiseProductModel = { ElementwiseProductModel(scalingVec = Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray)) } } override def model(node: ElementwiseProduct): ElementwiseProductModel = node.model }
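The op above only stores and restores the scaling vector; at transform time each input coordinate is multiplied by the matching entry of `scalingVec`. A minimal sketch of that element-wise product (names are illustrative):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object ElementwiseProductSketch {
  // Multiply the input coordinate-wise by the scaling vector.
  def elementwiseProduct(scalingVec: Vector, input: Vector): Vector =
    Vectors.dense(input.toArray.zip(scalingVec.toArray).map { case (x, w) => x * w })

  def main(args: Array[String]): Unit = {
    val scaling = Vectors.dense(0.0, 1.0, 2.0)
    println(elementwiseProduct(scaling, Vectors.dense(4.0, 5.0, 6.0))) // [0.0,5.0,12.0]
  }
}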
Example 58
Source File: BucketedRandomProjectionLSHOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.BucketedRandomProjectionLSH import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors class BucketedRandomProjectionLSHOp extends MleapOp[BucketedRandomProjectionLSH, BucketedRandomProjectionLSHModel] { override val Model: OpModel[MleapContext, BucketedRandomProjectionLSHModel] = new OpModel[MleapContext, BucketedRandomProjectionLSHModel] { override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel] override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh override def store(model: Model, obj: BucketedRandomProjectionLSHModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randomUnitVectors.map(v => Tensor.denseVector(v.toArray)))). withValue("bucket_length", Value.double(obj.bucketLength)). withValue("input_size", Value.int(obj.inputSize)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): BucketedRandomProjectionLSHModel = { val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense) BucketedRandomProjectionLSHModel(randomUnitVectors = ruv, bucketLength = model.value("bucket_length").getDouble, inputSize = model.value("input_size").getInt) } } override def model(node: BucketedRandomProjectionLSH): BucketedRandomProjectionLSHModel = node.model }
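At transform time the stored random unit vectors feed the usual bucketed random projection hash, one value per vector: floor(<x, v> / bucketLength). A short sketch of that hashing step under those assumptions; the vectors and bucket length below are illustrative.

import org.apache.spark.ml.linalg.{Vector, Vectors}

object RandomProjectionHashSketch {
  // One hash value per random unit vector: floor(<x, v> / bucketLength).
  def hash(x: Vector, randomUnitVectors: Seq[Vector], bucketLength: Double): Array[Double] =
    randomUnitVectors.map { v =>
      val dot = x.toArray.zip(v.toArray).map { case (a, b) => a * b }.sum
      math.floor(dot / bucketLength)
    }.toArray

  def main(args: Array[String]): Unit = {
    val units = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0))
    println(hash(Vectors.dense(3.2, -1.5), units, bucketLength = 2.0).mkString(", ")) // 1.0, -1.0
  }
}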
Example 59
Source File: MaxAbsScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.MaxAbsScalerModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.MaxAbsScaler import org.apache.spark.ml.linalg.Vectors class MaxAbsScalerOp extends MleapOp[MaxAbsScaler, MaxAbsScalerModel]{ override val Model: OpModel[MleapContext, MaxAbsScalerModel] = new OpModel[MleapContext, MaxAbsScalerModel] { override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel] override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler override def store(model: Model, obj: MaxAbsScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MaxAbsScalerModel = { MaxAbsScalerModel(maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray)) } } override def model(node: MaxAbsScaler): MaxAbsScalerModel = node.model }
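The stored `maxAbs` vector is applied by dividing every feature by its maximum absolute value, with zero entries treated as one to avoid division by zero (the same convention as the local transformer in Example 36). A brief sketch:

import org.apache.spark.ml.linalg.{Vector, Vectors}

object MaxAbsScaleSketch {
  // Scale each feature into [-1, 1]; a zero maxAbs entry is replaced by 1 to avoid division by zero.
  def scale(maxAbs: Vector, input: Vector): Vector = {
    val safeMax = maxAbs.toArray.map(m => if (m == 0.0) 1.0 else m)
    Vectors.dense(input.toArray.zip(safeMax).map { case (x, m) => x / m })
  }

  def main(args: Array[String]): Unit = {
    println(scale(Vectors.dense(4.0, 1.0, 10.0), Vectors.dense(2.0, 3.0, -5.0))) // [0.5,3.0,-0.5]
  }
}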
Example 60
Source File: IDFOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.IDFModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.IDF import org.apache.spark.ml.linalg.Vectors class IDFOp extends MleapOp[IDF, IDFModel] { override val Model: OpModel[MleapContext, IDFModel] = new OpModel[MleapContext, IDFModel] { override val klazz: Class[IDFModel] = classOf[IDFModel] override def opName: String = Bundle.BuiltinOps.feature.idf override def store(model: Model, obj: IDFModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("idf", Value.vector(obj.idf.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): IDFModel = { IDFModel(idf = Vectors.dense(model.value("idf").getTensor[Double].toArray)) } } override def model(node: IDF): IDFModel = node.model }
Example 61
Source File: StandardScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.mleap.core.feature.StandardScalerModel import ml.combust.mleap.runtime.transformer.feature.StandardScaler import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class StandardScalerOp extends MleapOp[StandardScaler, StandardScalerModel] { override val Model: OpModel[MleapContext, StandardScalerModel] = new OpModel[MleapContext, StandardScalerModel] { override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel] override def opName: String = Bundle.BuiltinOps.feature.standard_scaler override def store(model: Model, obj: StandardScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("mean", obj.mean.map(_.toArray).map(Value.vector[Double])). withValue("std", obj.std.map(_.toArray).map(Value.vector[Double])) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): StandardScalerModel = { val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense) val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense) StandardScalerModel(mean = mean, std = std) } } override def model(node: StandardScaler): StandardScalerModel = node.model }
Example 62
Source File: MinMaxScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.MinMaxScalerModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors class MinMaxScalerOp extends MleapOp[MinMaxScaler, MinMaxScalerModel]{ override val Model: OpModel[MleapContext, MinMaxScalerModel] = new OpModel[MleapContext, MinMaxScalerModel] { override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel] override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler override def store(model: Model, obj: MinMaxScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("min", Value.vector(obj.originalMin.toArray)). withValue("max", Value.vector(obj.originalMax.toArray)) .withValue("minValue", Value.double(obj.minValue)) .withValue("maxValue", Value.double(obj.maxValue)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MinMaxScalerModel = { val minValue = model.getValue("minValue").map(_.getDouble).getOrElse(0.0) val maxValue = model.getValue("maxValue").map(_.getDouble).getOrElse(1.0) MinMaxScalerModel(originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray), originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray), minValue = minValue, maxValue = maxValue ) } } override def model(node: MinMaxScaler): MinMaxScalerModel = node.model }
Example 63
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.clustering.GaussianMixtureModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.clustering.GaussianMixture import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian class GaussianMixtureOp extends MleapOp[GaussianMixture, GaussianMixtureModel] { override val Model: OpModel[MleapContext, GaussianMixtureModel] = new OpModel[MleapContext, GaussianMixtureModel] { override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel] override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture override def store(model: Model, obj: GaussianMixtureModel) (implicit context: BundleContext[MleapContext]): Model = { val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip model.withValue("means", Value.tensorList(means.map(m => Tensor.denseVector(m.toArray)))). withValue("covs", Value.tensorList(covs.map(c => DenseTensor(c.toArray, Seq(c.numRows, c.numCols))))). withValue("weights", Value.doubleList(obj.weights.toSeq)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): GaussianMixtureModel = { val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray)) val covs = model.value("covs").getTensorList[Double].map { values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray) } val gaussians = means.zip(covs).map { case (mean, cov) => new MultivariateGaussian(mean, cov) }.toArray val weights = model.value("weights").getDoubleList.toArray GaussianMixtureModel(gaussians, weights) } } override def model(node: GaussianMixture): GaussianMixtureModel = node.model }
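The load method above rebuilds each mixture component from a mean vector and a covariance matrix. A minimal sketch of constructing such a model directly, using only the constructors that appear in load(); the means, covariances, and weights are illustrative values, not taken from the example:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

// two 2-dimensional components with identity covariance (Matrices.dense is column-major)
val g0 = new MultivariateGaussian(Vectors.dense(0.0, 0.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
val g1 = new MultivariateGaussian(Vectors.dense(5.0, 5.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))

// same shape of call as in load(): GaussianMixtureModel(gaussians, weights)
val gmm = GaussianMixtureModel(Array(g0, g1), Array(0.5, 0.5))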
Example 64
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.clustering.KMeansModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.clustering.KMeans import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors class KMeansOp extends MleapOp[KMeans, KMeansModel] { override val Model: OpModel[MleapContext, KMeansModel] = new OpModel[MleapContext, KMeansModel] { override val klazz: Class[KMeansModel] = classOf[KMeansModel] override def opName: String = Bundle.BuiltinOps.clustering.k_means override def store(model: Model, obj: KMeansModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.vector.toArray)))) .withValue("num_features", Value.long(obj.numFeatures)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): KMeansModel = { val numFeatures = model.value("num_features").getLong.toInt KMeansModel(model.value("cluster_centers").getTensorList[Double].map(t => Vectors.dense(t.toArray)), numFeatures) } } override def model(node: KMeans): KMeansModel = node.model }
Example 65
Source File: MultiLayerPerceptronClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.classification.MultiLayerPerceptronClassifier import org.apache.spark.ml.linalg.Vectors class MultiLayerPerceptronClassifierOp extends MleapOp[MultiLayerPerceptronClassifier, MultiLayerPerceptronClassifierModel] { override val Model: OpModel[MleapContext, MultiLayerPerceptronClassifierModel] = new OpModel[MleapContext, MultiLayerPerceptronClassifierModel] { override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier override val klazz: Class[MultiLayerPerceptronClassifierModel] = classOf[MultiLayerPerceptronClassifierModel] override def store(model: Model, obj: MultiLayerPerceptronClassifierModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("layers", Value.longList(obj.layers.map(_.toLong))). withValue("weights", Value.vector(obj.weights.toArray)). withValue("thresholds", obj.thresholds.map(Value.doubleList(_))) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MultiLayerPerceptronClassifierModel = { MultiLayerPerceptronClassifierModel(layers = model.value("layers").getLongList.map(_.toInt), weights = Vectors.dense(model.value("weights").getTensor[Double].toArray), thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } } override def model(node: MultiLayerPerceptronClassifier): MultiLayerPerceptronClassifierModel = node.model }
Example 66
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel, ProbabilisticLogisticsRegressionModel} import ml.combust.mleap.runtime.transformer.classification.LogisticRegression import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.{Matrices, Vectors} class LogisticRegressionOp extends MleapOp[LogisticRegression, LogisticRegressionModel] { private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5 override val Model: OpModel[MleapContext, LogisticRegressionModel] = new OpModel[MleapContext, LogisticRegressionModel] { override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel] override def opName: String = Bundle.BuiltinOps.classification.logistic_regression override def store(model: Model, obj: LogisticRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) if(obj.isMultinomial) { val mm = obj.multinomialModel val cm = mm.coefficientMatrix m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))). withValue("intercept_vector", Value.vector(mm.interceptVector.toArray)). withValue("thresholds", mm.thresholds.map(_.toSeq).map(Value.doubleList)) } else { m.withValue("coefficients", Value.vector(obj.binaryModel.coefficients.toArray)). withValue("intercept", Value.double(obj.binaryModel.intercept)). withValue("threshold", Value.double(obj.binaryModel.threshold)) } } override def load(model: Model) (implicit context: BundleContext[MleapContext]): LogisticRegressionModel = { val numClasses = model.value("num_classes").getLong val lm = if(numClasses > 2) { val tensor = model.value("coefficient_matrix").getTensor[Double] val cm = Matrices.dense(numRows = tensor.dimensions.head, numCols = tensor.dimensions(1), tensor.toArray) ProbabilisticLogisticsRegressionModel(coefficientMatrix = cm, interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray), thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } else { // default threshold is 0.5 for both Spark and Scikit-learn val threshold = model.getValue("threshold") .map(value => value.getDouble) .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD) BinaryLogisticRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, threshold = threshold) } LogisticRegressionModel(lm) } } override def model(node: LogisticRegression): LogisticRegressionModel = node.model }
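A brief sketch of the multinomial branch that load() reconstructs, using the named parameters visible in the op above (the coefficient values are illustrative only):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import ml.combust.mleap.core.classification.{LogisticRegressionModel, ProbabilisticLogisticsRegressionModel}

// 3 classes x 2 features, column-major layout as in Matrices.dense
val multinomial = ProbabilisticLogisticsRegressionModel(
  coefficientMatrix = Matrices.dense(3, 2, Array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6)),
  interceptVector = Vectors.dense(0.0, 0.1, -0.1),
  thresholds = None)

// wrapped exactly as load() does for the multi-class case
val lr = LogisticRegressionModel(multinomial)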
Example 67
Source File: SupportVectorMachineOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.mleap.core.classification.SupportVectorMachineModel import ml.combust.mleap.runtime.transformer.classification.SupportVectorMachine import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class SupportVectorMachineOp extends MleapOp[SupportVectorMachine, SupportVectorMachineModel] { override val Model: OpModel[MleapContext, SupportVectorMachineModel] = new OpModel[MleapContext, SupportVectorMachineModel] { override val klazz: Class[SupportVectorMachineModel] = classOf[SupportVectorMachineModel] override def opName: String = Bundle.BuiltinOps.classification.support_vector_machine override def store(model: Model, obj: SupportVectorMachineModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("num_classes", Value.long(2)). withValue("thresholds", obj.thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): SupportVectorMachineModel = { if(model.value("num_classes").getLong != 2) { throw new IllegalArgumentException("MLeap only supports binary SVM") } SupportVectorMachineModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } } override def model(node: SupportVectorMachine): SupportVectorMachineModel = node.model }
Example 68
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.Model import ml.combust.bundle.op.OpModel import ml.combust.mleap.runtime.transformer.classification.NaiveBayesClassifier import ml.combust.mleap.core.classification.NaiveBayesModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.{Matrices, Vectors} class NaiveBayesClassifierOp extends MleapOp[NaiveBayesClassifier, NaiveBayesModel]{ override val Model: OpModel[MleapContext, NaiveBayesModel] = new OpModel[MleapContext, NaiveBayesModel]{ override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel] override def opName: String = Bundle.BuiltinOps.classification.naive_bayes override def store(model: Model, obj: NaiveBayesModel)(implicit context: BundleContext[MleapContext]): Model = { model.withValue("num_features", Value.long(obj.numFeatures)). withValue("num_classes", Value.long(obj.numClasses)). withValue("pi", Value.vector(obj.pi.toArray)). withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))). withValue("model_type", Value.string(obj.modelType.toString)). withValue("thresholds", obj.thresholds.map(Value.doubleList(_))) } override def load(model: Model)(implicit context: BundleContext[MleapContext]): NaiveBayesModel = { val theta = model.value("theta").getTensor[Double] val modelType = NaiveBayesModel.forName(model.value("model_type").getString) val numClasses = model.value("num_classes").getLong.toInt val thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray) require(thresholds.isEmpty || thresholds.get.length == numClasses, "NaiveBayesModel loaded with non-matching numClasses and thresholds.length. " + s" numClasses=$numClasses, but thresholds has length ${thresholds.get.length}") new NaiveBayesModel(numFeatures = model.value("num_features").getLong.toInt, numClasses = numClasses, pi = Vectors.dense(model.value("pi").getTensor[Double].toArray), theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray), modelType = modelType, thresholds = thresholds) } } override def model(node: NaiveBayesClassifier): NaiveBayesModel = node.model }
Example 69
Source File: GeneralizedLinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel.{Family, FamilyAndLink, Link} import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.regression.GeneralizedLinearRegression import org.apache.spark.ml.linalg.Vectors class GeneralizedLinearRegressionOp extends MleapOp[GeneralizedLinearRegression, GeneralizedLinearRegressionModel] { override val Model: OpModel[MleapContext, GeneralizedLinearRegressionModel] = new OpModel[MleapContext, GeneralizedLinearRegressionModel] { override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression override def store(model: Model, obj: GeneralizedLinearRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("family", Value.string(obj.fal.family.name)). withValue("link", Value.string(obj.fal.link.name)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): GeneralizedLinearRegressionModel = { val family = Family.fromName(model.value("family").getString) val link = model.getValue("link").map(v => Link.fromName(v.getString)).getOrElse(family.defaultLink) GeneralizedLinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, fal = new FamilyAndLink(family, link) ) } } override def model(node: GeneralizedLinearRegression): GeneralizedLinearRegressionModel = node.model }
Example 70
Source File: AFTSurvivalRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.regression.AFTSurvivalRegression import org.apache.spark.ml.linalg.Vectors class AFTSurvivalRegressionOp extends MleapOp[AFTSurvivalRegression, AFTSurvivalRegressionModel] { override val Model: OpModel[MleapContext, AFTSurvivalRegressionModel] = new OpModel[MleapContext, AFTSurvivalRegressionModel] { override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression override def store(model: Model, obj: AFTSurvivalRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("quantile_probabilities", Value.doubleList(obj.quantileProbabilities)). withValue("scale", Value.double(obj.scale)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): AFTSurvivalRegressionModel = { AFTSurvivalRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, quantileProbabilities = model.value("quantile_probabilities").getDoubleList.toArray, scale = model.value("scale").getDouble) } } override def model(node: AFTSurvivalRegression): AFTSurvivalRegressionModel = node.model }
Example 71
Source File: LinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.runtime.transformer.regression.LinearRegression import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class LinearRegressionOp extends MleapOp[LinearRegression, LinearRegressionModel] { override val Model: OpModel[MleapContext, LinearRegressionModel] = new OpModel[MleapContext, LinearRegressionModel] { override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.linear_regression override def store(model: Model, obj: LinearRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): LinearRegressionModel = { LinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) } } override def model(node: LinearRegression): LinearRegressionModel = node.model }
Example 72
Source File: IDFSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.IDFModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IDFSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = IDF(shape = NodeShape.feature(), model = IDFModel(Vectors.dense(Array(1.0, 2.0, 3.0)))) assert(transformer.schema.fields == Seq(StructField("input", TensorType.Double()), StructField("output", TensorType.Double()))) } } }
Example 73
Source File: StandardScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import java.io.File import java.net.URI import ml.combust.bundle.BundleFile import ml.combust.bundle.serializer.SerializationFormat import ml.combust.mleap.core.feature.StandardScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.test.TestUtil import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import resource.managed import ml.combust.mleap.runtime.MleapSupport._ class StandardScalerSpec extends FunSpec { val means = Some(Vectors.dense(Array(50.0, 20.0, 30.0))) val std = Some(Vectors.dense(Array(5.0, 1.0, 3.0))) val transformer = StandardScaler(shape = NodeShape.feature(), model = StandardScalerModel(std, means)) describe("input/output schema") { it("has the correct inputs and outputs") { assert(transformer.schema.fields == Seq(StructField("input", TensorType.Double(3)), StructField("output", TensorType.Double(3)))) } } describe("serialization") { it("serializes std as well as mean correctly") { val uri = new URI(s"jar:file:${TestUtil.baseDir}/standard-scaler.json.zip") for (file <- managed(BundleFile(uri))) { transformer.writeBundle.name("bundle") .format(SerializationFormat.Json) .save(file) } val file = new File(s"${TestUtil.baseDir}/standard-scaler.json.zip") val scaler = (for (bf <- managed(BundleFile(file))) yield { bf.loadMleapBundle().get.root }).tried.get.asInstanceOf[StandardScaler] assert(transformer.model.std sameElements scaler.model.std) assert(transformer.model.mean sameElements scaler.model.mean) } } }
Example 74
Source File: MaxAbsScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.MaxAbsScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MaxAbsScalerSpec extends FunSpec{ val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val maxAbsScaler = MaxAbsScaler( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"), model = MaxAbsScalerModel(Vectors.dense(Array(10.0, 20.0, 40.0)))) describe("#transform") { it("scales the input data by maximum value vector") { val frame2 = maxAbsScaler.transform(frame).get val data = frame2.dataset.toArray val norm = data(0).getTensor[Double](1) assert(norm(0) == 0.0) assert(norm(1) == 1.0) assert(norm(2) == 0.5) } describe("with invalid input column") { val maxAbsScaler2 = maxAbsScaler.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(maxAbsScaler2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(maxAbsScaler.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_normalized", TensorType.Double(3)))) } } }
Example 75
Source File: MinMaxScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import java.io.File import ml.combust.bundle.BundleFile import ml.combust.mleap.core.feature.MinMaxScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.runtime.transformer.Pipeline import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import resource.managed import ml.combust.mleap.runtime.MleapSupport._ class MinMaxScalerSpec extends FunSpec{ val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val minMaxScaler = MinMaxScaler( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"), model = MinMaxScalerModel(Vectors.dense(Array(0.0, 0.0, 0.0)), Vectors.dense(Array(10.0, 20.0, 40.0)))) describe("#transform") { it("scales the input data between min / max value vectors") { val frame2 = minMaxScaler.transform(frame).get val data = frame2.dataset.toArray val norm = data(0).getTensor[Double](1) assert(norm(0) == 0.0) assert(norm(1) == 1.0) assert(norm(2) == 0.5) } describe("with invalid input column") { val minMaxScaler2 = minMaxScaler.copy(shape = NodeShape.feature(inputCol = "bad_feature")) it("returns a Failure") { assert(minMaxScaler2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(minMaxScaler.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_normalized", TensorType.Double(3)))) } } describe("min max scaler with defaults for min/max still works") { it ("loads correctly in mleap") { val file = new File(getClass.getResource("/min_max_scaler_tf.zip").toURI) val pipeline = (for (bf <- managed(BundleFile(file))) yield { bf.loadMleapBundle().get.root }).tried.get.asInstanceOf[Pipeline] assert(pipeline.model.transformers.size == 2) } } }
Example 76
Source File: ElementWiseProductSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.ElementwiseProductModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class ElementWiseProductSpec extends FunSpec { val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val ewp = ElementwiseProduct(shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_norm"), model = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 0.5)))) describe("#transform") { it("multiplies each input vector by a provided weight vector") { val frame2 = ewp.transform(frame).get val data = frame2.dataset(0).getTensor[Double](1) assert(data.toArray sameElements Array(0.0, 20.0, 10.0)) } describe("with invalid input column") { val ewp2 = ewp.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(ewp2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(ewp.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_norm", TensorType.Double(3)))) } } }
Example 77
Source File: PcaSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.PcaModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.scalatest.FunSpec class PcaSpec extends FunSpec { val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(2.0, 1.0, 0.0)))) val frame = DefaultLeapFrame(schema, dataset) val pc = new DenseMatrix(3, 2, Array(1d, -1, 2, 0, -3, 1)) val input = Vectors.dense(Array(2d, 1, 0)) val pca = Pca( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_pca"), model = PcaModel(pc)) describe("#transform") { it("extracts the principal components from the input column") { val frame2 = pca.transform(frame).get val data = frame2.dataset(0).getTensor[Double](1).toArray assert(data sameElements Array[Double](1, -3)) } describe("with invalid input column") { val pca2 = pca.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(pca2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(pca.schema.fields == Seq(StructField("test_vec", TensorType.Double()), StructField("test_pca", TensorType.Double()))) } } }
Example 78
Source File: KMeansSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.clustering import ml.combust.mleap.core.clustering.KMeansModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class KMeansSpec extends FunSpec { val v1 = Vectors.dense(Array(1.0, 2.0, 55.0)) val v2 = Vectors.dense(Array(11.0, 200.0, 55.0)) val v3 = Vectors.dense(Array(100.0, 22.0, 55.0)) val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(DenseTensor(Array(2.0, 5.0, 34.0), Seq(3))), Row(DenseTensor(Array(20.0, 230.0, 34.0), Seq(3))), Row(DenseTensor(Array(111.0, 20.0, 56.0), Seq(3)))) val frame = DefaultLeapFrame(schema, dataset) val km = KMeans(shape = NodeShape.basicCluster(), model = KMeansModel(Seq(v1, v2, v3), 3)) describe("#transform") { it("uses the k-means to find closest cluster") { val frame2 = km.transform(frame).get val data = frame2.dataset.toArray assert(data(0).getInt(1) == 0) assert(data(1).getInt(1) == 1) assert(data(2).getInt(1) == 2) } describe("with invalid features column") { val km2 = km.copy(shape = NodeShape.basicCluster(featuresCol = "bad_features")) it("returns a Failure") { assert(km2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(km.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 79
Source File: BisectingKMeansSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.clustering import ml.combust.mleap.core.clustering.{BisectingKMeansModel, ClusteringTreeNode} import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm import org.scalatest.FunSpec class BisectingKMeansSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = BisectingKMeans(shape = NodeShape.basicCluster(), model = new BisectingKMeansModel(ClusteringTreeNode(23, VectorWithNorm(Vectors.dense(1, 2, 3)) , Array()))) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 80
Source File: PipelineSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer import ml.combust.mleap.core.feature.VectorAssemblerModel import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.transformer.feature.VectorAssembler import ml.combust.mleap.runtime.transformer.regression.LinearRegression import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PipelineSpec extends FunSpec { describe("input/output schema") { it("has inputs or outputs of its transformers") { val vectorAssembler = VectorAssembler( shape = NodeShape().withInput("input0", "feature1"). withInput("input1", "feature2"). withInput("input2", "feature3"). withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape(), ScalarShape()))) val regression = LinearRegression(shape = NodeShape().withInput("features", "features"). withOutput("prediction", "prediction"), model = LinearRegressionModel(Vectors.dense(1.0, 2.0, 3.0), 4.0)) val pipeline = Pipeline(uid = "root_pipeline", shape = NodeShape(), PipelineModel(Seq( Pipeline(uid = "child_pipeline_1", shape = NodeShape(), PipelineModel(Seq(vectorAssembler))), Pipeline(uid = "child_pipeline_2", shape = NodeShape(), PipelineModel(Seq(regression)))))) assert(pipeline.schema.fields == Seq( StructField("feature1", ScalarType.Double), StructField("feature2", ScalarType.Double), StructField("feature3", ScalarType.Double), StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.inputSchema.fields == Seq( StructField("feature1", ScalarType.Double), StructField("feature2", ScalarType.Double), StructField("feature3", ScalarType.Double))) assert(pipeline.outputSchema.fields == Seq( StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.strictOutputSchema.fields == Seq( StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.intermediateSchema.fields == Seq( StructField("features", TensorType.Double(3)))) } } }
Example 81
Source File: LogisticRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel} import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class LogisticRegressionSpec extends FunSpec { val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.5, -0.5, 1.0)))) val frame = DefaultLeapFrame(schema, dataset) val logisticRegression = LogisticRegression(shape = NodeShape.probabilisticClassifier(), model = LogisticRegressionModel(BinaryLogisticRegressionModel(coefficients = Vectors.dense(Array(1.0, 1.0, 2.0)), intercept = -0.2, threshold = 0.75))) describe("LogisticRegression") { describe("#transform") { it("executes the logistic regression model and outputs the prediction") { val frame2 = logisticRegression.transform(frame).get val prediction = frame2.dataset(0).getDouble(1) assert(prediction == 1.0) } describe("with probability column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability"))) it("executes the logistic regression model and outputs the prediction/probability") { val frame2 = logisticRegression2.transform(frame).get val data = frame2.dataset.toArray val probability = data(0).getTensor[Double](1)(1) val prediction = data(0).getDouble(2) assert(prediction == 1.0) assert(probability > 0.84) assert(probability < 0.86) } } describe("with invalid features column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(featuresCol = "bad_features")) it("returns a Failure") { assert(logisticRegression2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(logisticRegression.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with probability column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with rawPrediction column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with both probability and rawPrediction column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier( rawPredictionCol = Some("rp"), probabilityCol = Some("p"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("p", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } } }
Example 82
Source File: SupportVectorMachineSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.SupportVectorMachineModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class SupportVectorMachineSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with probability column") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with rawPrediction column") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType(BasicType.Double, Seq(2))), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with both probability and rawPrediction columns") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier( rawPredictionCol = Some("rp"), probabilityCol = Some("probability")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 83
Source File: OneVsRestSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, OneVsRestModel} import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class OneVsRestSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs without probability column") { val transformer = OneVsRest(shape = NodeShape.basicClassifier(), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prediction", ScalarType.Double))) } it("has the correct inputs and outputs with probability column") { val transformer = OneVsRest(shape = NodeShape().withInput("features", "features"). withOutput("probability", "prob"). withOutput("prediction", "prediction"), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prob", ScalarType.Double), StructField("prediction", ScalarType.Double))) } it("has the correct inputs and outputs with raw prediction column") { val transformer = OneVsRest(shape = NodeShape().withInput("features", "features"). withOutput("probability", "prob"). withOutput("raw_prediction", "raw"). withOutput("prediction", "prediction"), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prob", ScalarType.Double), StructField("raw", TensorType.Double(1)), StructField("prediction", ScalarType.Double))) } } }
Example 84
Source File: MultiLayerPerceptronClassifierSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MultiLayerPerceptronClassifierSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = MultiLayerPerceptronClassifier(shape = NodeShape.basicClassifier(), model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1)))) assert(transformer.schema.fields == Seq(StructField("features", TensorType(BasicType.Double, Seq(3))), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 85
Source File: LinearSVCSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import org.scalatest.FunSpec import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import ml.combust.mleap.core.classification.LinearSVCModel class LinearSVCSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = LinearSVC(shape = NodeShape.basicClassifier(), model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with prediction column") { val transformer = LinearSVC(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp"),predictionCol = "pred"), model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("pred", ScalarType.Double.nonNullable))) } } }
Example 86
Source File: GeneralizedLinearRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GeneralizedLinearRegressionSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs with prediction column only") { val transformer = GeneralizedLinearRegression(shape = NodeShape.regression(), model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with prediction column as well as linkPrediction column") { val transformer = GeneralizedLinearRegression(shape = NodeShape.regression(). withOutput("link_prediction", "lp"), model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable), StructField("lp", ScalarType.Double.nonNullable))) } } }
Example 87
Source File: LinearRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class LinearRegressionSpec extends FunSpec { val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(20.0, 10.0, 5.0)))) val frame = DefaultLeapFrame(schema, dataset) val linearRegression = LinearRegression(shape = NodeShape.regression(), model = LinearRegressionModel(coefficients = Vectors.dense(Array(1.0, 0.5, 5.0)), intercept = 73.0)) describe("LinearRegression") { describe("#transform") { it("executes the linear regression model and outputs a prediction") { val frame2 = linearRegression.transform(frame).get val prediction = frame2.dataset(0).getDouble(1) assert(prediction == 123.0) } describe("with invalid features input") { it("returns a Failure") { val frame2 = linearRegression.copy(shape = NodeShape.regression(featuresCol = "bad_features")).transform(frame) assert(frame2.isFailure) } } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(linearRegression.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 88
Source File: AFTSurvivalRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class AFTSurvivalRegressionSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = AFTSurvivalRegression(shape = NodeShape.regression() .withOutput("quantiles", "quantiles"), model = new AFTSurvivalRegressionModel(Vectors.dense(1, 3, 4), 23, Array(1, 2, 3, 4, 5), 5)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable), StructField("quantiles", TensorType.Double(5)))) } } }
Example 89
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
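A minimal usage sketch of the model above, relying only on the constructor shown (indices, optional namedIndices, inputSize); the concrete indices and input values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.VectorSlicerModel

val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 3)
// dense input: the positions in allIndices are copied in order
val sliced = slicer(Vectors.dense(1.0, 2.0, 3.0))  // Vectors.dense(1.0, 3.0)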
Example 90
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
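A short usage sketch of this model; the values mirror the ElementWiseProductSpec earlier in this listing, so the expected output can be checked by hand:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ElementwiseProductModel

val ewp = ElementwiseProductModel(Vectors.dense(0.5, 1.0, 0.5))
// each component is multiplied by the corresponding entry of scalingVec
val scaled = ewp(Vectors.dense(0.0, 20.0, 20.0))  // Vectors.dense(0.0, 20.0, 10.0)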
Example 91
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
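A short usage sketch mirroring the MaxAbsScalerSpec earlier in this listing: each component is divided by the corresponding maximum absolute value and clamped to [-1, 1]:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.MaxAbsScalerModel

val scaler = MaxAbsScalerModel(Vectors.dense(10.0, 20.0, 40.0))
val scaled = scaler(Vectors.dense(0.0, 20.0, 20.0))  // Vectors.dense(0.0, 1.0, 0.5)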
Example 92
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala") case class ChiSqSelectorModel(filterIndices: Seq[Int], inputSize: Int) extends Model { def apply(features: Vector): Vector = { features match { case SparseVector(size, indices, values) => val newSize = filterIndices.length val newValues = mutable.ArrayBuilder.make[Double] val newIndices = mutable.ArrayBuilder.make[Int] var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 while (i < indices.length && j < filterIndices.length) { indicesIdx = indices(i) filterIndicesIdx = filterIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) j += 1 i += 1 } else { if (indicesIdx > filterIndicesIdx) { j += 1 } else { i += 1 } } } // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size) Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(values) => val values = features.toArray Vectors.dense(filterIndices.map(i => values(i)).toArray) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get }
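A minimal sketch of applying the selector above to a dense vector; the filterIndices and input values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ChiSqSelectorModel

val selector = ChiSqSelectorModel(filterIndices = Seq(0, 2), inputSize = 3)
// the dense branch keeps only the positions listed in filterIndices
val selected = selector(Vectors.dense(5.0, 6.0, 7.0))  // Vectors.dense(5.0, 7.0)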
Example 93
Source File: FeatureHasherModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.core.util.Platform import ml.combust.mleap.core.util.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable object FeatureHasherModel { val seed = HashingTermFrequencyModel.seed def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes2(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("FeatureHasher with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala") case class FeatureHasherModel(numFeatures: Int = 1 << 18, categoricalCols: Seq[String], inputNames: Seq[String], inputTypes: Seq[DataType] ) extends Model { assert(inputTypes.forall(dt ⇒ dt.shape.isScalar), "must provide scalar shapes as inputs") val schema = inputNames.zip(inputTypes) val realFields = schema.filter(t ⇒ t._2.base match { case BasicType.Short if !categoricalCols.contains(t._1) ⇒ true case BasicType.Double if !categoricalCols.contains(t._1) ⇒ true case BasicType.Float if !categoricalCols.contains(t._1) ⇒ true case BasicType.Int if !categoricalCols.contains(t._1) ⇒ true case BasicType.Long if !categoricalCols.contains(t._1) ⇒ true case _ ⇒ false }).toMap.keys.toSeq def getDouble(x: Any): Double = { x match { case n: java.lang.Number ⇒ n.doubleValue() // will throw ClassCastException if it cannot be cast, as would row.getDouble case other ⇒ other.asInstanceOf[Double] } } def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } def apply(things: Seq[Any]): Vector = { val map = new mutable.OpenHashMap[Int, Double]() schema.zip(things).foreach { case (sc, item) ⇒ if (item != null) { val (rawIdx, value) = if (realFields.contains(sc._1)) { // numeric values are kept as is, with vector index based on hash of "column_name" val value = getDouble(item) val hash = FeatureHasherModel.murmur3(sc._1) (hash, value) } else { // string, boolean and numeric values that are in catCols are treated as categorical, // with an indicator value of 1.0 and vector index based on hash of "column_name=value" val value = item.toString val fieldName = s"${sc._1}=$value" val hash = FeatureHasherModel.murmur3(fieldName) (hash, 1.0) } val idx = nonNegativeMod(rawIdx, numFeatures) map.+=((idx, map.getOrElse(idx, 0.0) + value)) } } Vectors.sparse(numFeatures, map.toSeq) } override def inputSchema: StructType = { val inputFields = inputTypes.zipWithIndex.map { case (dtype, i) => StructField(s"input$i", dtype) } StructType(inputFields).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
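A minimal usage sketch of the hasher above. It uses only types that appear elsewhere in these examples (ScalarType.Double); the column names, numFeatures, and values are made up for illustration:

import ml.combust.mleap.core.feature.FeatureHasherModel
import ml.combust.mleap.core.types.ScalarType

// "category_code" is listed as categorical, so it is hashed as "category_code=3.0" with value 1.0;
// "amount" keeps its numeric value at the index hashed from the column name
val hasher = FeatureHasherModel(
  numFeatures = 1 << 8,
  categoricalCols = Seq("category_code"),
  inputNames = Seq("amount", "category_code"),
  inputTypes = Seq(ScalarType.Double, ScalarType.Double))

val hashed = hasher(Seq(12.5, 3.0))  // sparse vector of size 256 with at most two non-zero entries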
Example 94
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.mleap.VectorUtil._ import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} def apply(vector: Vector): Vector = { val scale = maxValue - minValue // 0 in sparse vector will probably be rescaled to non-zero val values = vector.copy.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values(i) = raw * scale + minValue } i += 1 } Vectors.dense(values) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get }
Example 95
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
Example 96
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(features: Vector): Vector = { val norm = Vectors.norm(features, pNorm) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. features match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. features } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
Example 97
Source File: HashingTermFrequencyModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import ml.combust.mleap.core.util.Murmur3_x86_32._ import ml.combust.mleap.core.util.Platform import scala.collection.mutable object HashingTermFrequencyModel { val seed = 42 def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/core/src/main/scala/org/apache/spark/util/Utils.scala") def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } override def inputSchema: StructType = { StructType(StructField("input" -> ListType(BasicType.String))).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
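The companion object above exposes the murmur3 hash used to map terms to buckets. A small sketch of how a term lands in a feature index; the 2^18 feature count mirrors the usual default and is an assumption here:

import ml.combust.mleap.core.feature.HashingTermFrequencyModel

val numFeatures = 1 << 18
val hash = HashingTermFrequencyModel.murmur3("spark")   // raw 32-bit hash of the term

// same arithmetic as nonNegativeMod above: fold the hash into [0, numFeatures)
val rawMod = hash % numFeatures
val bucket = rawMod + (if (rawMod < 0) numFeatures else 0)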
Example 98
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(vector: Vector): Vector = { if (mean.nonEmpty) { val shift = mean.get.toArray val values = vector match { // specially handle DenseVector because its toArray does not clone already case d: DenseVector => d.values.clone() case v: SparseVector => v.toArray } val size = values.length if (std.nonEmpty) { val stdDev = std.get var i = 0 while (i < size) { values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0 i += 1 } } else { var i = 0 while (i < size) { values(i) -= shift(i) i += 1 } } Vectors.dense(values) } else if (std.nonEmpty) { val stdDev = std.get vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while(i < size) { values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0) i += 1 } Vectors.dense(values) case SparseVector(size, indices, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0) i += 1 } Vectors.sparse(size, indices, values) } } else { throw new IllegalStateException("need to scale with mean and/or with stdev") } } override def inputSchema: StructType = { StructType("input" -> TensorType.Double(size)).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get }
Example 99
Source File: CountVectorizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala") case class CountVectorizerModel(vocabulary: Array[String], binary: Boolean, minTf: Double) extends Model { val dict: Map[String, Int] = vocabulary.zipWithIndex.toMap def apply(document: Seq[String]): Vector = { val termCounts = mutable.Map[Int, Double]() var tokenCount = 0L document.foreach { term => dict.get(term) match { case Some(index) => termCounts += (index -> termCounts.get(index).map(_ + 1).getOrElse(1)) case None => // ignore terms not found in dictionary } tokenCount += 1 } val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf val effectiveCounts = if(binary) { termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq } else { termCounts.filter(_._2 >= effectiveMinTF).toSeq } Vectors.sparse(dict.size, effectiveCounts) } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(dict.size)).get }
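A short usage sketch of the count vectorizer above; the vocabulary and settings are made up for illustration:

import ml.combust.mleap.core.feature.CountVectorizerModel

val cv = CountVectorizerModel(
  vocabulary = Array("apple", "banana", "cherry"),
  binary = false,
  minTf = 1.0)

// "apple" appears twice, "banana" once, "kiwi" is not in the vocabulary
cv(Seq("apple", "banana", "apple", "kiwi"))   // -> sparse (3, [0, 1], [2.0, 1.0])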
Example 100
Source File: OneHotEncoderModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} def apply(labels: Array[Double]): Array[Vector] = { if (labels.length != categorySizes.length) { throw new IllegalArgumentException(s"invalid input size: ${labels.length}, must be ${categorySizes.length}") } labels.zipWithIndex.map { case (label: Double, colIdx: Int) ⇒ encoder(label, colIdx) } } private def encoder(label: Double, colIdx: Int): Vector = { val labelInt = label.toInt if(label != labelInt) { throw new IllegalArgumentException(s"invalid label: $label, must be integer") } val origCategorySize = categorySizes(colIdx) val idx = if (label >= 0 && label < origCategorySize) { label } else { if (keepInvalid) { origCategorySize } else { if (label < 0) { throw new IllegalArgumentException(s"Negative value: $label. Input can't be negative. To handle invalid values, set Param handleInvalid to ${HandleInvalid.Keep}") } else { throw new IllegalArgumentException(s"Unseen value: $label. To handle unseen values, set Param handleInvalid to ${HandleInvalid.Keep}") } } } val size = configedCategorySizes(colIdx) if (idx < size) { Vectors.sparse(size, Array(idx.toInt), oneValue) } else { Vectors.sparse(size, emptyIndices, emptyValues) } } override def inputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (_, i) => StructField(s"input$i", ScalarType.Double.setNullable(false)) } StructType(f).get } override def outputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (size, i) => StructField(s"output$i", TensorType.Double(size)) } StructType(f).get } }
Example 101
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
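A quick sketch of applying the IDF weights element-wise; the idf vector here is arbitrary:

import ml.combust.mleap.core.feature.IDFModel
import org.apache.spark.ml.linalg.Vectors

val idf = IDFModel(Vectors.dense(1.0, 2.0, 0.5))

idf(Vectors.dense(3.0, 4.0, 8.0))               // -> [3.0, 8.0, 4.0]
idf(Vectors.sparse(3, Array(1), Array(10.0)))   // -> sparse [0.0, 20.0, 0.0]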
Example 102
Source File: MinHashLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.{Vector, Vectors} object MinHashLSHModel { val HASH_PRIME = 2038074743 } case class MinHashLSHModel(randomCoefficients: Seq[(Int, Int)], inputSize: Int) extends LSHModel{ def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { require(features.numNonzeros > 0, "Must have at least 1 non zero entry.") val elemsList = features.toSparse.indices.toList val hashValues = randomCoefficients.map { case (a, b) => elemsList.map { elem: Int => ((1 + elem) * a + b) % MinHashLSHModel.HASH_PRIME }.min.toDouble } // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet val intersectionSize = xSet.intersect(ySet).size.toDouble val unionSize = xSet.size + ySet.size - intersectionSize assert(unionSize > 0, "The union of two input sets must have at least 1 elements") 1 - intersectionSize / unionSize } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. // TODO: This hashDistance function requires more discussion in SPARK-18454 x.zip(y).map(vectorPair => vectorPair._1.toArray.zip(vectorPair._2.toArray).count(pair => pair._1 != pair._2) ).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
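An illustrative call of the MinHash model above; the random coefficients are hand-picked rather than sampled, purely to show the shapes involved:

import ml.combust.mleap.core.feature.MinHashLSHModel
import org.apache.spark.ml.linalg.Vectors

val lsh = MinHashLSHModel(randomCoefficients = Seq((1, 2), (3, 4)), inputSize = 5)

// hash of the index set {0, 3}: one min-hash value per coefficient pair, shape (2, 1)
val hashes = lsh.predict(Vectors.sparse(5, Array(0, 3), Array(1.0, 1.0)))

// Jaccard distance between the index sets of two sparse vectors
val d = lsh.keyDistance(
  Vectors.sparse(5, Array(0, 3), Array(1.0, 1.0)),
  Vectors.sparse(5, Array(0, 1), Array(1.0, 1.0)))   // 1 - 1/3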
Example 103
Source File: VectorAssemblerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.{DenseTensor, SparseTensor} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def apply(vv: Seq[Any]): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case tensor: DenseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[DenseTensor[Double]] dTensor.values.indices.foreach { i => val v = dTensor.values(i) if(v != 0.0) { indices += cur + i values += v } } cur += dTensor.values.length case tensor: SparseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[SparseTensor[Double]] var idx = 0 dTensor.indices.map(_.head).foreach { i => val v = dTensor.values(idx) if(v != 0.0) { indices += cur + i values += v } idx += 1 } cur += dTensor.dimensions.head case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case v: java.math.BigDecimal => val d = v.doubleValue() if (d != 0.0) { indices += cur values += d } cur += 1 case Some(v: Double) => if(v != 0.0) { indices += cur values += v } cur += 1 } Vectors.sparse(cur, indices.result(), values.result()).compressed } override def inputSchema: StructType = { val inputFields = inputShapes.zipWithIndex.map { case (shape, i) => StructField(s"input$i", DataType(BasicType.Double, shape)) } StructType(inputFields).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(outputSize)).get }
Example 104
Source File: BinarizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala") case class BinarizerModel(threshold: Double, inputShape: DataShape) extends Model { assert(inputShape.isScalar || inputShape.isTensor, "Must provide a tensor or scalar shape") def apply(value: Double): Double = { if (value > threshold) 1.0 else 0.0 } def apply(value: Vector): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] value.foreachActive { (index, value) => if (value > threshold) { indices += index values += 1.0 } } Vectors.sparse(value.size, indices.result(), values.result()).compressed } override def inputSchema: StructType = { StructType("input" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } override def outputSchema: StructType = { StructType("output" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } }
Example 105
Source File: InteractionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.Tensor import ml.combust.mleap.core.util.VectorConverters._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def foreachNonzeroOutput(v: Any, f: (Int, Double) => Unit): Unit = { val value = v match { case tensor: Tensor[_] => tensor.asInstanceOf[Tensor[Double]]: Vector case _ => v } value match { case d: Double => assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") val numOutputCols = numFeatures.head if (numOutputCols > 1) { assert( d >= 0.0 && d == d.toInt && d < numOutputCols, s"Values from column must be indices, but got $d.") f(d.toInt, 1.0) } else { f(0, d) } case vec: Vector => assert(numFeatures.length == vec.size, s"Vector column size was ${vec.size}, expected ${numFeatures.length}") vec.foreachActive { (i, v) => val numOutputCols = numFeatures(i) if (numOutputCols > 1) { assert( v >= 0.0 && v == v.toInt && v < numOutputCols, s"Values from column must be indices, but got $v.") f(outputOffsets(i) + v.toInt, 1.0) } else { f(outputOffsets(i), v) } } case null => throw new IllegalArgumentException("Values to interact cannot be null.") case o => throw new IllegalArgumentException(s"$o of type ${o.getClass.getName} is not supported.") } } }
Example 106
Source File: DCTModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} case class DCTModel(inverse: Boolean, inputSize: Int) extends Model { def apply(features: Vector): Vector = { val result = features.toArray.clone() val jTransformer = new DoubleDCT_1D(result.length) if (inverse) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
Example 107
Source File: BucketedRandomProjectionLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{Vector, Vectors} case class BucketedRandomProjectionLSHModel(randomUnitVectors: Seq[Vector], bucketLength: Double, inputSize: Int) extends LSHModel { def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { val hashValues: Seq[Double] = randomUnitVectors.map({ randUnitVector => Math.floor(BLAS.dot(features, randUnitVector) / bucketLength) }) // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { Math.sqrt(Vectors.sqdist(x, y)) } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
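A small sketch of bucketing a vector with the model above; the unit vectors and bucket length are illustrative:

import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel
import org.apache.spark.ml.linalg.Vectors

val lsh = BucketedRandomProjectionLSHModel(
  randomUnitVectors = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)),
  bucketLength = 2.0,
  inputSize = 2)

lsh.predict(Vectors.dense(3.0, 5.0))   // floor(3/2), floor(5/2) -> tensor [[1.0], [2.0]]
lsh.keyDistance(Vectors.dense(0.0, 0.0), Vectors.dense(3.0, 4.0))   // Euclidean: 5.0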
Example 108
Source File: PolynomialFeaturesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.sklearn import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} case class PolynomialFeaturesModel(combinations: String) extends Model { private val pattern = "x(\\d+)(?:[\\^](\\d+))?".r private val polynomials = extractPolynomials(combinations) private val indices = polynomials.flatMap(poly => poly.terms).map(term => term.index).toSet private def extractPolynomials(combinations: String): List[Polynomial] = { combinations.split(",") .map(combination => extractPolynomial(combination)) .toList } private def extractPolynomial(polynomial: String): Polynomial = { Polynomial(pattern.findAllIn(polynomial).matchData .map(matcher => {Term(matcher.group(1).toInt, Option(matcher.group(2)).getOrElse("1").toInt)}) .toList ) } def getPolyValue(poly: Polynomial, features: Vector): Double = { poly.terms.map(term => scala.math.pow(features(term.index), term.power)).product } def apply(features: Vector): Vector = { Vectors.dense(polynomials.map(poly => getPolyValue(poly, features)).toArray) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(indices.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(polynomials.size)).get } case class Term(index: Int, power: Int) case class Polynomial(terms: List[Term])
Example 109
Source File: GaussianMixtureModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.Utils._ import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian object GaussianMixtureModel { @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala") def computeProbabilities(features: DenseVector, dists: Array[MultivariateGaussian], weights: Array[Double]): Array[Double] = { val p = weights.zip(dists).map { case (weight, dist) => EPSILON + weight * dist.pdf(features) } val pSum = p.sum var i = 0 while (i < weights.length) { p(i) /= pSum i += 1 } p } } case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian], weights: Array[Double]) extends Model { val numClusters = gaussians.length val numFeatures: Int = weights.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { predictionFromProbability(predictProbability(features)) } def predictWithProbability(features: Vector): (Int, Double) = { val probability = predictProbability(features) val index = probability.argmax (index, probability(index)) } def predictionFromProbability(probabilities: Vector): Int = { probabilities.argmax } def predictProbability(features: Vector): Vector = { val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights) Vectors.dense(probs) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable, "probability" -> TensorType.Double(numClusters)).get }
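A hedged sketch of scoring a point against two components; the mixture parameters are made up, and MultivariateGaussian and Matrices are assumed to come from Spark's ml.stat.distribution and ml.linalg packages as imported in the model above:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

val identity2 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
val gmm = GaussianMixtureModel(
  gaussians = Array(
    new MultivariateGaussian(Vectors.dense(0.0, 0.0), identity2),
    new MultivariateGaussian(Vectors.dense(5.0, 5.0), identity2)),
  weights = Array(0.5, 0.5))

gmm.predict(Vectors.dense(4.8, 5.2))                  // closest to the second component -> 1
gmm.predictWithProbability(Vectors.dense(0.1, -0.3))  // (0, probability close to 1.0)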
Example 110
Source File: Node.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala") final case class InternalNode(left: Node, right: Node, split: Split) extends Node { override def predictImpl(features: Vector): LeafNode = { if(split.shouldGoLeft(features)) { left.predictImpl(features) } else { right.predictImpl(features) } } }
Example 111
Source File: OneVsRestModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} def predictAll(features: Vector): (Double, Vector, Double) = { val predArray = Array.fill[Double](classifiers.length)(0.0) val (prediction, probability) = classifiers.zipWithIndex.map { case (c:ProbabilisticClassificationModel, i) => val raw = c.predictRaw(features) predArray(i) = raw(1) val probability = c.rawToProbabilityInPlace(raw)(1) (i.toDouble, probability) case (c,i) => val raw = c.predict(features) predArray(i) = raw (i.toDouble,raw) }.maxBy(_._2) (probability, Vectors.dense(predArray), prediction) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("probability" -> ScalarType.Double, "raw_prediction" -> TensorType.Double(classifiers.length), "prediction" -> ScalarType.Double).get }
Example 112
Source File: ClassificationModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} val numClasses: Int val numFeatures: Int def thresholds: Option[Array[Double]] = None def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features)) def predictWithProbability(features: Vector): (Double, Double) = { val probabilities = predictProbabilities(features) val index = probabilityToPredictionIndex(probabilities) (index.toDouble, probabilities(index)) } def predictProbabilities(features: Vector): Vector = { val raw = predictRaw(features) rawToProbabilityInPlace(raw) raw } def rawToProbability(raw: Vector): Vector = { val probabilities = raw.copy rawToProbabilityInPlace(probabilities) } def rawToPrediction(raw: Vector): Double = { thresholds match { case Some(t) => probabilityToPrediction(rawToProbability(raw)) case None => raw.argmax } } def probabilityToPrediction(probability: Vector): Double = { probabilityToPredictionIndex(probability).toDouble } def probabilityToPredictionIndex(probability: Vector): Int = { thresholds match { case Some(ts) => val scaledProbability: Array[Double] = probability.toArray.zip(ts).map { case (p, t) => if (t == 0.0) Double.PositiveInfinity else p / t } Vectors.dense(scaledProbability).argmax case None => probability.argmax } } def rawToProbabilityInPlace(raw: Vector): Vector override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "probability" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
Example 113
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.regression.DecisionTreeRegressionModel import ml.combust.mleap.core.tree.TreeEnsemble import ml.combust.mleap.core.tree.loss.LogLoss import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def margin(features: Vector): Double = { val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray) BLAS.dot(treePredictions, treeWeightsVector) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => dv.values(0) = loss.computeProbability(dv.values(0)) dv.values(1) = 1.0 - dv.values(0) dv case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector") } } override def predictRaw(features: Vector): Vector = { val prediction: Double = margin(features) Vectors.dense(Array(-prediction, prediction)) } }
Example 114
Source File: MultiLayerPerceptronClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.ann.FeedForwardTopology import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} def decodeLabel(output: Vector): Double = { output.argmax.toDouble } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala") case class MultiLayerPerceptronClassifierModel(layers: Seq[Int], weights: Vector, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel { val numFeatures: Int = layers.head private val mlpModel = FeedForwardTopology .multiLayerPerceptron(layers.toArray) .model(weights) override def predictRaw(features: Vector): Vector = { mlpModel.predictRaw(features) } override def rawToProbabilityInPlace(raw: Vector): Vector = { mlpModel.raw2ProbabilityInPlace(raw) } override val numClasses: Int = layers.last }
Example 115
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS case class SupportVectorMachineModel(coefficients: Vector, intercept: Double, override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds)) extends ProbabilisticClassificationModel with Serializable { private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept override val numClasses: Int = 2 override val numFeatures: Int = coefficients.size override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(Array(-m, m)) } override def rawToProbabilityInPlace(raw: Vector): Vector = raw }
Example 116
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.TreeEnsemble import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel], override val treeWeights: Seq[Double], numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with TreeEnsemble with Serializable { override def predictRaw(raw: Vector): Vector = { val votes = Array.fill[Double](numClasses)(0.0) trees.view.foreach { tree => val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray val total = classCounts.sum if (total != 0) { var i = 0 while (i < numClasses) { votes(i) += classCounts(i) / total i += 1 } } } Vectors.dense(votes) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 117
Source File: LinearSVCModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS object LinearSVCModel { val defaultThreshold = 0.0 } @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala") case class LinearSVCModel(coefficients: Vector, intercept: Double, threshold: Double = LinearSVCModel.defaultThreshold ) extends ClassificationModel { val numClasses: Int = 2 val numFeatures: Int = coefficients.size private val margin: Vector => Double = features => { BLAS.dot(features, coefficients) + intercept } override def predict(features: Vector): Double = { if (margin(features) > threshold) 1.0 else 0.0 } override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(-m, m) } def rawToPrediction(rawPrediction: Vector): Double = { if (rawPrediction(1) > threshold) 1.0 else 0.0 } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
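A minimal sketch of the linear SVC decision rule above with made-up coefficients:

import ml.combust.mleap.core.classification.LinearSVCModel
import org.apache.spark.ml.linalg.Vectors

val svc = LinearSVCModel(coefficients = Vectors.dense(1.0, -1.0), intercept = 0.5)

svc.predict(Vectors.dense(2.0, 1.0))      // margin = 2.0 - 1.0 + 0.5 = 1.5 > 0 -> 1.0
svc.predictRaw(Vectors.dense(2.0, 1.0))   // -> [-1.5, 1.5]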
Example 118
Source File: VectorConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.util import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors} import scala.language.implicitConversions trait VectorConverters { implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match { case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size)) case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)), values = vector.values, dimensions = Seq(vector.size)) } implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match { case tensor: DenseTensor[_] => Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => Vectors.sparse(tensor.dimensions.product, tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match { case matrix: DenseMatrix => DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols)) case matrix: SparseMatrix => val indices = matrix.rowIndices.zip(matrix.colPtrs).map { case (r, c) => Seq(r, c) }.toSeq SparseTensor(indices = indices, values = matrix.values, dimensions = Seq(matrix.numRows, matrix.numCols)) } implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match { case tensor: DenseTensor[_] => Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1), cols.toArray, rows.toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match { case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size)) case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size)) } implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match { case tensor: DenseTensor[_] => new BDV(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => new BSV(tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]], tensor.dimensions.product) } } object VectorConverters extends VectorConverters
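The implicit conversions above kick in on type ascription; a small sketch of round-tripping a Spark vector through an MLeap tensor (values are arbitrary):

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Vector, Vectors}

val asTensor: Tensor[Double] = Vectors.dense(1.0, 2.0, 3.0)   // spark -> mleap
val backToVector: Vector = asTensor                           // mleap -> spark

val sparseTensor: Tensor[Double] = Vectors.sparse(4, Array(1, 3), Array(9.0, 7.0))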
Example 119
Source File: LinalgUtils.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.linalg import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.{BLAS, VectorWithNorm} val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) if (precisionBound1 < precision) { sqDist = sumSquaredNorm - 2.0 * BLAS.dot(v1, v2) } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { val dotValue = BLAS.dot(v1, v2) sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { sqDist = Vectors.sqdist(v1, v2) } } else { sqDist = Vectors.sqdist(v1, v2) } sqDist } def log1pExp(x: Double): Double = { if (x > 0) { x + math.log1p(math.exp(-x)) } else { math.log1p(math.exp(x)) } } }
Example 120
Source File: AFTSurvivalRegressionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala") case class AFTSurvivalRegressionModel(coefficients: Vector, intercept: Double, quantileProbabilities: Array[Double], scale: Double) extends Model { def apply(features: Vector): Double = predict(features) def predictWithQuantiles(features: Vector): (Double, Vector) = { val quantiles = predictQuantiles(features) (predict(features), quantiles) } def predictQuantiles(features: Vector): Vector = { // scale parameter for the Weibull distribution of lifetime val lambda = math.exp(BLAS.dot(coefficients, features) + intercept) // shape parameter for the Weibull distribution of lifetime val k = 1 / scale val quantiles = quantileProbabilities.map { q => lambda * math.exp(math.log(-math.log(1 - q)) / k) } Vectors.dense(quantiles) } def predict(features: Vector): Double = { math.exp(BLAS.dot(coefficients, features) + intercept) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(coefficients.size)).get override def outputSchema: StructType = { StructType("prediction" -> ScalarType.Double.nonNullable, "quantiles" -> TensorType.Double(quantileProbabilities.length)).get } }
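A usage sketch of the AFT model above; coefficients, intercept, scale, and quantile probabilities are invented for illustration:

import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel
import org.apache.spark.ml.linalg.Vectors

val aft = AFTSurvivalRegressionModel(
  coefficients = Vectors.dense(0.1, -0.2),
  intercept = 1.0,
  quantileProbabilities = Array(0.5, 0.9),
  scale = 1.0)

aft(Vectors.dense(2.0, 1.0))                       // exp(0.1*2 - 0.2*1 + 1.0) ≈ 2.72
aft.predictWithQuantiles(Vectors.dense(2.0, 1.0))  // (prediction, median and 90th-percentile lifetimes)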
Example 121
Source File: MinMaxScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import org.apache.spark.ml.util.TestingUtils._ class MinMaxScalerModelSpec extends FunSpec{ describe("min max scaler model") { val scaler = MinMaxScalerModel(Vectors.dense(Array(1.0, 0.0, 5.0, 10.0)), Vectors.dense(Array(15.0, 10.0, 15.0, 20.0))) it("scales vector based on min/max range"){ val inputVector = Vectors.dense(15.0, 5.0, 5.0, 19.0) val expectedVector = Vectors.dense(1.0, 0.5, 0.0, 0.9) assert(scaler(inputVector) ~= expectedVector relTol 1E-9) } it("has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4)))) } it("has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4)))) } } }
Example 122
Source File: ElementwiseProductModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class ElementwiseProductModelSpec extends FunSpec{ describe("elementwise product model") { val scaler = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 1.0))) it("multiplies each input vector by a provided weight vector"){ val inputArray = Array(15.0, 10.0, 10.0) val expectedVector = Array(7.5, 10.0, 10.0) assert(scaler(Vectors.dense(inputArray)).toArray.sameElements(expectedVector)) } it("has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } }
Example 123
Source File: PcaModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vectors} import org.scalatest.FunSpec class PcaModelSpec extends FunSpec { describe("pca model") { val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1)) val pca = PcaModel(pc) it("uses the principal components matrix to transform a vector to a lower-dimensional vector") { val input = Vectors.dense(Array[Double](2, 1, 0)) assert(pca(input).toArray sameElements Array[Double](1, -3)) } it("has the right input schema") { assert(pca.inputSchema.fields == Seq(StructField("input", TensorType.Double()))) } it("has the right output schema") { assert(pca.outputSchema.fields == Seq(StructField("output", TensorType.Double()))) } } }
Example 124
Source File: MaxAbsScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import org.apache.spark.ml.util.TestingUtils._ class MaxAbsScalerModelSpec extends FunSpec { describe("Max Abs Scaler Model") { val scaler = MaxAbsScalerModel(Vectors.dense(Array(20.0, 10.0, 10.0, 20.0))) it("Scales the vector based on absolute max value"){ val inputVector = Vectors.dense(15.0, -5.0, 5.0, 19.0) val expectedVector = Vectors.dense(0.75, -0.5, 0.5, 0.95) assert(scaler(inputVector) ~= expectedVector relTol 1E-9) } it("Has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4)))) } it("Has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4)))) } } }
Example 125
Source File: BinarizerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class BinarizerModelSpec extends FunSpec { describe("binarizer with several inputs"){ val binarizer = BinarizerModel(0.3, TensorShape(3)) it("Makes a value 0 or 1 based on the threshold") { val features = Vectors.dense(Array(0.1, 0.4, 0.3)) val binFeatures = binarizer(features).toArray assert(binFeatures(0) == 0.0) assert(binFeatures(1) == 1.0) assert(binFeatures(2) == 0.0) } it("Has the right input schema") { assert(binarizer.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("Has the right output schema") { assert(binarizer.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } describe("binarizer with one input") { val binarizer = BinarizerModel(0.3, ScalarShape()) it("Has the right input schema") { assert(binarizer.inputSchema.fields == Seq(StructField("input", ScalarType.Double.nonNullable))) } it("Has the right output schema") { assert(binarizer.outputSchema.fields == Seq(StructField("output", ScalarType.Double.nonNullable))) } } }
Example 126
Source File: DCTModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class DCTModelSpec extends FunSpec { describe("dct model") { val model = DCTModel(false, 3) describe("issue167") { it("should not modify input features") { val expected = Array(123.4, 23.4, 56.7) val features = Vectors.dense(Array(123.4, 23.4, 56.7)) val dctFeatures = model(features) assert(features.toArray.sameElements(expected)) assert(!features.toArray.sameElements(dctFeatures.toArray)) assert(features.toArray != dctFeatures.toArray) } } describe("input/output schema") { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } } }
Example 127
Source File: WordToVectorModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{BasicType, ListType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalactic.TolerantNumerics import org.scalatest.FunSpec class WordToVectorModelSpec extends FunSpec { implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.000001) describe("word to vector model") { val model = WordToVectorModel(Map("test" -> 1), Array(12)) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", ListType(BasicType.String)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(1)))) } } describe("WordToVectorKernel") { describe("for name") { it("returns the kernel for string") { assert(WordToVectorKernel.forName("default") == WordToVectorKernel.Default) assert(WordToVectorKernel.forName("sqrt") == WordToVectorKernel.Sqrt) } } } describe("Sqrt kernel") { it("produces results using the sqrt kernel (division by sqrt(dot(vec, vec)))") { val hello = Vectors.dense(-0.02743354, 0.13925314, -0.41874424, 0.05635237, -1.01364303, 0.13555442, -0.36437142, 0.10494551, 1.25634718, 0.74919909, -0.75405639, 0.34798685, -0.33082211, -1.83296537, 1.8524611 , 0.16053002, 0.05308712, -0.61047131, -2.04251647, -0.6457383 , -0.06899478, -1.06984603, 1.81890905, -1.57762015, -1.14214861, -0.37704349, -1.13758969, -1.11241293, -0.01736556, 0.55350637, 1.29117298, 0.6780861 , 0.72507775, 0.38882053, -1.13152575) val there = Vectors.dense(0.05639598, -0.0189869 , 0.01236993, 0.00477022, -0.10707449, 0.02502576, 0.0702049 , 0.07715208, 0.03785434, 0.06749821, 0.0028507 , 0.03143736, -0.07800865, -0.066576 , 0.05038944, 0.04129622, 0.05770208, -0.09861612, -0.02329824, -0.03803944, -0.01226865, -0.03243028, 0.05924392, -0.07248155, -0.03818463, 0.03131858, -0.03253553, 0.04506788, -0.02503723, -0.03580079, 0.05802456, -0.00171577, -0.07222789, 0.01021192, 0.01579604) val `{make}` = Vectors.dense(1.69664776, -0.9033435 , -1.13164949, 1.94182444, -0.53111398, 2.28728724, 1.39580894, 1.38314795, -1.03503716, 1.0247947 , -2.175174 , 1.62514234, -0.64084077, -0.20218629, -0.0694286 , 0.37854579, -2.70390058, -2.27423668, -2.79813218, -0.46218753, 0.77630186, -0.82613772, 1.18320072, -2.93088889, 0.6440177 , -0.02956525, -1.51469374, -2.94850779, -0.89843947, -0.16953184, -1.4054004 , -1.22051024, 0.41841957, 0.26196802, 3.39272285) val wordVectors = Array(hello, there, `{make}`).flatMap(_.toArray) val model = WordToVectorModel(Map("hello" -> 0, "there" -> 1, "{make}" -> 2), wordVectors, kernel = WordToVectorKernel.Sqrt) val resultHello = model(Seq("hello")) val expectedHello = Vectors.dense(-0.00489383, 0.02484115, -0.07469912, 0.01005261, -0.18082216, 0.02418134, -0.06499964, 0.01872106, 0.22411777, 0.13364843, -0.13451492, 0.06207682, -0.05901483, -0.32697977, 0.33045758, 0.02863669, 0.00947013, -0.108901 , -0.36436126, -0.11519223, -0.01230787, -0.19084813, 0.32447228, -0.28142914, -0.20374607, -0.06726019, -0.20293281, -0.19844157, -0.00309781, 0.09873912, 0.23033029, 0.1209627 , 0.12934546, 0.06936107, -0.20185107) val resultSentence = model(Seq("hello", "there", "{make}", "qqq")) val expectedSentence = Vectors.dense(0.13878191, -0.06297886, -0.1236953 , 0.16108668, -0.13284827, 0.19686932, 0.0885994 , 0.12588461, 0.02084325, 0.14810168, -0.23535359, 0.16121693, -0.08441966, -0.16903109, 0.14745265, 0.04667632, -0.20855054, -0.23993334, -0.39118211, -0.09216406, 0.05589835, -0.15509237, 0.24620885, 
-0.36842539, -0.04313309, -0.03018265, -0.21592611, -0.32297428, -0.07566708, 0.02800181, -0.00452011, -0.04376236, 0.08615666, 0.05316085, 0.18312679) for ((a, b) <- resultHello.toArray.zip(expectedHello.toArray)) { assert(a === b) } for ((a, b) <- resultSentence.toArray.zip(expectedSentence.toArray)) { assert(a === b) } } } }
Example 128
Source File: VectorAssemblerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.math.BigDecimal import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class VectorAssemblerModelSpec extends FunSpec { val assembler = VectorAssemblerModel(Seq( ScalarShape(), ScalarShape(), TensorShape(2), TensorShape(5))) describe("#apply") { it("assembles doubles and vectors into a new vector") { val expectedArray = Array(45.0, 76.8, 23.0, 45.6, 0.0, 22.3, 45.6, 0.0, 99.3) assert(assembler(Array(45.0, new BigDecimal(76.8), Vectors.dense(Array(23.0, 45.6)), Vectors.sparse(5, Array(1, 2, 4), Array(22.3, 45.6, 99.3)))).toArray.sameElements(expectedArray)) } } describe("input/output schema") { it("has the right input schema") { assert(assembler.inputSchema.fields == Seq( StructField("input0", ScalarType.Double), StructField("input1", ScalarType.Double), StructField("input2", TensorType.Double(2)), StructField("input3", TensorType.Double(5)))) } it("has the right output schema") { assert(assembler.outputSchema.fields == Seq(StructField("output", TensorType.Double(9)))) } } }
Example 129
Source File: InteractionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class InteractionModelSpec extends FunSpec { describe("with all numeric inputs") { val encoderSpec: Array[Array[Int]] = Array(Array(1), Array(1, 1)) val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2))) it("produces the expected interaction vector") { val features = Seq(2.toDouble, Vectors.dense(3, 4)) assert(model(features).toArray.toSeq == Seq(6, 8)) } it("has the right inputs") { assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double), StructField("input1", TensorType.Double(2)))) } it("has the right outputs") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(2)))) } } describe("with one nominal input") { val encoderSpec: Array[Array[Int]] = Array(Array(4), Array(1, 1)) val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2))) it("produce the expected interaction vector") { val features = Seq(2.toDouble, Vectors.dense(3, 4)) assert(model(features).toArray.toSeq == Seq(0, 0, 0, 0, 3, 4, 0, 0)) } it("has the right inputs") { assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double), StructField("input1", TensorType.Double(2)))) } it("has the right outputs") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(8)))) } } }
Example 130
Source File: StandardScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class StandardScalerModelSpec extends FunSpec { describe("standard scaler with dense data") { describe("with mean") { val scaler = StandardScalerModel(None, Some(Vectors.dense(Array(50.0, 20.0, 30.0)))) it("scales based off of the mean") { val expectedVector = Array(5.0, 5.0, 3.0) assert(scaler(Vectors.dense(Array(55.0, 25.0, 33.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } describe("with stdev") { val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), None) it("scales based off the standard deviation") { val expectedVector = Array(1.6, .4375, 1.0) assert(scaler(Vectors.dense(Array(4.0, 3.5, 10.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } describe("with mean and stdev") { val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), Some(Vectors.dense(Array(50.0, 20.0, 30.0)))) it("scales based off the mean and standard deviation") { val expectedVector = Array(1.6, .4375, 1.0) assert(scaler(Vectors.dense(Array(54.0, 23.5, 40.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } } describe("standard scaler with sparse data") { describe("with mean") { val scaler = StandardScalerModel(None, Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100)))) it("scales based off of the mean") { val expectedVector = Array(0.0, 5.0, 5.0, 0.0, 3.0) assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 50, 103))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } describe("with stdev") { val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100))), None) it("scales based off the standard deviation") { val expectedVector = Array(0.0, 1.25, 2.2, 0.0, 1.02) assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 99, 102))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } describe("with mean and stdev") { val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(2.5, 8.0, 10.0))), Some(Vectors.sparse(5, Array(1, 2, 4), Array(50.0, 20.0, 30.0)))) it("scales based off the mean and standard deviation") { val expectedVector = Array(0.0, 1.6, .4375, 0.0, 1.0) val actual = scaler(Vectors.sparse(5, Array(1, 2, 4), Array(54.0, 23.5, 40.0))) assert(actual.toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } } def aModelWithSchema(model: StandardScalerModel, tensorSize: Integer) = { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(tensorSize)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(tensorSize)))) } } }
Example 131
Source File: IDFModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IDFModelSpec extends FunSpec { describe("idf model") { val model = IDFModel(Vectors.dense(Array(1.0, 2.0))) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double()))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double()))) } } }
Example 132
Source File: NormalizerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class NormalizerModelSpec extends FunSpec { describe("normalizer model") { val normalizer = NormalizerModel(20.0, 3) it("normalizes the feature vector using the p normalization value") { val features = Vectors.dense(Array(0.0, 20.0, 40.0)) val norm = normalizer(features).toArray assert(norm(0) < 0.0001 && norm(0) > -0.0001) assert(norm(1) < 0.5001 && norm(1) > 0.49999) assert(norm(2) < 1.0001 && norm(2) > 0.99999) } it("has the right input schema") { assert(normalizer.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(normalizer.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } }
Example 133
Source File: PolynomialExpansionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PolynomialExpansionModelSpec extends FunSpec { describe("polynomial expansion model") { val model = PolynomialExpansionModel(2, 2) it("performs polynomial expansion on an input vector") { val inputArray = Array(2.0,3.0) val expectedVector = Array(2.0, 4.0, 3.0, 6.0, 9.0) assert(model(Vectors.dense(inputArray)).toArray.sameElements(expectedVector)) } it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(5)))) } } }
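The expected vector corresponds to the degree-2 monomials of (x1, x2) in the order implied by the assertion: x1, x1^2, x2, x1*x2, x2^2. A quick check of that ordering for the input (2.0, 3.0), done with plain arithmetic rather than the model's implementation:

object Degree2ExpansionSketch extends App {
  // Degree-2 expansion in the order the spec expects: x1, x1^2, x2, x1*x2, x2^2.
  val (x1, x2) = (2.0, 3.0)
  val expanded = Array(x1, x1 * x1, x2, x1 * x2, x2 * x2)
  println(expanded.mkString(", "))  // 2.0, 4.0, 3.0, 6.0, 9.0
}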
Example 134
Source File: PolynomialFeaturesModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.sklearn import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PolynomialFeaturesModelSpec extends FunSpec { val model = new PolynomialFeaturesModel("[x0,x1,x0^2,x0 x1,x1^2,x0^3,x0^2 x1,x0 x1^2,x1^3]") describe("sklearn polynomial features") { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(9)))) } it("calculates the polynomial features based off given combinations") { val result = model(Vectors.dense(3, 4)) assert(result == Vectors.dense(3.0, 4.0, 9.0, 12.0, 16.0, 27.0, 36.0, 48.0, 64.0)) } } }
Example 135
Source File: KMeansModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class KMeansModelSpec extends FunSpec { val v1 = Vectors.dense(Array(1.0, 2.0, 55.0)) val v2 = Vectors.dense(Array(11.0, 200.0, 55.0)) val v3 = Vectors.dense(Array(100.0, 22.0, 55.0)) val km = KMeansModel(Array(v1, v2, v3), 3) describe("#apply") { it("finds the closest cluster") { assert(km(Vectors.dense(Array(2.0, 5.0, 34.0))) == 0) assert(km(Vectors.dense(Array(20.0, 230.0, 34.0))) == 1) assert(km(Vectors.dense(Array(111.0, 20.0, 56.0))) == 2) } } describe("input/output schema") { it("has the right input schema") { assert(km.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(km.outputSchema.fields == Seq(StructField("prediction", ScalarType.Int.nonNullable))) } } }
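The cluster assertions follow from nearest-center assignment; since every center shares the third coordinate 55.0, the choice is decided by the first two coordinates. A standalone sketch of that lookup using squared Euclidean distance (names are illustrative):

import org.apache.spark.ml.linalg.Vectors

object ClosestCenterSketch extends App {
  val centers = Array(
    Vectors.dense(1.0, 2.0, 55.0),
    Vectors.dense(11.0, 200.0, 55.0),
    Vectors.dense(100.0, 22.0, 55.0))

  def sqDist(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

  // The predicted cluster is the index of the nearest center.
  val point = Vectors.dense(2.0, 5.0, 34.0)
  val prediction = centers.map(c => sqDist(c.toArray, point.toArray)).zipWithIndex.minBy(_._1)._2
  println(prediction)  // 0
}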
Example 136
Source File: BisectingKMeansModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm import org.scalatest.FunSpec class BisectingKMeansModelSpec extends FunSpec { describe("bisecting kmeans model") { val model = new BisectingKMeansModel(ClusteringTreeNode(23, VectorWithNorm(Vectors.dense(1, 2, 3)) , Array())) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 137
Source File: NodeSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import org.scalatest.FunSpec import org.apache.spark.ml.linalg.Vectors class InternalNodeSpec extends FunSpec { describe("#typeName") { it("is InternalNode") { } } describe("#predictImpl") { val leftNode = LeafNode(0.45) val rightNode = LeafNode(0.33) val features = Vectors.dense(Array(0.3)) describe("when split goes left") { it("returns the left node") { val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.4)) assert(node.predictImpl(features) == leftNode) } } describe("when split goes right") { it("returns the right node") { val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.2)) assert(node.predictImpl(features) == rightNode) } } } } class LeafNodeSpec extends FunSpec { describe("#predictImpl") { it("returns itself") { val node = LeafNode(0.45) assert(node.predictImpl(Vectors.dense(Array(0.67))) == node) } } }
Example 138
Source File: MultiLayerPerceptronClassifierModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MultiLayerPerceptronClassifierModelSpec extends FunSpec { describe("multi layer perceptron classifier model") { val model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1))) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(1)), StructField("probability", TensorType.Double(1)), StructField("prediction", ScalarType.Double.nonNullable) )) } } }
Example 139
Source File: GBTClassifierModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{BasicType, ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GBTClassifierModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(-0.1, 2, goLeft = true) val classifier = GBTClassifierModel(trees = Seq(tree1, tree2, tree3), treeWeights = Seq(0.5, 2.0, 1.0), numFeatures = 3) describe("#apply") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) it("predicts the class based on the features") { assert(classifier(features) == 1.0) } } describe("input/output schema") { it("has the right input schema") { assert(classifier.inputSchema.fields == Seq(StructField("features", TensorType(BasicType.Double, Seq(3))))) } it("has the right output schema") { assert(classifier.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable) )) } } }
Example 140
Source File: OneVsRestModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class OneVsRestModelSpec extends FunSpec { describe("one vs rest model") { val model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq( StructField("probability", ScalarType.Double), StructField("raw_prediction", TensorType.Double(1)), StructField("prediction", ScalarType.Double) )) } } }
Example 141
Source File: LogisticRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.scalatest.FunSpec class LogisticRegressionModelSpec extends FunSpec { describe("BinaryLogisticRegression") { val weights = Vectors.dense(1.0, 2.0, 4.0) val intercept = 0.7 describe("issue210: Logistic function not being applied") { val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("applies the logistic function for prediction") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == 1.0) } } describe("issue386:Wrong Binary LogisticRegression predictions") { val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("compare binary logisticRegression prediction with the transform api predictions") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features)))) assert(lr.predict(features) == 1.0) } it("compare binary logisticRegression prediction with rawToPrediction() results") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features))) assert(lr.predict(features) == 1.0) } } describe("issue386:Binary LogisticRegression predictions with 1.0 threshold"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 1.0) it("binary logisticRegression prediction equals zero for 1.0 threshold") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features)))) assert(lr.predict(features) == 0.0) } } describe("issue386:Binary LogisticRegression predictions with 0.0 threshold"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 0.0) it("binary logisticRegression prediction equals 1 for zero threshold") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features))) assert(lr.predict(features) == 1.0) } } describe("input/output schema"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("has the right input schema") { assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(lr.outputSchema.fields == Seq( StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable) )) } } } describe("ProbabilisticLogisticsRegressionModel") { val weights = Matrices.dense(3, 3, Array(1, 2, 3, 1, 2, 3, 1, 2, 3)) val intercept = Vectors.dense(1, 2, 3) val lr = ProbabilisticLogisticsRegressionModel(weights, intercept, None) describe("input/output schema"){ it("has the right input schema") { assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(lr.outputSchema.fields == Seq( StructField("raw_prediction", TensorType.Double(3)), StructField("probability", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable) )) } } } }
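For the features (-1.0, 1.0, -0.5) used throughout, the margin is 1.0*(-1.0) + 2.0*1.0 + 4.0*(-0.5) + 0.7 = -0.3 and the logistic probability is about 0.426, which explains the three threshold cases (above 0.4 and 0.0, below 1.0). A minimal sketch of that computation; the simple probability > threshold comparison illustrates the behaviour the spec asserts and is not the library's code:

import org.apache.spark.ml.linalg.Vectors

object LogisticThresholdSketch extends App {
  val weights = Vectors.dense(1.0, 2.0, 4.0)
  val intercept = 0.7
  val features = Vectors.dense(-1.0, 1.0, -0.5)

  val margin = weights.toArray.zip(features.toArray).map { case (w, x) => w * x }.sum + intercept
  val probability = 1.0 / (1.0 + math.exp(-margin))  // logistic function, ~0.426 here
  println(probability)
  println(if (probability > 0.4) 1.0 else 0.0)  // 1.0, matching the threshold = 0.4 cases
}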
Example 142
Source File: SupportVectorMachineModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class SupportVectorMachineModelSpec extends FunSpec { describe("svm model") { val model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 143
Source File: IsotonicRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IsotonicRegressionModelSpec extends FunSpec { val regression = IsotonicRegressionModel(boundaries = Array(0.0, 4.0, 5.0, 7.0, 8.0), predictions = Seq(100.0, 200.0, 300.0, 400.0, 500.0), isotonic = true, featureIndex = Some(2)) describe("#apply") { it("applies the isotonic regression to a feature vector") { assert(regression(4.0) == 200.0) assert(regression(4.5) == 250.0) assert(regression(Vectors.dense(Array(1.0, 2.3, 7.2))) == 420.0) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double()))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
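The expected values follow from piecewise-linear interpolation between the boundary/prediction pairs, with featureIndex = Some(2) selecting 7.2 from the input vector: 4.5 falls halfway between boundaries 4.0 and 5.0 (so 250.0), and 7.2 lies a fifth of the way from 7.0 to 8.0 (so 420.0). A simplified standalone sketch of that lookup; boundary handling in the real model is more involved and the names are illustrative:

object IsotonicInterpolationSketch extends App {
  val boundaries = Array(0.0, 4.0, 5.0, 7.0, 8.0)
  val predictions = Array(100.0, 200.0, 300.0, 400.0, 500.0)

  // Linear interpolation between the two surrounding boundaries.
  def predict(x: Double): Double = {
    val i = boundaries.lastIndexWhere(_ <= x)
    if (i == boundaries.length - 1) predictions.last
    else {
      val frac = (x - boundaries(i)) / (boundaries(i + 1) - boundaries(i))
      predictions(i) + frac * (predictions(i + 1) - predictions(i))
    }
  }

  println(predict(4.5))  // 250.0
  println(predict(7.2))  // ≈ 420.0 -- featureIndex = 2 selects 7.2 from the vector
}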
Example 144
Source File: AFTSurvivalRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class AFTSurvivalRegressionModelSpec extends FunSpec { describe("AFT survival regression model") { val model = new AFTSurvivalRegressionModel(Vectors.dense(1, 2, 3), 2, Array(4, 5, 6, 7), 3) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features",TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable), StructField("quantiles", TensorType.Double(4)))) } } }
Example 145
Source File: LinearRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.scalatest.FunSpec import org.apache.spark.ml.linalg.Vectors class LinearRegressionModelSpec extends FunSpec { val linearRegression = LinearRegressionModel(Vectors.dense(Array(0.5, 0.75, 0.25)), .33) describe("#apply") { it("applies the linear regression to a feature vector") { assert(linearRegression(Vectors.dense(Array(1.0, 0.5, 1.0))) == 1.455) } } describe("input/output schema") { it("has the right input schema") { assert(linearRegression.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(linearRegression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
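The 1.455 expectation is simply the dot product of the coefficients with the features plus the intercept: 0.5*1.0 + 0.75*0.5 + 0.25*1.0 + 0.33. A minimal sketch of that arithmetic (object name illustrative):

import org.apache.spark.ml.linalg.Vectors

object LinearRegressionSketch extends App {
  val coefficients = Vectors.dense(0.5, 0.75, 0.25)
  val intercept = 0.33
  val features = Vectors.dense(1.0, 0.5, 1.0)
  // prediction = coefficients . features + intercept
  val prediction = coefficients.toArray.zip(features.toArray).map { case (c, x) => c * x }.sum + intercept
  println(prediction)  // ≈ 1.455
}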
Example 146
Source File: RandomForestRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class RandomForestRegressionModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true) val regression = RandomForestRegressionModel(Seq(tree1, tree2, tree3), 5) describe("#predict") { it("uses the forest to make a prediction") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) assert(tree1.predict(features) == 0.5) assert(tree2.predict(features) == 0.75) assert(tree3.predict(features) == 0.1) assert(regression.predict(features) == (0.5 + 0.75 + 0.1) / 3) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 147
Source File: DecisionTreeRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.tree.{ContinuousSplit, InternalNode, LeafNode} import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class DecisionTreeRegressionModelSpec extends FunSpec { val node = InternalNode(LeafNode(Seq(0.78)), LeafNode(Seq(0.34)), ContinuousSplit(0, 0.5)) val regression = DecisionTreeRegressionModel(node, 5) describe("#predict") { it("returns the prediction for the decision tree") { val features = Vectors.dense(Array(0.3, 1.0, 43.23, -21.2, 66.7)) assert(regression.predict(features) == 0.78) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 148
Source File: GBTRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GBTRegressionModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true) val regression = GBTRegressionModel(Seq(tree1, tree2, tree3), Seq(0.5, 2.0, 1.0), 5) describe("#apply") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) it("predicts the value based on the features") { assert(tree1.predict(features) == 0.5) assert(tree2.predict(features) == 0.75) assert(tree3.predict(features) == 0.1) assert(regression.predict(features) == (0.5 * 0.5 + 0.75 * 2.0 + 0.1 * 1.0)) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
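Unlike the random forest spec above, which averages the per-tree predictions, the GBT regression expectation combines them with the tree weights: 0.5*0.5 + 0.75*2.0 + 0.1*1.0 = 1.85. A short check of that arithmetic (object name illustrative):

object GBTWeightedSumSketch extends App {
  val treePredictions = Seq(0.5, 0.75, 0.1)
  val treeWeights = Seq(0.5, 2.0, 1.0)
  // GBT regression prediction: weighted sum of the individual tree predictions.
  val prediction = treePredictions.zip(treeWeights).map { case (p, w) => p * w }.sum
  println(prediction)  // ≈ 1.85
}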
Example 149
Source File: GeneralizedLinearRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarShape, ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GeneralizedLinearRegressionModelSpec extends FunSpec { describe("generalized linear regression model") { val model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features",TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable), StructField("link_prediction", ScalarType.Double.nonNullable))) } } }
Example 150
Source File: LinearSVCOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.bundle.ops import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.{Bundle, Model, NodeShape, Value} import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.linalg.Vectors class LinearSVCOp extends SimpleSparkOp[LinearSVCModel] { override val Model: OpModel[SparkBundleContext, LinearSVCModel] = new OpModel[SparkBundleContext, LinearSVCModel] { override val klazz: Class[LinearSVCModel] = classOf[LinearSVCModel] override def opName: String = Bundle.BuiltinOps.classification.linear_svc override def store(model: Model, obj: LinearSVCModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) // Set the rest of the parameters m.withValue("coefficients", Value.vector(obj.coefficients.toArray)) .withValue("intercept", Value.double(obj.intercept)) .withValue("threshold", Value.double(obj.getThreshold)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LinearSVCModel = { new LinearSVCModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble ).setThreshold(model.value("threshold").getDouble) } } override def sparkInputs(obj: LinearSVCModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LinearSVCModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "prediction" -> obj.predictionCol) } override def sparkLoad(uid: String, shape: NodeShape, model: LinearSVCModel): LinearSVCModel = { new LinearSVCModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept).setThreshold(model.getThreshold) } }
Example 151
Source File: ElementwiseProductOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param class ElementwiseProductOp extends SimpleSparkOp[ElementwiseProduct] { override val Model: OpModel[SparkBundleContext, ElementwiseProduct] = new OpModel[SparkBundleContext, ElementwiseProduct] { override val klazz: Class[ElementwiseProduct] = classOf[ElementwiseProduct] override def opName: String = Bundle.BuiltinOps.feature.elementwise_product override def store(model: Model, obj: ElementwiseProduct) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("scaling_vec", Value.vector(obj.getScalingVec.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): ElementwiseProduct = { new ElementwiseProduct(uid = "").setScalingVec(Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray)) } } override def sparkLoad(uid: String, shape: NodeShape, model: ElementwiseProduct): ElementwiseProduct = { new ElementwiseProduct(uid = uid).setScalingVec(model.getScalingVec) } override def sparkInputs(obj: ElementwiseProduct): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: ElementwiseProduct): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol ) } }
Example 152
Source File: BucketedRandomProjectionLSHOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.core.types.TensorShape import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.bundle._ import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape class BucketedRandomProjectionLSHOp extends SimpleSparkOp[BucketedRandomProjectionLSHModel] { override val Model: OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] = new OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] { override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel] override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh override def store(model: Model, obj: BucketedRandomProjectionLSHModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val dataset = context.context.dataset.get val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape] model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randUnitVectors.map(_.toArray).map(Tensor.denseVector))). withValue("bucket_length", Value.double(obj.getBucketLength)) .withValue("input_size", Value.int(inputShape.dimensions.get(0))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): BucketedRandomProjectionLSHModel = { val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense) val m = new BucketedRandomProjectionLSHModel(uid = "", randUnitVectors = ruv.toArray) m.set(m.bucketLength, model.value("bucket_length").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: BucketedRandomProjectionLSHModel): BucketedRandomProjectionLSHModel = { val m = new BucketedRandomProjectionLSHModel(uid = uid, randUnitVectors = model.randUnitVectors) m.set(m.bucketLength, model.getBucketLength) } override def sparkInputs(obj: BucketedRandomProjectionLSHModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: BucketedRandomProjectionLSHModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 153
Source File: MaxAbsScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.Vectors class MaxAbsScalerOp extends SimpleSparkOp[MaxAbsScalerModel]{ override val Model: OpModel[SparkBundleContext, MaxAbsScalerModel] = new OpModel[SparkBundleContext, MaxAbsScalerModel] { override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel] override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler override def store(model: Model, obj: MaxAbsScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MaxAbsScalerModel = { new MaxAbsScalerModel(uid = "", maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray)) } } override def sparkLoad(uid: String, shape: NodeShape, model: MaxAbsScalerModel): MaxAbsScalerModel = { new MaxAbsScalerModel(uid = uid, maxAbs = model.maxAbs) } override def sparkInputs(obj: MaxAbsScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: MaxAbsScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 154
Source File: StandardScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.linalg.Vectors class StandardScalerOp extends SimpleSparkOp[StandardScalerModel] { override val Model: OpModel[SparkBundleContext, StandardScalerModel] = new OpModel[SparkBundleContext, StandardScalerModel] { override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel] override def opName: String = Bundle.BuiltinOps.feature.standard_scaler override def store(model: Model, obj: StandardScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val mean = if(obj.getWithMean) Some(obj.mean.toArray) else None val std = if(obj.getWithStd) Some(obj.std.toArray) else None model.withValue("mean", mean.map(Value.vector[Double])). withValue("std", std.map(Value.vector[Double])) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): StandardScalerModel = { val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense) val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense) val size = std.map(_.size).orElse(mean.map(_.size)).get val m = new StandardScalerModel(uid = "", std = std.getOrElse(Vectors.sparse(size, Array(), Array())), mean = mean.getOrElse(Vectors.sparse(size, Array(), Array()))) if (std.isEmpty) { m.set(m.withStd, false)} else {m.set(m.withStd, true)} if (mean.isEmpty) { m.set(m.withMean, false)} else {m.set(m.withMean, true)} m } } override def sparkLoad(uid: String, shape: NodeShape, model: StandardScalerModel): StandardScalerModel = { val m = new StandardScalerModel(uid = uid, std = model.std, mean = model.mean) if (model.isDefined(model.withMean)) { m.set(m.withMean, model.getWithMean) } if (model.isDefined(model.withStd)) { m.set(m.withStd, model.getWithStd) } m } override def sparkInputs(obj: StandardScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: StandardScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
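When the loader above finds no stored "mean" or "std", it substitutes an empty sparse vector of the matching size and flips withMean/withStd accordingly; an empty sparse vector is simply an all-zeros vector. A small sketch of that placeholder (object name illustrative):

import org.apache.spark.ml.linalg.Vectors

object EmptySparsePlaceholderSketch extends App {
  // A sparse vector with no active indices materializes as all zeros of the given size.
  val placeholder = Vectors.sparse(3, Array.empty[Int], Array.empty[Double])
  println(placeholder.toArray.mkString(", "))  // 0.0, 0.0, 0.0
}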
Example 155
Source File: MinMaxScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.MinMaxScalerModel import org.apache.spark.ml.linalg.Vectors class MinMaxScalerOp extends SimpleSparkOp[MinMaxScalerModel] { override val Model: OpModel[SparkBundleContext, MinMaxScalerModel] = new OpModel[SparkBundleContext, MinMaxScalerModel] { override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel] override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler override def store(model: Model, obj: MinMaxScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("min", Value.vector(obj.originalMin.toArray)). withValue("max", Value.vector(obj.originalMax.toArray)) .withValue("minValue", Value.double(obj.getMin)) .withValue("maxValue", Value.double(obj.getMax)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MinMaxScalerModel = { new MinMaxScalerModel(uid = "", originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray), originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray)) .setMin(model.getValue("minValue").map(_.getDouble).getOrElse(0.0)) .setMax(model.getValue("maxValue").map(_.getDouble).getOrElse(1.0)) } } override def sparkLoad(uid: String, shape: NodeShape, model: MinMaxScalerModel): MinMaxScalerModel = { val m = new MinMaxScalerModel(uid = uid, originalMin = model.originalMin, originalMax = model.originalMax) if (model.isDefined(model.max)) { m.setMax(model.getMax)} if (model.isDefined(model.min)) { m.setMin(model.getMin)} m } override def sparkInputs(obj: MinMaxScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: MinMaxScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 156
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.clustering.GaussianMixtureModel import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian class GaussianMixtureOp extends SimpleSparkOp[GaussianMixtureModel] { override val Model: OpModel[SparkBundleContext, GaussianMixtureModel] = new OpModel[SparkBundleContext, GaussianMixtureModel] { override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel] override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture override def store(model: Model, obj: GaussianMixtureModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val (rows, cols) = obj.gaussians.headOption. map(g => (g.cov.numRows, g.cov.numCols)). getOrElse((-1, -1)) val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip model.withValue("means", Value.tensorList(means.map(_.toArray).map(Tensor.denseVector))). withValue("covs", Value.tensorList(covs.map(m => DenseTensor(m.toArray, Seq(m.numRows, m.numCols))))). withValue("weights", Value.doubleList(obj.weights.toSeq)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): GaussianMixtureModel = { val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray)) val covs = model.value("covs").getTensorList[Double].map(values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray)) val gaussians = means.zip(covs).map { case (mean, cov) => new MultivariateGaussian(mean, cov) }.toArray val weights = model.value("weights").getDoubleList.toArray new GaussianMixtureModel(uid = "", gaussians = gaussians, weights = weights) } } override def sparkLoad(uid: String, shape: NodeShape, model: GaussianMixtureModel): GaussianMixtureModel = { new GaussianMixtureModel(uid = uid, weights = model.weights, gaussians = model.gaussians) } override def sparkInputs(obj: GaussianMixtureModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: GaussianMixtureModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "probability" -> obj.probabilityCol) } }
Example 157
Source File: MultiLayerPerceptronClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel import org.apache.spark.ml.linalg.Vectors class MultiLayerPerceptronClassifierOp extends SimpleSparkOp[MultilayerPerceptronClassificationModel] { override val Model: OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] = new OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] { override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier override val klazz: Class[MultilayerPerceptronClassificationModel] = classOf[MultilayerPerceptronClassificationModel] override def store(model: Model, obj: MultilayerPerceptronClassificationModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("layers", Value.longList(obj.layers.map(_.toLong))). withValue("weights", Value.vector(obj.weights.toArray)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MultilayerPerceptronClassificationModel = { val m = new MultilayerPerceptronClassificationModel(uid = "", layers = model.value("layers").getLongList.map(_.toInt).toArray, weights = Vectors.dense(model.value("weights").getTensor[Double].toArray)) model.getValue("thresholds"). map(t => m.setThresholds(t.getDoubleList.toArray)). getOrElse(m) } } override def sparkLoad(uid: String, shape: NodeShape, model: MultilayerPerceptronClassificationModel): MultilayerPerceptronClassificationModel = { val m = new MultilayerPerceptronClassificationModel(uid = uid,layers = model.layers, weights = model.weights) if (model.isSet(model.thresholds)) m.setThresholds(model.getThresholds) m } override def sparkInputs(obj: MultilayerPerceptronClassificationModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: MultilayerPerceptronClassificationModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 158
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrices, Vectors} class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] { private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5 override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] = new OpModel[SparkBundleContext, LogisticRegressionModel] { override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel] override def opName: String = Bundle.BuiltinOps.classification.logistic_regression override def store(model: Model, obj: LogisticRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) if(obj.numClasses > 2) { val cm = obj.coefficientMatrix val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))). withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } else { m.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("threshold", Value.double(obj.getThreshold)) } } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = { val numClasses = model.value("num_classes").getLong val r = if(numClasses > 2) { val cmTensor = model.value("coefficient_matrix").getTensor[Double] val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray) val lr = new LogisticRegressionModel(uid = "", coefficientMatrix = coefficientMatrix, interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray), numClasses = numClasses.toInt, isMultinomial = true) model.getValue("thresholds"). map(t => lr.setThresholds(t.getDoubleList.toArray)). 
getOrElse(lr) } else { val lr = new LogisticRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) // default threshold is 0.5 for both Spark and Scikit-learn val threshold = model.getValue("threshold") .map(value => value.getDouble) .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD) lr.setThreshold(threshold) } r } } override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = { val numClasses = model.numClasses val r = if (numClasses > 2) { val lr = new LogisticRegressionModel(uid = uid, coefficientMatrix = model.coefficientMatrix, interceptVector = model.interceptVector, numClasses = numClasses, isMultinomial = true) if(model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) } lr } else { val lr = new LogisticRegressionModel(uid = uid, coefficientMatrix = model.coefficientMatrix, interceptVector = model.interceptVector, numClasses = numClasses, isMultinomial = false) if(model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) } lr } r } override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 159
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.NaiveBayesModel import org.apache.spark.ml.linalg.{Matrices, Vectors} class NaiveBayesClassifierOp extends SimpleSparkOp[NaiveBayesModel] { override val Model: OpModel[SparkBundleContext, NaiveBayesModel] = new OpModel[SparkBundleContext, NaiveBayesModel] { override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel] override def opName: String = Bundle.BuiltinOps.classification.naive_bayes override def store(model: Model, obj: NaiveBayesModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("num_features", Value.long(obj.numFeatures)). withValue("num_classes", Value.long(obj.numClasses)). withValue("pi", Value.vector(obj.pi.toArray)). withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))). withValue("model_type", Value.string(obj.getModelType)). withValue("thresholds", thresholds.map(Value.doubleList(_))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): NaiveBayesModel = { val theta = model.value("theta").getTensor[Double] val nb = new NaiveBayesModel(uid = "", pi = Vectors.dense(model.value("pi").getTensor[Double].toArray), theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray)) val modelType = model.value("model_type").getString model.getValue("thresholds").map(t => nb.setThresholds(t.getDoubleList.toArray)) nb.set(nb.modelType, modelType) } } override def sparkLoad(uid: String, shape: NodeShape, model: NaiveBayesModel): NaiveBayesModel = { val r = new NaiveBayesModel(uid = uid, pi = model.pi, theta = model.theta) if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) } if (model.isDefined(model.modelType)) { r.set(r.modelType, model.getModelType)} r } override def sparkInputs(obj: NaiveBayesModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: NaiveBayesModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 160
Source File: GeneralizedLinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel class GeneralizedLinearRegressionOp extends SimpleSparkOp[GeneralizedLinearRegressionModel] { override val Model: OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] = new OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] { override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression override def store(model: Model, obj: GeneralizedLinearRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val modelWithoutLink = model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("family", Value.string(obj.getFamily)) if (obj.isDefined(obj.link)) { modelWithoutLink.withValue("link", Value.string(obj.getLink)) } else { modelWithoutLink } } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): GeneralizedLinearRegressionModel = { val m = new GeneralizedLinearRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) m.set(m.family, model.value("family").getString) for (link <- model.getValue("link")) { m.set(m.link, link.getString) } m } } override def sparkLoad(uid: String, shape: NodeShape, model: GeneralizedLinearRegressionModel): GeneralizedLinearRegressionModel = { val m = new GeneralizedLinearRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept) m.set(m.family, model.getFamily) if (model.isSet(model.link)) m.set(m.link, model.getLink) m } override def sparkInputs(obj: GeneralizedLinearRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: GeneralizedLinearRegressionModel): Seq[SimpleParamSpec] = { Seq("link_prediction" -> obj.linkPredictionCol, "prediction" -> obj.predictionCol) } }
Example 161
Source File: AFTSurvivalRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.AFTSurvivalRegressionModel class AFTSurvivalRegressionOp extends SimpleSparkOp[AFTSurvivalRegressionModel] { override val Model: OpModel[SparkBundleContext, AFTSurvivalRegressionModel] = new OpModel[SparkBundleContext, AFTSurvivalRegressionModel] { override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression override def store(model: Model, obj: AFTSurvivalRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("quantile_probabilities", Value.doubleList(obj.getQuantileProbabilities)). withValue("scale", Value.double(obj.scale)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, scale = model.value("scale").getDouble). setQuantileProbabilities(model.value("quantile_probabilities").getDoubleList.toArray) } } override def sparkLoad(uid: String, shape: NodeShape, model: AFTSurvivalRegressionModel): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept, scale = model.scale).setQuantileProbabilities(model.getQuantileProbabilities) } override def sparkInputs(obj: AFTSurvivalRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: AFTSurvivalRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "quantiles" -> obj.quantilesCol) } }
Example 162
Source File: LinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.LinearRegressionModel class LinearRegressionOp extends SimpleSparkOp[LinearRegressionModel] { override val Model: OpModel[SparkBundleContext, LinearRegressionModel] = new OpModel[SparkBundleContext, LinearRegressionModel] { override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.linear_regression override def store(model: Model, obj: LinearRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LinearRegressionModel = { new LinearRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: LinearRegressionModel): LinearRegressionModel = { new LinearRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept) } override def sparkInputs(obj: LinearRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LinearRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 163
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.parity import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearSVCParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline() .setStages(Array( new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LinearSVCModel("linear_svc", Vectors.dense(0.44, 0.77), 0.66).setThreshold(0.5).setFeaturesCol("features"))) .fit(dataset) // The string order type is ignored, because once the transformer is built based on some order type, we need to serialize only the string to index map // but not the order in which it has to index. This value we can ignore while we check the transformer values. override val unserializedParams: Set[String] = Set("stringOrderType") }
Example 164
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame import org.apache.spark.ml.linalg.Vectors class LogisticRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficients = Vectors.dense(0.44, 0.77), intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 165
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class MultinomialLogisticRegressionParitySpec extends SparkParityBase { val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0) val ages = Seq(15, 30, 40, 50, 15, 80) val heights = Seq(175, 190, 155, 160, 170, 180) val weights = Seq(67, 100, 57, 56, 56, 88) val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i => Row(labels(i), ages(i), heights(i), weights(i)) }) val schema = new StructType().add("label", DoubleType, nullable = false) .add("age", IntegerType, nullable = false) .add("height", IntegerType, nullable = false) .add("weight", IntegerType, nullable = false) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Pipeline().setStages(Array( new VectorAssembler(). setInputCols(Array("age", "height", "weight")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998, 0.3129112131192873, -0.21959056436528473, -0.09332064875400257, -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)), interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703), numClasses = 3, isMultinomial = true))).fit(dataset) }
Example 166
Source File: ExecuteTransformSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.executor import ml.combust.mleap.core.feature.VectorAssemblerModel import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.runtime.transformer.{Pipeline, PipelineModel} import ml.combust.mleap.runtime.transformer.feature.VectorAssembler import ml.combust.mleap.runtime.transformer.regression.LinearRegression import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.{FunSpec, Matchers} import ml.combust.mleap.core.types._ import org.scalatest.concurrent.ScalaFutures import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future import scala.util.{Success, Try} class ExecuteTransformSpec extends FunSpec with ScalaFutures with Matchers { describe("execute transform") { val pipeline = Pipeline("pipeline", NodeShape(), PipelineModel(Seq( VectorAssembler(shape = NodeShape().withInput("input0", "first_double"). withInput("input1", "second_double"). withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))), LinearRegression(shape = NodeShape.regression(), model = LinearRegressionModel(Vectors.dense(2.0, 2.0), 5.0))))) val input = DefaultLeapFrame(StructType(Seq(StructField("first_double", ScalarType.Double), StructField("second_double" -> ScalarType.Double))).get, Seq(Row(20.0, 10.0))) it("transforms successfully a leap frame in strict mode") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction")), SelectMode.Strict)). flatMap(Future.fromTry) whenReady(result) { frame => { val data = frame.collect().head assert(frame.schema.fields.length == 2) assert(frame.schema.indexOf("features").get == 0) assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0))) assert(data.getDouble(1) == 65.0) } } } it("transforms successfully a leap frame with default options") { val result = ExecuteTransform(pipeline, input, TransformOptions.default).flatMap(Future.fromTry) whenReady(result) { frame => assert(frame.schema.hasField("prediction")) } } it("throws exception when transforming and selecting a missing field in strict mode") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Strict)). flatMap(Future.fromTry) whenReady(result.failed) { ex => ex shouldBe a [IllegalArgumentException] } } it("transforms successfully a leap frame in relaxed mode, ignoring unknown fields") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Relaxed)). flatMap(Future.fromTry) whenReady(result) { frame => { val data = frame.collect().head assert(frame.schema.fields.length == 2) assert(frame.schema.indexOf("features").get == 0) assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0))) assert(data.getDouble(1) == 65.0) } } } it("throws exception when transforming throws exception") { val invalidPipeline = Pipeline("pipeline", NodeShape(), PipelineModel(Seq( VectorAssembler(shape = NodeShape().withInput("input0", "first_double"). withInput("input1", "second_double"). 
withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))), LinearRegression(shape = NodeShape.regression(), // missing coefficient for LR model = LinearRegressionModel(Vectors.dense(2.0), 5.0))))) val result = ExecuteTransform(invalidPipeline, input, TransformOptions.default).flatMap(Future.fromTry) whenReady(result.failed) { ex => ex shouldBe a [IllegalArgumentException] } } } }
Example 167
Source File: SpillTreeSpec.scala From spark-knn with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.knn import org.apache.spark.ml.knn.KNN.RowWithVector import org.apache.spark.ml.linalg.Vectors import org.scalatest.funspec.AnyFunSpec import org.scalatest.matchers.should.Matchers class SpillTreeSpec extends AnyFunSpec with Matchers { describe("SpillTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = SpillTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("with equidistant points on a circle") { val n = 12 val points = (1 to n).map { i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null) } val leafSize = n / 4 describe("built with tau = 0.0") { val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.0) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } it("can return more than min leaf size") { val k = leafSize + 5 points.foreach(v => tree.query(v.vector, k).size shouldBe k) } } describe("built with tau = 0.5") { val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.5) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } it("works for every point to identify itself") { points.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v) } it("has consistent size and iterator") { def check(tree: Tree): Unit = { tree match { case t: SpillTree => t.iterator.size shouldBe t.size check(t.leftChild) check(t.rightChild) case _ => } } check(tree) } } } } describe("HybridTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = HybridTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("with equidistant points on a circle") { val n = 12 val points = (1 to n).map { i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null) } val leafSize = n / 4 val tree = HybridTree.build(points, leafSize = leafSize, tau = 0.5) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } } } }
Example 168
Source File: MetricTreeSpec.scala From spark-knn with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.knn import org.apache.spark.ml.knn.KNN.{RowWithVector, VectorWithNorm} import org.apache.spark.ml.linalg.Vectors import org.scalatest.funspec.AnyFunSpec import org.scalatest.matchers.should.Matchers class MetricTreeSpec extends AnyFunSpec with Matchers { describe("MetricTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = MetricTree.build(IndexedSeq.empty[RowWithVector]) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("without duplicates") { val data = (-5 to 5).flatMap(i => (-5 to 5).map(j => new RowWithVector(Vectors.dense(i, j), null))) List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize => describe(s"with leafSize of $leafSize") { val tree = MetricTree.build(data, leafSize) it("should have correct size") { tree.size shouldBe data.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs data } it("should return vector itself for those in input set") { data.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v) } it("should return nearest neighbors correctly") { tree.query(origin, 5).map(_._1.vector.vector) should contain theSameElementsAs Set( Vectors.dense(-1, 0), Vectors.dense(1, 0), Vectors.dense(0, -1), Vectors.dense(0, 1), Vectors.dense(0, 0) ) tree.query(origin, 9).map(_._1.vector.vector) should contain theSameElementsAs (-1 to 1).flatMap(i => (-1 to 1).map(j => Vectors.dense(i, j))) } it("should have correct number of leaves") { tree.leafCount shouldBe (tree.size / leafSize.toDouble).ceil } it("all points should fall with radius of pivot") { def check(tree: Tree): Unit = { tree.iterator.foreach(_.vector.fastDistance(tree.pivot) <= tree.radius) tree match { case t: MetricTree => check(t.leftChild) check(t.rightChild) case _ => } } check(tree) } } } } describe("with duplicates") { val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))).map(new RowWithVector(_, null)) val tree = MetricTree.build(data) it("should have 2 leaves") { tree.leafCount shouldBe 2 } it("should return all available duplicated candidates") { val res = tree.query(origin, 5).map(_._1.vector.vector) res.size shouldBe 5 res.toSet should contain theSameElementsAs Array(Vectors.dense(0.0, 1.0)) } } describe("for other corner cases") { it("queryCost should work on Empty") { Empty.distance(new KNNCandidates(new VectorWithNorm(origin), 1)) shouldBe 0 Empty.distance(new VectorWithNorm(origin)) shouldBe 0 } } } }
Example 169
Source File: MLPipelineTrackerIT.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.ml import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.Matchers import com.hortonworks.spark.atlas._ import com.hortonworks.spark.atlas.types._ import com.hortonworks.spark.atlas.TestUtils._ class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport { private val atlasClient = new RestAtlasClient(atlasClientConf) def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition)) } // Enable it to run integrated test. it("pipeline and pipeline model") { val uri = "hdfs://" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity)) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity)) val tableEntities1 = getTableEntity("chris1") val tableEntities2 = getTableEntity("chris2") atlasClient.createEntitiesWithDependencies(tableEntities1) atlasClient.createEntitiesWithDependencies(tableEntities2) } }
Example 170
Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.types import java.io.File import org.apache.atlas.{AtlasClient, AtlasConstants} import org.apache.atlas.model.instance.AtlasEntity import org.apache.commons.io.FileUtils import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.{FunSuite, Matchers} import com.hortonworks.spark.atlas.TestUtils._ import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport} class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport { def getTableEntity(tableName: String): AtlasEntity = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) val tableEntities = internal.sparkTableToEntity( tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition)) val tableEntity = tableEntities.entity tableEntity } test("pipeline, pipeline model, fit and transform") { val uri = "/" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) pipelineDirEntity.entity.getAttribute("uri") should be (uri) pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir) pipelineDirEntity.dependencies.length should be (0) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) modelDirEntity.entity.getAttribute("uri") should be (uri) modelDirEntity.entity.getAttribute("directory") should be (modelDir) modelDirEntity.dependencies.length should be (0) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING) pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( pipeline.uid) pipelineEntity.entity.getAttribute("name") should be (pipeline.uid) pipelineEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false)) pipelineEntity.dependencies should be (Seq(pipelineDirEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) val modelUid = model.uid.replaceAll("pipeline", "model") modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING) modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid) modelEntity.entity.getAttribute("name") should be (modelUid) modelEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false)) modelEntity.dependencies should be (Seq(modelDirEntity)) FileUtils.deleteDirectory(new File("tmp")) } }
Example 171
Source File: RBFKernel.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.numerics.{exp, inf} import org.apache.spark.ml.linalg.{Vector, Vectors} class RBFKernel(private var sigma: Double, private val lower: Double = 1e-6, private val upper: Double = inf) extends TrainDatasetBearingKernel with NoiselessKernel with SameOnDiagonalKernel { def this() = this(1) override def setHyperparameters(value: BDV[Double]): RBFKernel.this.type = { sigma = value(0) this } override def getHyperparameters: BDV[Double] = BDV[Double](sigma) override def numberOfHyperparameters: Int = 1 private def getSigma() = sigma private var squaredDistances: Option[BDM[Double]] = None override def hyperparameterBoundaries: (BDV[Double], BDV[Double]) = { (BDV[Double](lower), BDV[Double](upper)) } override def setTrainingVectors(vectors: Array[Vector]): this.type = { super.setTrainingVectors(vectors) val sqd = BDM.zeros[Double](vectors.length, vectors.length) for (i <- vectors.indices; j <- 0 to i) { val dist = Vectors.sqdist(vectors(i), vectors(j)) sqd(i, j) = dist sqd(j, i) = dist } squaredDistances = Some(sqd) this } override def trainingKernel(): BDM[Double] = { val result = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def trainingKernelAndDerivative(): (BDM[Double], Array[BDM[Double]]) = { val sqd = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) val kernel = trainingKernel() val derivative = sqd *:* kernel derivative /= cube(getSigma()) (kernel, Array(derivative)) } override def crossKernel(test: Array[Vector]): BDM[Double] = { val train = getTrainingVectors val result = BDM.zeros[Double](test.length, train.length) for (i <- test.indices; j <- train.indices) result(i, j) = Vectors.sqdist(test(i), train(j)) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def selfKernel(test: Vector): Double = 1d private def sqr(x: Double) = x * x private def cube(x: Double) = x * x * x override def toString = f"RBFKernel(sigma=$sigma%1.1e)" }
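The kernel above evaluates k(x_i, x_j) = exp(-||x_i - x_j||^2 / (2 * sigma^2)), using Vectors.sqdist for the squared Euclidean distance. A standalone sketch of a single kernel entry (plain Scala, reusing nothing from the class):

import org.apache.spark.ml.linalg.Vectors

val x = Vectors.dense(1.0, 2.0)
val y = Vectors.dense(2.0, 3.0)
val sigma = math.sqrt(0.2)
// exp(-||x - y||^2 / (2 * sigma^2))
val k = math.exp(-Vectors.sqdist(x, y) / (2.0 * sigma * sigma))
// k is approximately 6.7379e-3, the value expected for these points in the RBFKernelTest example further below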
Example 172
Source File: Scaling.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.util

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD

private[ml] trait Scaling {
  def scale(data: RDD[LabeledPoint]) = {
    val x = data.map(x => DenseVector(x.features.toArray)).cache()
    val y = data.map(_.label)
    val n = x.count().toDouble
    val mean = x.reduce(_ + _) / n
    val centered = x.map(_ - mean).cache()
    val variance = centered.map(xx => xx *:* xx).reduce(_ + _) / n
    x.unpersist()
    val varianceNoZeroes = variance.map(v => if (v > 0d) v else 1d)
    val scaled = centered.map(_ /:/ sqrt(varianceNoZeroes)).map(_.toArray).map(Vectors.dense).zip(y).map {
      case (f, y) => LabeledPoint(y, f)
    }.cache()
    if (scaled.count() > 0) // ensure scaled is materialized
      centered.unpersist()
    scaled
  }
}
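The scaling performed here is plain per-feature standardization, x' = (x - mean) / sqrt(variance), with zero-variance features passed through unchanged. A toy sketch of that arithmetic on a single feature column (illustrative only, not part of the trait):

val column = Seq(2.0, 4.0, 6.0)
val mean = column.sum / column.size
val variance = column.map(v => (v - mean) * (v - mean)).sum / column.size
val standardized = column.map(v => (v - mean) / math.sqrt(variance))
// mean = 4.0, variance = 8/3, standardized is approximately Seq(-1.2247, 0.0, 1.2247)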
Example 173
Source File: MNIST.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.examples import org.apache.spark.ml.classification.GaussianProcessClassifier import org.apache.spark.ml.commons.kernel.RBFKernel import org.apache.spark.ml.commons.util.Scaling import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object MNIST extends App with Scaling { val name = "MNIST" val spark = SparkSession.builder().appName(name).master(s"local[${args(0)}]").getOrCreate() val path = args(1) val parallelism = args(0).toInt * 4 val forExpert = args(2).toInt val activeSet = args(3).toInt import spark.sqlContext.implicits._ val dataset = (scale _ andThen labels201 _) (spark.read.format("csv").load(path).rdd.map(row => { val features = Vectors.dense((1 until row.length).map("_c" + _).map(row.getAs[String]).map(_.toDouble).toArray) val label = row.getAs[String]("_c0").toDouble LabeledPoint(label, features) }).cache()).toDF.repartition(parallelism).cache() val gp = new GaussianProcessClassifier() .setDatasetSizeForExpert(forExpert) .setActiveSetSize(activeSet) .setKernel(() => new RBFKernel(10)) .setTol(1e-3) val cv = new TrainValidationSplit() .setEstimator(gp) .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy")) .setEstimatorParamMaps(new ParamGridBuilder().build()) .setTrainRatio(0.8) println("Accuracy: " + cv.fit(dataset).validationMetrics.toList) def labels201(data: RDD[LabeledPoint]) : RDD[LabeledPoint] = { val old2new = data.map(_.label).distinct().collect().zipWithIndex.toMap data.map(lp => LabeledPoint(old2new(lp.label), lp.features)) } }
Example 174
Source File: Iris.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.examples import org.apache.spark.ml.classification.{GaussianProcessClassifier, OneVsRest} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.sql.SparkSession object Iris extends App { val name = "Iris" val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate() import spark.sqlContext.implicits._ val name2indx = Map("Iris-versicolor" -> 0, "Iris-setosa" -> 1, "Iris-virginica" -> 2) val dataset = spark.read.format("csv").load("data/iris.csv").rdd.map(row => { val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3") .map(col => row.getAs[String](col).toDouble)) val label = name2indx(row.getAs[String]("_c4")) LabeledPoint(label, features) }).toDF val gp = new GaussianProcessClassifier().setDatasetSizeForExpert(20).setActiveSetSize(30) val ovr = new OneVsRest().setClassifier(gp) val cv = new CrossValidator() .setEstimator(ovr) .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy")) .setEstimatorParamMaps(new ParamGridBuilder().build()) .setNumFolds(10) println("Accuracy: " + cv.fit(dataset).avgMetrics.toList) }
Example 175
Source File: PerformanceBenchmark.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.benchmark import breeze.linalg.{sum, DenseMatrix => BDM, DenseVector => BDV, _} import breeze.numerics.sin import org.apache.spark.ml.commons.kernel.RBFKernel import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression import org.apache.spark.sql.SparkSession import scala.util.Random object PerformanceBenchmark extends App { val spark = SparkSession.builder() .appName("bench") .master(s"local[${args(0)}]").getOrCreate() import spark.sqlContext.implicits._ val sampleSize = args(2).toInt val nFeatures = 3 val parallelism = args(0).toInt * 4 val expertSampleSize = args(1).toInt val instancesRDD = spark.sparkContext.parallelize(0 until parallelism).flatMap(index => { val random = new Random(13 * index) val X = BDM.create(sampleSize/parallelism, nFeatures, Array.fill(sampleSize * nFeatures/parallelism)(random.nextDouble())) val Y = sin(sum(X(*, ::)) / 1000d).toArray (0 until X.rows).map{ i=> val x = X(i, ::) val y = Y(i) LabeledPoint(y, Vectors.dense(x.t.toArray)) } }) val instances = instancesRDD.toDF.cache() instances.count() val gp = new GaussianProcessRegression() .setKernel(() => new RBFKernel(0.1)) .setDatasetSizeForExpert(expertSampleSize) .setActiveSetSize(expertSampleSize) .setSeed(13) .setSigma2(1e-3) time(gp.fit(instances)) def time[T](f: => T): T = { val start = System.currentTimeMillis() val result = f println("TIME: " + (System.currentTimeMillis() - start)) result } }
Example 176
Source File: Synthetics.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.examples import breeze.linalg._ import breeze.numerics._ import org.apache.spark.ml.commons.KMeansActiveSetProvider import org.apache.spark.ml.commons.kernel.{RBFKernel, WhiteNoiseKernel, _} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression object Synthetics extends App with GPExample { import spark.sqlContext.implicits._ override def name = "Synthetics" val noiseVar = 0.01 val g = breeze.stats.distributions.Gaussian(0, math.sqrt(noiseVar)) val X = linspace(0, 1, length = 2000).toDenseMatrix val Y = sin(X).toArray.map(y => y + g.sample()) val instances = spark.sparkContext.parallelize(X.toArray.zip(Y).map { case(v, y) => LabeledPoint(y, Vectors.dense(Array(v)))}).toDF val gp = new GaussianProcessRegression() .setKernel(() => 1*new RBFKernel(0.1, 1e-6, 10) + WhiteNoiseKernel(0.5, 0, 1)) .setDatasetSizeForExpert(100) .setActiveSetProvider(new KMeansActiveSetProvider()) .setActiveSetSize(100) .setSeed(13) .setSigma2(1e-3) cv(gp, instances, 0.11) }
Example 177
Source File: Airfoil.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.examples import org.apache.spark.ml.commons.kernel.{ARDRBFKernel, _} import org.apache.spark.ml.commons.util.Scaling import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression object Airfoil extends App with GPExample with Scaling { import spark.sqlContext.implicits._ override def name = "Airfoil" val airfoil = readSCV("data/airfoil.csv") val scaled = scale(airfoil).toDF val gp = new GaussianProcessRegression() .setActiveSetSize(1000) .setSigma2(1e-4) .setKernel(() => 1 * new ARDRBFKernel(5) + 1.const * new EyeKernel) cv(gp, scaled, 2.1) def readSCV(path : String) = { spark.read.format("csv").load(path).rdd.map(row => { val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3", "_c4") .map(col => row.getAs[String](col).toDouble)) LabeledPoint(row.getAs[String]("_c5").toDouble, features) }) } }
Example 178
Source File: ARDRBFKernelTest.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{all, DenseMatrix => BDM, DenseVector => BDV} import breeze.numerics.abs import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite class ARDRBFKernelTest extends FunSuite { private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense) private def computationalDerivative(beta: BDV[Double], h: Double): BDM[Double] = { val left = new ARDRBFKernel(beta - h) val right = new ARDRBFKernel(beta + h) left.setTrainingVectors(dataset) right.setTrainingVectors(dataset) (right.trainingKernel() - left.trainingKernel()) / (2 * h) } test("being called after `setTrainingVector`," + " `derivative` should return the correct kernel matrix derivative") { val beta = BDV[Double](0.2, 0.3) val ard = new ARDRBFKernel(beta) ard.setTrainingVectors(dataset) val analytical = ard.trainingKernelAndDerivative()._2.reduce(_ + _) val computational = computationalDerivative(beta, 1e-3) assert(all(abs(analytical - computational) <:< 1e-3)) } }
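The test validates the analytical gradient against a central finite difference, (K(beta + h) - K(beta - h)) / (2h), whose error shrinks as O(h^2). A generic one-dimensional sketch of the same style of check (plain Scala, unrelated to the kernel classes):

// Central-difference check of d/dx exp(-x): analytical vs. numerical.
val f: Double => Double = x => math.exp(-x)
val dfAnalytical: Double => Double = x => -math.exp(-x)
val h = 1e-3
val x0 = 0.7
val dfNumerical = (f(x0 + h) - f(x0 - h)) / (2 * h)
assert(math.abs(dfNumerical - dfAnalytical(x0)) < 1e-6)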
Example 179
Source File: RBFKernelTest.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix, DenseVector, all} import breeze.numerics.abs import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite class RBFKernelTest extends FunSuite { test("Calling `trainingKernel` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernel() } } test("Calling `derivative` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernelAndDerivative() } } private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense) test("being called after `setTrainingVector`," + " `trainingKernel` should return the correct kernel matrix") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset) val correctKernelMatrix = DenseMatrix((1.000000e+00, 6.737947e-03, 3.053624e-45), (6.737947e-03, 1.000000e+00, 7.187782e-28), (3.053624e-45, 7.187782e-28, 1.000000e+00)) assert(all(abs(rbf.trainingKernel() - correctKernelMatrix) <:< 1e-4)) } private def computationalDerivative(sigma: Double, h: Double) = { val rbfLeft = new RBFKernel(sigma - h) val rbfRight = new RBFKernel(sigma + h) rbfLeft.setTrainingVectors(dataset) rbfRight.setTrainingVectors(dataset) (rbfRight.trainingKernel() - rbfLeft.trainingKernel()) / (2 * h) } test("being called after `setTrainingVector`," + " `derivative` should return the correct kernel matrix derivative") { val rbf = new RBFKernel(0.2) rbf.setTrainingVectors(dataset) val analytical = rbf.trainingKernelAndDerivative()._2(0) val computational = computationalDerivative(0.2, 1e-3) assert(all(abs(analytical - computational) <:< 1e-3)) } test("crossKernel returns correct kernel") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset.take(1)) val correctCrossKernel = DenseMatrix((6.737947e-03, 3.053624e-45)) assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } test("crossKernel returns correct kernel if called on a single vector") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset(0)) val correctCrossKernel = DenseVector(6.737947e-03, 3.053624e-45).t assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } }
Example 180
Source File: SparkVector.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.vector

import org.apache.spark.ml.linalg.{Vector, Vectors}

object SparkVector {

  def main(args: Array[String]): Unit = {
    // Create a dense vector (1.0, 0.0, 2.0).
    val dVectorOne: Vector = Vectors.dense(1.0, 0.0, 2.0)
    println("dVectorOne:" + dVectorOne)

    // Create a sparse vector (1.0, 0.0, 2.0, 3.0) by specifying the indices
    // and values of its nonzero entries.
    val sVectorOne: Vector = Vectors.sparse(4, Array(0, 2, 3), Array(1.0, 2.0, 3.0))

    // Create the same sparse vector (1.0, 0.0, 2.0, 3.0) by specifying its
    // nonzero entries as (index, value) pairs.
    val sVectorTwo: Vector = Vectors.sparse(4, Seq((0, 1.0), (2, 2.0), (3, 3.0)))

    println("sVectorOne:" + sVectorOne)
    println("sVectorTwo:" + sVectorTwo)

    val sVectorOneMax = sVectorOne.argmax
    val sVectorOneNumNonZeros = sVectorOne.numNonzeros
    val sVectorOneSize = sVectorOne.size
    val sVectorOneArray = sVectorOne.toArray

    println("sVectorOneMax:" + sVectorOneMax)
    println("sVectorOneNumNonZeros:" + sVectorOneNumNonZeros)
    println("sVectorOneSize:" + sVectorOneSize)
    println("sVectorOneArray:" + sVectorOneArray.mkString(", "))

    val dVectorOneToSparse = dVectorOne.toSparse
    println("dVectorOneToSparse:" + dVectorOneToSparse)
  }
}
Example 181
Source File: MLUserDefinedType.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.sql.types.DataType import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.scalacheck.{Arbitrary, Gen} object MLUserDefinedType { def unapply(dataType: DataType): Option[Gen[Any]] = dataType match { case MatrixType => { val dense = for { rows <- Gen.choose(0, 20) cols <- Gen.choose(0, 20) values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double]) } yield new DenseMatrix(rows, cols, values) val sparse = dense.map(_.toSparse) Some(Gen.oneOf(dense, sparse)) } case VectorType => { val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense) val sparse = for { indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1)) values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double]) } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values)) Some(Gen.oneOf(dense, sparse)) } case _ => None } }
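A hypothetical usage sketch (not from spark-testing-base) showing how the extractor above could supply a ScalaCheck generator of arbitrary ml Vectors for the VectorType UDT; the sampling loop and printed fields are illustrative assumptions:

import org.apache.spark.ml.linalg.{SQLDataTypes, Vector}
import org.scalacheck.Gen

// Obtain the generator that MLUserDefinedType associates with VectorType.
val vectorGen: Gen[Any] = SQLDataTypes.VectorType match {
  case MLUserDefinedType(gen) => gen
}
// Sample a few vectors and inspect them (Gen.sample may return None, hence flatten).
val samples: Seq[Vector] = Seq.fill(3)(vectorGen.sample).flatten.map(_.asInstanceOf[Vector])
samples.foreach(v => println(s"size=${v.size} nonZeros=${v.numNonzeros}"))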
Example 182
Source File: LogisticRegressionWorkload.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.ml import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._ import com.ibm.sparktc.sparkbench.utils.SaveModes import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator => BCE} import org.apache.spark.sql.{DataFrame, Row, SparkSession} // ¯\_(ツ)_/¯ // the logic for this workload came from: // https://github.com/szilard/benchm-ml/blob/master/1-linear/5-spark.txt // ¯\_(ツ)_/¯ case class LogisticRegressionResult( name: String, appid: String, start_time: Long, input: String, train_count: Long, train_time: Long, test_file: String, test_count: Long, test_time: Long, load_time: Long, count_time: Long, total_runtime: Long, area_under_roc: Double ) object LogisticRegressionWorkload extends WorkloadDefaults { val name = "lr-bml" def apply(m: Map[String, Any]) = new LogisticRegressionWorkload( input = Some(getOrThrow(m, "input").asInstanceOf[String]), output = getOrDefault[Option[String]](m, "workloadresultsoutputdir", None), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), testFile = getOrThrow(m, "testfile").asInstanceOf[String], numPartitions = getOrDefault[Int](m, "numpartitions", 32), cacheEnabled = getOrDefault[Boolean](m, "cacheenabled", true) ) } case class LogisticRegressionWorkload( input: Option[String], output: Option[String], saveMode: String, testFile: String, numPartitions: Int, cacheEnabled: Boolean ) extends Workload { private[ml] def load(filename: String)(implicit spark: SparkSession): DataFrame = { import spark.implicits._ spark.sparkContext.textFile(filename) .map { line => val vv = line.split(',').map(_.toDouble) val label = vv(0) val features = Vectors.dense(vv.slice(1, vv.length)).toSparse (label, features) }.toDF("label", "features") } private[ml] def ld(fn: String)(implicit spark: SparkSession) = time { val ds = load(fn)(spark).repartition(numPartitions) if (cacheEnabled) ds.cache ds } override def doWorkload(df: Option[DataFrame], spark: SparkSession): DataFrame = { val startTime = System.currentTimeMillis val (ltrainTime, d_train) = ld(s"${input.get}")(spark) val (ltestTime, d_test) = ld(s"$testFile")(spark) val (countTime, (trainCount, testCount)) = time { (d_train.count(), d_test.count()) } val (trainTime, model) = time(new LogisticRegression().setTol(1e-4).fit(d_train)) val (testTime, areaUnderROC) = time(new BCE().setMetricName("areaUnderROC").evaluate(model.transform(d_test))) val loadTime = ltrainTime + ltestTime //spark.createDataFrame(Seq(SleepResult("sleep", timestamp, t))) spark.createDataFrame(Seq(LogisticRegressionResult( name = "lr-bml", appid = spark.sparkContext.applicationId, startTime, input.get, train_count = trainCount, trainTime, testFile, test_count = testCount, testTime, loadTime, countTime, loadTime + trainTime + testTime, areaUnderROC ))) } }
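For clarity, the private load method above turns each CSV line into a (label, sparse features) pair, treating column 0 as the label. A hand-worked sketch of that parse for a single line (illustrative only, not part of the workload):

import org.apache.spark.ml.linalg.Vectors

val line = "1.0,0.0,2.5,0.0"
val vv = line.split(',').map(_.toDouble)
val label = vv(0)
val features = Vectors.dense(vv.slice(1, vv.length)).toSparse
// label == 1.0, features == (3,[1],[2.5])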
Example 183
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat.distribution import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.util.TestingUtils._ class MultivariateGaussianSuite extends SparkMLFunSuite { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
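As a quick sanity check on the first expected value in the univariate test, the standard normal density at x = 0 is 1 / sqrt(2 * pi) (illustrative, not part of the suite):

val standardNormalAtZero = 1.0 / math.sqrt(2.0 * math.Pi)
// 0.3989422804014327, matching the 0.39894 expected above within the 1e-5 tolerance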
Example 184
Source File: AFTSurvivalRegressionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.AFTSurvivalRegression // $example off$ import org.apache.spark.sql.SparkSession object AFTSurvivalRegressionExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("AFTSurvivalRegressionExample") .getOrCreate() // $example on$ val training = spark.createDataFrame(Seq( (1.218, 1.0, Vectors.dense(1.560, -0.605)), (2.949, 0.0, Vectors.dense(0.346, 2.158)), (3.627, 0.0, Vectors.dense(1.380, 0.231)), (0.273, 1.0, Vectors.dense(0.520, 1.151)), (4.199, 0.0, Vectors.dense(0.795, -0.226)) )).toDF("label", "censor", "features") val quantileProbabilities = Array(0.3, 0.6) val aft = new AFTSurvivalRegression() .setQuantileProbabilities(quantileProbabilities) .setQuantilesCol("quantiles") val model = aft.fit(training) // Print the coefficients, intercept and scale parameter for AFT survival regression println(s"Coefficients: ${model.coefficients}") println(s"Intercept: ${model.intercept}") println(s"Scale: ${model.scale}") model.transform(training).show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 185
Source File: NormalizerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object NormalizerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("NormalizerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.5, -1.0)), (1, Vectors.dense(2.0, 1.0, 1.0)), (2, Vectors.dense(4.0, 10.0, 2.0)) )).toDF("id", "features") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures") .setP(1.0) val l1NormData = normalizer.transform(dataFrame) println("Normalized using L^1 norm") l1NormData.show() // Normalize each Vector using $L^\infty$ norm. val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) println("Normalized using L^inf norm") lInfNormData.show() // $example off$ spark.stop() } } // scalastyle:on println
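For reference, L^1 normalization divides each component by the sum of absolute values; the first row above, (1.0, 0.5, -1.0), has an L^1 norm of 2.5 and normalizes to (0.4, 0.2, -0.4). The same arithmetic by hand (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val v = Vectors.dense(1.0, 0.5, -1.0)
val l1Norm = v.toArray.map(math.abs).sum                   // 2.5
val normalized = Vectors.dense(v.toArray.map(_ / l1Norm))  // [0.4, 0.2, -0.4]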
Example 186
Source File: VectorSlicerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import java.util.Arrays import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.feature.VectorSlicer import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType // $example off$ import org.apache.spark.sql.SparkSession object VectorSlicerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorSlicerExample") .getOrCreate() // $example on$ val data = Arrays.asList( Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))), Row(Vectors.dense(-2.0, 2.3, 0.0)) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]) val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField()))) val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features") slicer.setIndices(Array(1)).setNames(Array("f3")) // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) val output = slicer.transform(dataset) output.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 187
Source File: ChiSqSelectorExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.ChiSqSelector import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object ChiSqSelectorExample { def main(args: Array[String]) { val spark = SparkSession .builder .appName("ChiSqSelectorExample") .getOrCreate() import spark.implicits._ // $example on$ val data = Seq( (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0), (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0), (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0) ) val df = spark.createDataset(data).toDF("id", "features", "clicked") val selector = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("features") .setLabelCol("clicked") .setOutputCol("selectedFeatures") val result = selector.fit(df).transform(df) println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected") result.show() // $example off$ spark.stop() } } // scalastyle:on println
Example 188
Source File: DCTExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.DCT import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object DCTExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("DCTExample") .getOrCreate() // $example on$ val data = Seq( Vectors.dense(0.0, 1.0, -2.0, 3.0), Vectors.dense(-1.0, 2.0, 4.0, -7.0), Vectors.dense(14.0, -2.0, -5.0, 1.0)) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val dct = new DCT() .setInputCol("features") .setOutputCol("featuresDCT") .setInverse(false) val dctDf = dct.transform(df) dctDf.select("featuresDCT").show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 189
Source File: VectorAssemblerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object VectorAssemblerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorAssemblerExample") .getOrCreate() // $example on$ val dataset = spark.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") val output = assembler.transform(dataset) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false) // $example off$ spark.stop() } } // scalastyle:on println
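The assembled "features" column for the single input row above is simply the concatenation of the selected columns in order. A hand-worked sketch of that concatenation (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val hour = 18.0
val mobile = 1.0
val userFeatures = Vectors.dense(0.0, 10.0, 0.5)
val assembled = Vectors.dense(Array(hour, mobile) ++ userFeatures.toArray)
// assembled == [18.0, 1.0, 0.0, 10.0, 0.5], the value the example prints for "features"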
Example 190
Source File: PCAExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.PCA import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object PCAExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("PCAExample") .getOrCreate() // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df) val result = pca.transform(df).select("pcaFeatures") result.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 191
Source File: ElementwiseProductExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object ElementwiseProductExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("ElementwiseProductExample") .getOrCreate() // $example on$ // Create some vector data; also works for sparse vectors val dataFrame = spark.createDataFrame(Seq( ("a", Vectors.dense(1.0, 2.0, 3.0)), ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector") val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct() .setScalingVec(transformingVector) .setInputCol("vector") .setOutputCol("transformedVector") // Batch transform the vectors to create new column: transformer.transform(dataFrame).show() // $example off$ spark.stop() } } // scalastyle:on println
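ElementwiseProduct multiplies each input vector component-wise (Hadamard product) by the scaling vector. For row "a" above that is (1.0, 2.0, 3.0) times (0.0, 1.0, 2.0) element by element, giving (0.0, 2.0, 6.0); a hand-worked sketch (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val row = Vectors.dense(1.0, 2.0, 3.0)
val scalingVec = Vectors.dense(0.0, 1.0, 2.0)
val transformed = Vectors.dense(row.toArray.zip(scalingVec.toArray).map { case (a, b) => a * b })
// transformed == [0.0, 2.0, 6.0]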
Example 192
Source File: MinMaxScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object MinMaxScalerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("MinMaxScalerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.1, -1.0)), (1, Vectors.dense(2.0, 1.1, 1.0)), (2, Vectors.dense(3.0, 10.1, 3.0)) )).toDF("id", "features") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures") // Compute summary statistics and generate MinMaxScalerModel val scalerModel = scaler.fit(dataFrame) // rescale each feature to range [min, max]. val scaledData = scalerModel.transform(dataFrame) println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]") scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() } } // scalastyle:on println
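MinMaxScaler rescales each feature to x' = (x - featureMin) / (featureMax - featureMin) * (max - min) + min, using the default output range [0, 1] here. For the first feature above (values 1.0, 2.0, 3.0) that gives 0.0, 0.5 and 1.0; the same arithmetic by hand (illustrative only):

val values = Seq(1.0, 2.0, 3.0)
val (featureMin, featureMax) = (values.min, values.max)
val rescaled = values.map(x => (x - featureMin) / (featureMax - featureMin)) // Seq(0.0, 0.5, 1.0)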
Example 193
Source File: PolynomialExpansionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object PolynomialExpansionExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("PolynomialExpansionExample") .getOrCreate() // $example on$ val data = Array( Vectors.dense(2.0, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(3.0, -1.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) val polyDF = polyExpansion.transform(df) polyDF.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 194
Source File: MaxAbsScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object MaxAbsScalerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("MaxAbsScalerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.1, -8.0)), (1, Vectors.dense(2.0, 1.0, -4.0)), (2, Vectors.dense(4.0, 10.0, 8.0)) )).toDF("id", "features") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures") // Compute summary statistics and generate MaxAbsScalerModel val scalerModel = scaler.fit(dataFrame) // rescale each feature to range [-1, 1] val scaledData = scalerModel.transform(dataFrame) scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() } }
Example 195
Source File: DCT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 196
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 197
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 198
Source File: MaxAbsScalerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 199
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Toy dataset, including the top feature for a chi-squared test.
    // These data are chosen such that each feature's test has a distinct p-value.
  }

  val allParamSettings: Map[String, Any] = Map(
    "selectorType" -> "percentile",
    "numTopFeatures" -> 1,
    "percentile" -> 0.12,
    "outputCol" -> "myOutput"
  )
}
Example 200
Source File: DCTSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }