org.apache.spark.ml.linalg.Vectors Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Vectors.
The project and source file each example was taken from are noted in its heading.
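Before the full examples, here is a minimal sketch of the Vectors factory methods themselves; the object and value names (VectorsBasics, dense, sparse) are illustrative and not taken from any of the projects below.

import org.apache.spark.ml.linalg.{Vector, Vectors}

object VectorsBasics {
  def main(args: Array[String]): Unit = {
    // Dense vector: every entry is stored explicitly.
    val dense: Vector = Vectors.dense(1.0, 0.0, 3.0)

    // Sparse vector: the size plus (index, value) pairs for the non-zero entries.
    val sparse: Vector = Vectors.sparse(3, Seq((0, 1.0), (2, 3.0)))

    // Equality is by value, so the two representations above compare equal.
    assert(dense == sparse)

    // A few utilities defined on Vectors / Vector.
    println(Vectors.norm(dense, 2.0))      // L2 norm
    println(Vectors.sqdist(dense, sparse)) // squared Euclidean distance (0.0 here)
    println(dense.toArray.mkString(", "))  // 1.0, 0.0, 3.0
  }
}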
Example 1
Source File: MultilayerPerceptronClassifierWrapper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.r

import org.apache.hadoop.fs.Path
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.JsonMethods._

import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter}
import org.apache.spark.sql.{DataFrame, Dataset}

private[r] class MultilayerPerceptronClassifierWrapper private (
    val pipeline: PipelineModel,
    val labelCount: Long,
    val layers: Array[Int],
    val weights: Array[Double]
  ) extends MLWritable {

  def transform(dataset: Dataset[_]): DataFrame = {
    pipeline.transform(dataset)
  }

  override def read: MLReader[MultilayerPerceptronClassifierWrapper] =
    new MultilayerPerceptronClassifierWrapperReader

  override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path)

  class MultilayerPerceptronClassifierWrapperReader
    extends MLReader[MultilayerPerceptronClassifierWrapper] {

    override def load(path: String): MultilayerPerceptronClassifierWrapper = {
      implicit val format = DefaultFormats
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadataStr = sc.textFile(rMetadataPath, 1).first()
      val rMetadata = parse(rMetadataStr)
      val labelCount = (rMetadata \ "labelCount").extract[Long]
      val layers = (rMetadata \ "layers").extract[Array[Int]]
      val weights = (rMetadata \ "weights").extract[Array[Double]]

      val pipeline = PipelineModel.load(pipelinePath)
      new MultilayerPerceptronClassifierWrapper(pipeline, labelCount, layers, weights)
    }
  }

  class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper)
    extends MLWriter {

    override protected def saveImpl(path: String): Unit = {
      val rMetadataPath = new Path(path, "rMetadata").toString
      val pipelinePath = new Path(path, "pipeline").toString

      val rMetadata = ("class" -> instance.getClass.getName) ~
        ("labelCount" -> instance.labelCount) ~
        ("layers" -> instance.layers.toSeq) ~
        ("weights" -> instance.weights.toArray.toSeq)
      val rMetadataJson: String = compact(render(rMetadata))
      sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath)

      instance.pipeline.save(pipelinePath)
    }
  }
}
Example 2
Source File: DCT.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 3
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution

import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV}

import org.apache.spark.annotation.{DeveloperApi, Since}
import org.apache.spark.ml.impl.Utils
import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors}

  private def calculateCovarianceConstants: (BDM[Double], Double) = {
    val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t

    // For numerical stability, values are considered to be non-zero only if they exceed tol.
    // This prevents any inverted value from exceeding (eps * n * max(d))^-1
    val tol = Utils.EPSILON * max(d) * d.length

    try {
      // log(pseudo-determinant) is sum of the logs of all non-zero singular values
      val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum

      // calculate the root-pseudo-inverse of the diagonal matrix of singular values
      // by inverting the square root of all non-zero values
      val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray))

      (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma))
    } catch {
      case uex: UnsupportedOperationException =>
        throw new IllegalArgumentException("Covariance matrix has no non-zero singular values")
    }
  }
}
Example 4
Source File: MultivariateGaussianSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.stat.distribution import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.util.TestingUtils._ class MultivariateGaussianSuite extends SparkMLFunSuite { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
Example 5
Source File: AFTSurvivalRegressionExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.AFTSurvivalRegression
// $example off$
import org.apache.spark.sql.SparkSession

object AFTSurvivalRegressionExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("AFTSurvivalRegressionExample")
      .getOrCreate()

    // $example on$
    val training = spark.createDataFrame(Seq(
      (1.218, 1.0, Vectors.dense(1.560, -0.605)),
      (2.949, 0.0, Vectors.dense(0.346, 2.158)),
      (3.627, 0.0, Vectors.dense(1.380, 0.231)),
      (0.273, 1.0, Vectors.dense(0.520, 1.151)),
      (4.199, 0.0, Vectors.dense(0.795, -0.226))
    )).toDF("label", "censor", "features")
    val quantileProbabilities = Array(0.3, 0.6)
    val aft = new AFTSurvivalRegression()
      .setQuantileProbabilities(quantileProbabilities)
      .setQuantilesCol("quantiles")

    val model = aft.fit(training)

    // Print the coefficients, intercept and scale parameter for AFT survival regression
    println(s"Coefficients: ${model.coefficients}")
    println(s"Intercept: ${model.intercept}")
    println(s"Scale: ${model.scale}")
    model.transform(training).show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 6
Source File: NormalizerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object NormalizerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("NormalizerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.5, -1.0)),
      (1, Vectors.dense(2.0, 1.0, 1.0)),
      (2, Vectors.dense(4.0, 10.0, 2.0))
    )).toDF("id", "features")

    // Normalize each Vector using $L^1$ norm.
    val normalizer = new Normalizer()
      .setInputCol("features")
      .setOutputCol("normFeatures")
      .setP(1.0)

    val l1NormData = normalizer.transform(dataFrame)
    println("Normalized using L^1 norm")
    l1NormData.show()

    // Normalize each Vector using $L^\infty$ norm.
    val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity)
    println("Normalized using L^inf norm")
    lInfNormData.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 7
Source File: VectorSlicerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import java.util.Arrays

import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute}
import org.apache.spark.ml.feature.VectorSlicer
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
// $example off$
import org.apache.spark.sql.SparkSession

object VectorSlicerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorSlicerExample")
      .getOrCreate()

    // $example on$
    val data = Arrays.asList(
      Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))),
      Row(Vectors.dense(-2.0, 2.3, 0.0))
    )

    val defaultAttr = NumericAttribute.defaultAttr
    val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName)
    val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]])

    val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField())))

    val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features")

    slicer.setIndices(Array(1)).setNames(Array("f3"))
    // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3"))

    val output = slicer.transform(dataset)
    output.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 8
Source File: ChiSqSelectorExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ChiSqSelector
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ChiSqSelectorExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("ChiSqSelectorExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0),
      (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0),
      (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0)
    )

    val df = spark.createDataset(data).toDF("id", "features", "clicked")

    val selector = new ChiSqSelector()
      .setNumTopFeatures(1)
      .setFeaturesCol("features")
      .setLabelCol("clicked")
      .setOutputCol("selectedFeatures")

    val result = selector.fit(df).transform(df)

    println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected")
    result.show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 9
Source File: DCTExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object DCTExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DCTExample")
      .getOrCreate()

    // $example on$
    val data = Seq(
      Vectors.dense(0.0, 1.0, -2.0, 3.0),
      Vectors.dense(-1.0, 2.0, 4.0, -7.0),
      Vectors.dense(14.0, -2.0, -5.0, 1.0))

    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val dct = new DCT()
      .setInputCol("features")
      .setOutputCol("featuresDCT")
      .setInverse(false)

    val dctDf = dct.transform(df)
    dctDf.select("featuresDCT").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 10
Source File: VectorAssemblerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object VectorAssemblerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("VectorAssemblerExample")
      .getOrCreate()

    // $example on$
    val dataset = spark.createDataFrame(
      Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0))
    ).toDF("id", "hour", "mobile", "userFeatures", "clicked")

    val assembler = new VectorAssembler()
      .setInputCols(Array("hour", "mobile", "userFeatures"))
      .setOutputCol("features")

    val output = assembler.transform(dataset)
    println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
    output.select("features", "clicked").show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 11
Source File: PCAExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PCAExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PCAExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))),
      Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0),
      Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val pca = new PCA()
      .setInputCol("features")
      .setOutputCol("pcaFeatures")
      .setK(3)
      .fit(df)

    val result = pca.transform(df).select("pcaFeatures")
    result.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 12
Source File: ElementwiseProductExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object ElementwiseProductExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ElementwiseProductExample")
      .getOrCreate()

    // $example on$
    // Create some vector data; also works for sparse vectors
    val dataFrame = spark.createDataFrame(Seq(
      ("a", Vectors.dense(1.0, 2.0, 3.0)),
      ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector")

    val transformingVector = Vectors.dense(0.0, 1.0, 2.0)
    val transformer = new ElementwiseProduct()
      .setScalingVec(transformingVector)
      .setInputCol("vector")
      .setOutputCol("transformedVector")

    // Batch transform the vectors to create new column:
    transformer.transform(dataFrame).show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 13
Source File: MinMaxScalerExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MinMaxScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MinMaxScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -1.0)),
      (1, Vectors.dense(2.0, 1.1, 1.0)),
      (2, Vectors.dense(3.0, 10.1, 3.0))
    )).toDF("id", "features")

    val scaler = new MinMaxScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MinMaxScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [min, max].
    val scaledData = scalerModel.transform(dataFrame)
    println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]")
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 14
Source File: PolynomialExpansionExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.PolynomialExpansion
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object PolynomialExpansionExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("PolynomialExpansionExample")
      .getOrCreate()

    // $example on$
    val data = Array(
      Vectors.dense(2.0, 1.0),
      Vectors.dense(0.0, 0.0),
      Vectors.dense(3.0, -1.0)
    )
    val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features")

    val polyExpansion = new PolynomialExpansion()
      .setInputCol("features")
      .setOutputCol("polyFeatures")
      .setDegree(3)

    val polyDF = polyExpansion.transform(df)
    polyDF.show(false)
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 15
Source File: MaxAbsScalerExample.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.MaxAbsScaler
import org.apache.spark.ml.linalg.Vectors
// $example off$
import org.apache.spark.sql.SparkSession

object MaxAbsScalerExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("MaxAbsScalerExample")
      .getOrCreate()

    // $example on$
    val dataFrame = spark.createDataFrame(Seq(
      (0, Vectors.dense(1.0, 0.1, -8.0)),
      (1, Vectors.dense(2.0, 1.0, -4.0)),
      (2, Vectors.dense(4.0, 10.0, 8.0))
    )).toDF("id", "features")

    val scaler = new MaxAbsScaler()
      .setInputCol("features")
      .setOutputCol("scaledFeatures")

    // Compute summary statistics and generate MaxAbsScalerModel
    val scalerModel = scaler.fit(dataFrame)

    // rescale each feature to range [-1, 1]
    val scaledData = scalerModel.transform(dataFrame)
    scaledData.select("features", "scaledFeatures").show()
    // $example off$

    spark.stop()
  }
}
Example 16
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 17
Source File: MaxAbsScalerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 18
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { import testImplicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(8.0), Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(8.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val selector = new ChiSqSelector() .setSelectorType("kbest") .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } val preFilteredData2 = Seq( Vectors.dense(8.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), Vectors.dense(8.0, 9.0) ) val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } test("should support all NumericType labels and not support other types") { val css = new ChiSqSelector() MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector]( css, spark) { (expected, actual) => assert(expected.selectedFeatures === actual.selectedFeatures) } } }
Example 19
Source File: DCTSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 20
Source File: ElementwiseProductSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ElementwiseProductSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  test("read/write") {
    val ep = new ElementwiseProduct()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setScalingVec(Vectors.dense(0.1, 0.2))
    testDefaultReadWrite(ep)
  }
}
Example 21
Source File: BinarizerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 22
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 23
Source File: BinaryClassificationEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.evaluation import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext class BinaryClassificationEvaluatorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new BinaryClassificationEvaluator) } test("read/write") { val evaluator = new BinaryClassificationEvaluator() .setRawPredictionCol("myRawPrediction") .setLabelCol("myLabel") .setMetricName("areaUnderPR") testDefaultReadWrite(evaluator) } test("should accept both vector and double raw prediction col") { val evaluator = new BinaryClassificationEvaluator() .setMetricName("areaUnderPR") val vectorDF = Seq( (0d, Vectors.dense(12, 2.5)), (1d, Vectors.dense(1, 3)), (0d, Vectors.dense(10, 2)) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(vectorDF) === 1.0) val doubleDF = Seq( (0d, 0d), (1d, 1d), (0d, 0d) ).toDF("label", "rawPrediction") assert(evaluator.evaluate(doubleDF) === 1.0) val stringDF = Seq( (0d, "0d"), (1d, "1d"), (0d, "0d") ).toDF("label", "rawPrediction") val thrown = intercept[IllegalArgumentException] { evaluator.evaluate(stringDF) } assert(thrown.getMessage.replace("\n", "") contains "Column rawPrediction must be of type " + "equal to one of the following types: [DoubleType, ") assert(thrown.getMessage.replace("\n", "") contains "but was actually of type StringType.") } test("should support all NumericType labels and not support other types") { val evaluator = new BinaryClassificationEvaluator().setRawPredictionCol("prediction") MLTestingUtils.checkNumericTypes(evaluator, spark) } }
Example 24
Source File: MLSerDeSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.python import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, SparseMatrix, Vectors} class MLSerDeSuite extends SparkFunSuite { MLSerDe.initialize() test("pickle vector") { val vectors = Seq( Vectors.dense(Array.empty[Double]), Vectors.dense(0.0), Vectors.dense(0.0, -2.0), Vectors.sparse(0, Array.empty[Int], Array.empty[Double]), Vectors.sparse(1, Array.empty[Int], Array.empty[Double]), Vectors.sparse(2, Array(1), Array(-2.0))) vectors.foreach { v => val u = MLSerDe.loads(MLSerDe.dumps(v)) assert(u.getClass === v.getClass) assert(u === v) } } test("pickle double") { for (x <- List(123.0, -10.0, 0.0, Double.MaxValue, Double.MinValue, Double.NaN)) { val deser = MLSerDe.loads(MLSerDe.dumps(x.asInstanceOf[AnyRef])).asInstanceOf[Double] // We use `equals` here for comparison because we cannot use `==` for NaN assert(x.equals(deser)) } } test("pickle matrix") { val values = Array[Double](0, 1.2, 3, 4.56, 7, 8) val matrix = Matrices.dense(2, 3, values) val nm = MLSerDe.loads(MLSerDe.dumps(matrix)).asInstanceOf[DenseMatrix] assert(matrix === nm) // Test conversion for empty matrix val empty = Array.empty[Double] val emptyMatrix = Matrices.dense(0, 0, empty) val ne = MLSerDe.loads(MLSerDe.dumps(emptyMatrix)).asInstanceOf[DenseMatrix] assert(emptyMatrix == ne) val sm = new SparseMatrix(3, 2, Array(0, 1, 3), Array(1, 0, 2), Array(0.9, 1.2, 3.4)) val nsm = MLSerDe.loads(MLSerDe.dumps(sm)).asInstanceOf[SparseMatrix] assert(sm.toArray === nsm.toArray) val smt = new SparseMatrix( 3, 3, Array(0, 2, 3, 5), Array(0, 2, 1, 0, 2), Array(0.9, 1.2, 3.4, 5.7, 8.9), isTransposed = true) val nsmt = MLSerDe.loads(MLSerDe.dumps(smt)).asInstanceOf[SparseMatrix] assert(smt.toArray === nsmt.toArray) } }
Example 25
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 26
Source File: ProbabilisticClassifierSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(values: Double*): Double = { predict(Vectors.dense(values.toArray)) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.5, 0.2)) assert(testModel.friendlyPredict(1.0, 1.0) === 1.0) assert(testModel.friendlyPredict(1.0, 0.2) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(1.0, 2.0) === 1.0) } test("test tiebreak") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.4, 0.4)) assert(testModel.friendlyPredict(0.6, 0.6) === 0.0) } test("test one zero threshold") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.0, 0.1)) assert(testModel.friendlyPredict(1.0, 10.0) === 0.0) assert(testModel.friendlyPredict(0.0, 10.0) === 1.0) } test("bad thresholds") { intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0)) } intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
Example 27
Source File: ANNSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.ann import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext class ANNSuite extends SparkFunSuite with MLlibTestSparkContext { // TODO: test for weights comparison with Weka MLP test("ANN with Sigmoid learns XOR function with LBFGS optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array(0.0, 1.0, 1.0, 0.0) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 1) trainer.setWeights(initialWeights) trainer.LBFGSOptimizer.setNumIterations(20) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input)(0), label(0)) }.collect() predictionAndLabels.foreach { case (p, l) => assert(math.round(p) === l) } } test("ANN with SoftMax learns XOR function with 2-bit output and batch GD optimizer") { val inputs = Array( Array(0.0, 0.0), Array(0.0, 1.0), Array(1.0, 0.0), Array(1.0, 1.0) ) val outputs = Array( Array(1.0, 0.0), Array(0.0, 1.0), Array(0.0, 1.0), Array(1.0, 0.0) ) val data = inputs.zip(outputs).map { case (features, label) => (Vectors.dense(features), Vectors.dense(label)) } val rddData = sc.parallelize(data, 1) val hiddenLayersTopology = Array(5) val dataSample = rddData.first() val layerSizes = dataSample._1.size +: hiddenLayersTopology :+ dataSample._2.size val topology = FeedForwardTopology.multiLayerPerceptron(layerSizes, false) val initialWeights = FeedForwardModel(topology, 23124).weights val trainer = new FeedForwardTrainer(topology, 2, 2) // TODO: add a test for SGD trainer.LBFGSOptimizer.setConvergenceTol(1e-4).setNumIterations(20) trainer.setWeights(initialWeights).setStackSize(1) val model = trainer.train(rddData) val predictionAndLabels = rddData.map { case (input, label) => (model.predict(input), label) }.collect() predictionAndLabels.foreach { case (p, l) => assert(p ~== l absTol 0.5) } } }
Example 28
Source File: GradientSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.ann import breeze.linalg.{DenseMatrix => BDM} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.util.MLlibTestSparkContext class GradientSuite extends SparkFunSuite with MLlibTestSparkContext { test("Gradient computation against numerical differentiation") { val input = new BDM[Double](3, 1, Array(1.0, 1.0, 1.0)) // output must contain zeros and one 1 for SoftMax val target = new BDM[Double](2, 1, Array(0.0, 1.0)) val topology = FeedForwardTopology.multiLayerPerceptron(Array(3, 4, 2), softmaxOnTop = false) val layersWithErrors = Seq( new SigmoidLayerWithSquaredError(), new SoftmaxLayerWithCrossEntropyLoss() ) // check all layers that provide loss computation // 1) compute loss and gradient given the model and initial weights // 2) modify weights with small number epsilon (per dimension i) // 3) compute new loss // 4) ((newLoss - loss) / epsilon) should be close to the i-th component of the gradient for (layerWithError <- layersWithErrors) { topology.layers(topology.layers.length - 1) = layerWithError val model = topology.model(seed = 12L) val weights = model.weights.toArray val numWeights = weights.size val gradient = Vectors.dense(Array.fill[Double](numWeights)(0.0)) val loss = model.computeGradient(input, target, gradient, 1) val eps = 1e-4 var i = 0 val tol = 1e-4 while (i < numWeights) { val originalValue = weights(i) weights(i) += eps val newModel = topology.model(Vectors.dense(weights)) val newLoss = computeLoss(input, target, newModel) val derivativeEstimate = (newLoss - loss) / eps assert(math.abs(gradient(i) - derivativeEstimate) < tol, "Layer failed gradient check: " + layerWithError.getClass) weights(i) = originalValue i += 1 } } } private def computeLoss(input: BDM[Double], target: BDM[Double], model: TopologyModel): Double = { val outputs = model.forward(input) model.layerModels.last match { case layerWithLoss: LossFunction => layerWithLoss.loss(outputs.last, target, new BDM[Double](target.rows, target.cols)) case _ => throw new UnsupportedOperationException("Top layer is required to have loss." + " Failed layer:" + model.layerModels.last.getClass) } } }
Example 29
Source File: LocalWord2VecModel.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.Word2VecModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.feature.{Word2VecModel => OldWord2VecModel} class LocalWord2VecModel(override val sparkTransformer: Word2VecModel) extends LocalTransformer[Word2VecModel] { lazy val parent: OldWord2VecModel = { val field = sparkTransformer.getClass.getDeclaredField( "org$apache$spark$ml$feature$Word2VecModel$$wordVectors" ) field.setAccessible(true) field.get(sparkTransformer).asInstanceOf[OldWord2VecModel] } private def axpy(a: Double, x: Array[Double], y: Array[Double]) = { y.zipWithIndex.foreach { case (value, index) => y.update(index, x(index) * a + value) } } private def scal(a: Double, v: Array[Double]) = { v.zipWithIndex.foreach { case (value, index) => v.update(index, value * a) } } override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val data = column.data.map(_.asInstanceOf[List[String]]).map { vec => if (vec.isEmpty) { Array .fill(sparkTransformer.getVectorSize)(0.0) .toList } else { val vectors = parent.getVectors .mapValues(v => Vectors.dense(v.map(_.toDouble))) val sum = Array.fill(sparkTransformer.getVectorSize)(0.0) vec.foreach { word => vectors.get(word).foreach { vec => axpy(1.0, vec.toDense.values, sum) } } scal(1.0 / vec.length, sum) sum.toList } } val newColumn = LocalDataColumn(sparkTransformer.getOutputCol, data) localData.withColumn(newColumn) case None => localData } } } object LocalWord2VecModel extends SimpleModelLoader[Word2VecModel] with TypedTransformerConverter[Word2VecModel] { override def build(metadata: Metadata, data: LocalData): Word2VecModel = { val wordVectors = data.column("wordVectors").get.data.head.asInstanceOf[Seq[Float]].toArray val wordIndex = data.column("wordIndex").get.data.head.asInstanceOf[Map[String, Int]] val oldCtor = classOf[OldWord2VecModel].getConstructor(classOf[Map[String, Int]], classOf[Array[Float]]) oldCtor.setAccessible(true) val oldWord2VecModel = oldCtor.newInstance(wordIndex, wordVectors) val ctor = classOf[Word2VecModel].getConstructor(classOf[String], classOf[OldWord2VecModel]) ctor.setAccessible(true) val inst = ctor .newInstance(metadata.uid, oldWord2VecModel) .setInputCol(metadata.paramMap("inputCol").toString) .setOutputCol(metadata.paramMap("outputCol").toString) inst .set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue()) .set(inst.seed, metadata.paramMap("seed").toString.toLong) .set(inst.numPartitions, metadata.paramMap("numPartitions").asInstanceOf[Number].intValue()) .set(inst.stepSize, metadata.paramMap("stepSize").asInstanceOf[Double]) .set( inst.maxSentenceLength, metadata.paramMap("maxSentenceLength").asInstanceOf[Number].intValue() ) .set(inst.windowSize, metadata.paramMap("windowSize").asInstanceOf[Number].intValue()) .set(inst.vectorSize, metadata.paramMap("vectorSize").asInstanceOf[Number].intValue()) } override implicit def toLocal(transformer: Word2VecModel): LocalTransformer[Word2VecModel] = new LocalWord2VecModel(transformer) }
Example 30
Source File: LocalModelSpec22.scala From spark-ml-serving with Apache License 2.0
package io.hydrosphere.spark_ml_serving import org.apache.spark.ml.classification._ import org.apache.spark.ml.feature._ import org.apache.spark.ml.linalg.Vectors class LocalModelSpec22 extends GenericTestSpec { modelTest( data = session.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0) )).toDF("id", "text", "label"), steps = Seq( new Tokenizer().setInputCol("text").setOutputCol("words"), new HashingTF().setNumFeatures(1000).setInputCol("words").setOutputCol("features"), new LogisticRegression().setMaxIter(10).setRegParam(0.01) ), columns = Seq( "prediction" ) ) modelTest( data = session.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text"), steps = Seq( new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) ), columns = Seq( "result" ) ) modelTest( data = session.createDataFrame(Seq( (Vectors.dense(4.0, 0.2, 3.0, 4.0, 5.0), 1.0), (Vectors.dense(3.0, 0.3, 1.0, 4.1, 5.0), 1.0), (Vectors.dense(2.0, 0.5, 3.2, 4.0, 5.0), 1.0), (Vectors.dense(5.0, 0.7, 1.5, 4.0, 5.0), 1.0), (Vectors.dense(1.0, 0.1, 7.0, 4.0, 5.0), 0.0), (Vectors.dense(8.0, 0.3, 5.0, 1.0, 7.0), 0.0) )).toDF("features", "label"), steps = Seq( new LinearSVC() .setMaxIter(10) .setRegParam(0.1) ), columns = Seq( "prediction" ) ) modelTest( data = session.createDataFrame(Seq( (1.0, Double.NaN), (2.0, Double.NaN), (Double.NaN, 3.0), (4.0, 4.0), (5.0, 5.0) )).toDF("a", "b"), steps = Seq( new Imputer() .setInputCols(Array("a", "b")) .setOutputCols(Array("out_a", "out_b")) ), columns = Seq("out_a", "out_b") ) }
Example 32
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import java.lang.Boolean import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors} class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel) extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {} object LocalLogisticRegressionModel extends SimpleModelLoader[LogisticRegressionModel] with TypedTransformerConverter[LogisticRegressionModel] { override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = { val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor( classOf[String], classOf[Matrix], classOf[Vector], classOf[Int], java.lang.Boolean.TYPE ) constructor.setAccessible(true) val coefficientMatrixParams = data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]] val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams) val interceptVectorParams = data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]] val interceptVector = DataUtils.constructVector(interceptVectorParams) constructor .newInstance( metadata.uid, coefficientMatrix, interceptVector, data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer], data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean] ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double]) } override implicit def toLocal( transformer: LogisticRegressionModel ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer) }
Example 33
Source File: LocalCountVectorizerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.CountVectorizerModel import org.apache.spark.ml.linalg.Vectors import scala.collection.mutable class LocalCountVectorizerModel(override val sparkTransformer: CountVectorizerModel) extends LocalTransformer[CountVectorizerModel] { override def transform(localData: LocalData): LocalData = { import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ val dict = sparkTransformer.vocabulary.zipWithIndex.toMap val minTf = sparkTransformer.getMinTF localData.column(sparkTransformer.getInputCol) match { case Some(column) => val newCol = column.data.map(_.asInstanceOf[List[String]]).map { arr => val termCounts = mutable.HashMap.empty[Int, Double] var tokenCount = 0L arr.foreach { token => dict.get(token) foreach { index => val storedValue = termCounts.getOrElseUpdate(index, 0.0) termCounts.update(index, storedValue + 1.0) } tokenCount += 1 } val eTF = if (minTf >= 1.0) minTf else tokenCount * minTf val eCounts = if (sparkTransformer.getBinary) { termCounts filter (_._2 >= eTF) map (_._1 -> 1.0) toSeq } else { termCounts filter (_._2 >= eTF) toSeq } Vectors.sparse(dict.size, eCounts.toList).toList } localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newCol)) case None => localData } } } object LocalCountVectorizerModel extends SimpleModelLoader[CountVectorizerModel] with TypedTransformerConverter[CountVectorizerModel] { override def build(metadata: Metadata, data: LocalData): CountVectorizerModel = { val vocabulary = data.column("vocabulary").get.data.head.asInstanceOf[Seq[String]].toArray val inst = new CountVectorizerModel(metadata.uid, vocabulary) inst .setInputCol(metadata.paramMap("inputCol").toString) .setOutputCol(metadata.paramMap("outputCol").toString) .set(inst.binary, metadata.paramMap("binary").asInstanceOf[Boolean]) .set(inst.minDF, metadata.paramMap("minDF").toString.toDouble) .set(inst.minTF, metadata.paramMap("minTF").toString.toDouble) .set(inst.vocabSize, metadata.paramMap("vocabSize").asInstanceOf[Number].intValue()) } override implicit def toLocal( transformer: CountVectorizerModel ) = new LocalCountVectorizerModel(transformer) }
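The heart of the transform above is plain term-frequency counting against a fixed vocabulary, before the minTF/binary filtering is applied. A reduced sketch of the counting step with `ml.linalg.Vectors`; the vocabulary below is purely illustrative.

import org.apache.spark.ml.linalg.{Vector, Vectors}
import scala.collection.mutable

object CountVectorizeSketch {
  // Count tokens that appear in the vocabulary and emit a sparse TF vector.
  def termFrequencies(tokens: Seq[String], vocabulary: Array[String]): Vector = {
    val dict = vocabulary.zipWithIndex.toMap
    val counts = mutable.HashMap.empty[Int, Double]
    tokens.foreach { t =>
      dict.get(t).foreach(i => counts(i) = counts.getOrElse(i, 0.0) + 1.0)
    }
    Vectors.sparse(vocabulary.length, counts.toSeq)
  }

  def main(args: Array[String]): Unit = {
    val vocab = Array("spark", "hadoop", "mapreduce")
    // (3,[0,1],[2.0,1.0]): "spark" twice, "hadoop" once, "flink" is out of vocabulary and ignored
    println(termFrequencies(Seq("spark", "hadoop", "spark", "flink"), vocab))
  }
}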
Example 34
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.PCAModel import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors} import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices} class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix] val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] { override def build(metadata: Metadata, data: LocalData): PCAModel = { val constructor = classOf[PCAModel].getDeclaredConstructor( classOf[String], classOf[DenseMatrix], classOf[DenseVector] ) constructor.setAccessible(true) val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]] val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix] data.column("explainedVariance") match { case Some(ev) => // NOTE: Spark >= 2 val evParams = ev.data.head.asInstanceOf[Map[String, Any]] val explainedVariance = DataUtils.constructVector(evParams).toDense constructor .newInstance(metadata.uid, pcMat, explainedVariance) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) case None => // NOTE: Spark < 2 constructor .newInstance( metadata.uid, pcMat, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] ) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } } override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer) }
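The PCA transform above is a matrix-vector product with the transposed principal-components matrix; the mllib conversion is only there to reuse its `multiply`. A minimal sketch of the same projection kept entirely in the `ml.linalg` API, using an illustrative 3x2 components matrix:

import org.apache.spark.ml.linalg.{Matrices, Vectors}

object PcaProjectionSketch {
  def main(args: Array[String]): Unit = {
    // pc is stored column-major: a projection from 3 features to 2 components (values are illustrative only)
    val pc = Matrices.dense(3, 2, Array(0.5, 0.5, 0.0, 0.0, 0.5, 0.5))
    val x  = Vectors.dense(1.0, 2.0, 3.0)
    // Same operation as the transformer above: y = pc^T * x
    val projected = pc.transpose.multiply(x)
    println(projected) // [1.5,2.5]
  }
}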
Example 35
Source File: LocalPolynomialExpansion.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalPolynomialExpansion(override val sparkTransformer: PolynomialExpansion) extends LocalTransformer[PolynomialExpansion] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val method = classOf[PolynomialExpansion].getMethod("createTransformFunc") val newData = column.data.map(r => { val row = r.asInstanceOf[List[Any]].map(_.toString.toDouble).toArray val vector: Vector = Vectors.dense(row) method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](vector).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPolynomialExpansion extends SimpleModelLoader[PolynomialExpansion] with TypedTransformerConverter[PolynomialExpansion] { override def build(metadata: Metadata, data: LocalData): PolynomialExpansion = { new PolynomialExpansion(metadata.uid) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setDegree(metadata.paramMap("degree").asInstanceOf[Number].intValue()) } override implicit def toLocal( transformer: PolynomialExpansion ) = new LocalPolynomialExpansion(transformer) }
Example 36
Source File: LocalMaxAbsScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel) extends LocalTransformer[MaxAbsScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val maxAbsUnzero = Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x)) val newData = column.data.map(r => { val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d") } val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray) DataUtils.fromBreeze(brz).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMaxAbsScalerModel extends SimpleModelLoader[MaxAbsScalerModel] with TypedTransformerConverter[MaxAbsScalerModel] { override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = { val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]] val maxAbs = DataUtils.constructVector(maxAbsParams) val constructor = classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector]) constructor.setAccessible(true) constructor .newInstance(metadata.uid, maxAbs) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: MaxAbsScalerModel ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer) }
Example 37
Source File: LocalMultilayerPerceptronClassificationModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalMultilayerPerceptronClassificationModel( override val sparkTransformer: MultilayerPerceptronClassificationModel ) extends LocalPredictionModel[MultilayerPerceptronClassificationModel] {} object LocalMultilayerPerceptronClassificationModel extends SimpleModelLoader[MultilayerPerceptronClassificationModel] with TypedTransformerConverter[MultilayerPerceptronClassificationModel] { override def build( metadata: Metadata, data: LocalData ): MultilayerPerceptronClassificationModel = { val layers = data.column("layers").get.data.head.asInstanceOf[Seq[Int]].toArray val weightsParam = data.column("weights").get.data.head.asInstanceOf[Map[String, Any]] val weights = DataUtils.constructVector(weightsParam) val constructor = classOf[MultilayerPerceptronClassificationModel].getDeclaredConstructor( classOf[String], classOf[Array[Int]], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance( metadata.uid, layers, weights ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) } override implicit def toLocal( sparkTransformer: MultilayerPerceptronClassificationModel ): LocalMultilayerPerceptronClassificationModel = { new LocalMultilayerPerceptronClassificationModel(sparkTransformer) } }
Example 38
Source File: LocalNaiveBayes.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.NaiveBayesModel import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors} class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel) extends LocalProbabilisticClassificationModel[NaiveBayesModel] {} object LocalNaiveBayes extends SimpleModelLoader[NaiveBayesModel] with TypedTransformerConverter[NaiveBayesModel] { override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = { val constructor = classOf[NaiveBayesModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Matrix] ) constructor.setAccessible(true) val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]] val matrix = DataUtils.constructMatrix(matrixMetadata) val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]] val piVec = DataUtils.constructVector(piParams) val nb = constructor .newInstance(metadata.uid, piVec, matrix) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue()) nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String]) nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String]) nb } override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = { new LocalNaiveBayes(sparkTransformer) } }
Example 39
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.spark.ml.linalg.{Vector, Vectors}

class UCB(
    override val surrogate: Surrogate,
    val beta: Double = 100) extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)
    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    if (s == 0) {
      // if std is zero, we have observed x on all instances;
      // with an RF surrogate, std should never be exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val ucb = m + beta * s
      (ucb, Vectors.dense(new Array[Double](X.size)))
    }
  }
}
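The acquisition value computed above is the standard upper confidence bound, ucb = mean + beta * stddev, returned together with a zero gradient placeholder. A tiny sketch of just that arithmetic, with made-up surrogate outputs:

object UcbSketch {
  // Upper confidence bound for a single candidate point.
  def ucb(mean: Double, variance: Double, beta: Double): Double =
    mean + beta * math.sqrt(variance)

  def main(args: Array[String]): Unit = {
    // e.g. a surrogate predicting mean 0.7 with variance 0.04, and beta = 100 as in the class above
    println(ucb(0.7, 0.04, beta = 100.0)) // 20.7
  }
}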
Example 40
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.linalg.{Vector, Vectors}

class EI(
    override val surrogate: Surrogate,
    val par: Double) extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)
    // Use the best observation seen so far as the incumbent
    val eta: Double = surrogate.curBest._2
    //println(s"best seen result: $eta")
    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")
    if (s == 0) {
      // if std is zero, we have observed x on all instances;
      // with an RF surrogate, std should never be exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val z = (pred._1 - eta - par) / s
      val norm: NormalDistribution = new NormalDistribution
      val cdf: Double = norm.cumulativeProbability(z)
      val pdf: Double = norm.density(z)
      val ei = s * (z * cdf + pdf)
      //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
      (ei, Vectors.dense(new Array[Double](X.size)))
    }
  }
}
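The value returned above is expected improvement, EI = s * (z * CDF(z) + PDF(z)) with z = (mean - best - par) / s, where `best` is the incumbent observation. A standalone sketch of that computation with commons-math3, on illustrative inputs:

import org.apache.commons.math3.distribution.NormalDistribution

object EiSketch {
  // Expected improvement as computed in the class above:
  // z = (mean - best - par) / std, EI = std * (z * CDF(z) + PDF(z)); zero std yields no improvement.
  def ei(mean: Double, variance: Double, best: Double, par: Double): Double = {
    val s = math.sqrt(variance)
    if (s == 0.0) 0.0
    else {
      val z = (mean - best - par) / s
      val norm = new NormalDistribution()
      s * (z * norm.cumulativeProbability(z) + norm.density(z))
    }
  }

  def main(args: Array[String]): Unit = {
    // surrogate predicts mean 0.8 and variance 0.09; best seen value 0.7, no exploration margin
    println(ei(mean = 0.8, variance = 0.09, best = 0.7, par = 0.0))
  }
}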
Example 41
Source File: Describe.scala From Scala-Machine-Learning-Projects with MIT License | 5 votes |
package com.packt.ScalaML.ChrunPrediction import org.apache.spark._ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ import org.apache.spark.ml.classification.{ BinaryLogisticRegressionSummary, LogisticRegression, LogisticRegressionModel } import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.max import org.apache.spark.ml.Pipeline import org.apache.spark.ml.tuning.{ ParamGridBuilder, CrossValidator } import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql._ import org.apache.spark.sql.Dataset import org.apache.spark.ml.linalg.{ Matrix, Vectors } import org.apache.spark.ml.stat.Correlation import org.apache.spark.sql.Row object Describe { case class CustomerAccount(state_code: String, account_length: Integer, area_code: String, international_plan: String, voice_mail_plan: String, num_voice_mail: Double, total_day_mins: Double, total_day_calls: Double, total_day_charge: Double, total_evening_mins: Double, total_evening_calls: Double, total_evening_charge: Double, total_night_mins: Double, total_night_calls: Double, total_night_charge: Double, total_international_mins: Double, total_international_calls: Double, total_international_charge: Double, total_international_num_calls: Double, churn: String) val schema = StructType(Array( StructField("state_code", StringType, true), StructField("account_length", IntegerType, true), StructField("area_code", StringType, true), StructField("international_plan", StringType, true), StructField("voice_mail_plan", StringType, true), StructField("num_voice_mail", DoubleType, true), StructField("total_day_mins", DoubleType, true), StructField("total_day_calls", DoubleType, true), StructField("total_day_charge", DoubleType, true), StructField("total_evening_mins", DoubleType, true), StructField("total_evening_calls", DoubleType, true), StructField("total_evening_charge", DoubleType, true), StructField("total_night_mins", DoubleType, true), StructField("total_night_calls", DoubleType, true), StructField("total_night_charge", DoubleType, true), StructField("total_international_mins", DoubleType, true), StructField("total_international_calls", DoubleType, true), StructField("total_international_charge", DoubleType, true), StructField("total_international_num_calls", DoubleType, true), StructField("churn", StringType, true))) def main(args: Array[String]) { val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName("Desribe") .getOrCreate() spark.conf.set("spark.debug.maxToStringFields", 10000) val DEFAULT_MAX_TO_STRING_FIELDS = 2500 if (SparkEnv.get != null) { SparkEnv.get.conf.getInt("spark.debug.maxToStringFields", DEFAULT_MAX_TO_STRING_FIELDS) } else { DEFAULT_MAX_TO_STRING_FIELDS } import spark.implicits._ val trainSet: Dataset[CustomerAccount] = spark.read. 
option("inferSchema", "false") .format("com.databricks.spark.csv") .schema(schema) .load("data/churn-bigml-80.csv") .as[CustomerAccount] val statsDF = trainSet.describe() statsDF.show() trainSet.createOrReplaceTempView("UserAccount") spark.catalog.cacheTable("UserAccount") spark.sqlContext.sql("SELECT churn, SUM(total_day_mins) + SUM(total_evening_mins) + SUM(total_night_mins) + SUM(total_international_mins) as Total_minutes FROM UserAccount GROUP BY churn").show() spark.sqlContext.sql("SELECT churn, SUM(total_day_charge) as TDC, SUM(total_evening_charge) as TEC, SUM(total_night_charge) as TNC, SUM(total_international_charge) as TIC, SUM(total_day_charge) + SUM(total_evening_charge) + SUM(total_night_charge) + SUM(total_international_charge) as Total_charge FROM UserAccount GROUP BY churn ORDER BY Total_charge DESC").show() trainSet.groupBy("churn").count.show() spark.sqlContext.sql("SELECT churn,SUM(total_international_num_calls) FROM UserAccount GROUP BY churn") } }
Example 42
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.Estimator import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.DecisionTreeRegressor import org.apache.spark.mllib.tree.DecisionTreeSuite import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.DataFrame private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val distribTree = setParams(new DecisionTreeRegressor(), testParams) val localTree = setParams(new LocalDecisionTreeRegressor(), testParams) val localModel = localTree.fit(train) val model = distribTree.fit(train) OptimizedTreeTests.checkEqual(model, localModel) } test("Local & distributed training produce the same tree on a toy dataset") { val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a slightly larger toy dataset") { val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce the same tree on a larger toy dataset") { val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce same tree on a dataset of categorical features") { val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances()) // Create a map of categorical feature index to arity; each feature has arity nclasses val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3) // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its // categorical features val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a dataset of continuous features") { val sqlContext = spark.sqlContext import sqlContext.implicits._ // Use maxDepth = 5 and default params val params = medDepthTreeSettings val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext, nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2) .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray))) .toDF().cache() testEquivalence(data, params) } test("Local & distributed training produce the same tree on a dataset of constant features") { // Generate constant, continuous data val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } }
Example 43
Source File: LocalTreeUnitSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tree._ import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext def deepTreeTest(depth: Int): Unit = { val deepTreeData = OptimizedTreeTests.deepTreeData(sc, depth) val df = spark.createDataFrame(deepTreeData) // Construct estimators; single-tree random forest & decision tree regressor. val localTree = new LocalDecisionTreeRegressor() .setFeaturesCol("features") // indexedFeatures .setLabelCol("label") .setMaxDepth(depth) .setMinInfoGain(0.0) // Fit model, check depth... val localModel = localTree.fit(df) assert(localModel.rootNode.subtreeDepth == depth) } // Test small depth tree deepTreeTest(10) // Test medium depth tree deepTreeTest(40) // Test high depth tree deepTreeTest(200) } }
Example 44
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.tree.impl import org.apache.spark.SparkFunSuite import org.apache.spark.ml.Estimator import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier} import org.apache.spark.ml.feature.{Instance, LabeledPoint} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor} import org.apache.spark.mllib.tree.DecisionTreeSuite import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext} import org.apache.spark.sql.DataFrame private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val oldTree = setParams(new DecisionTreeRegressor(), testParams) val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams) val newModel = newTree.fit(train) val oldModel = oldTree.fit(train) OptimizedTreeTests.checkEqual(oldModel, newModel) } private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = { val oldTree = setParams(new DecisionTreeClassifier(), testParams) val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams) val newModel = newTree.fit(train) val model = oldTree.fit(train) OptimizedTreeTests.checkEqual(model, newModel) } test("Local & distributed training produce the same tree on a toy dataset") { val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree with two feature values") { val data = sc.parallelize(Range(0, 8).map(x => { if (x > 3) { Instance(x, 1.0, Vectors.dense(0.0)) } else { Instance(x, 1.0, Vectors.dense(1.0)) }})) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a slightly larger toy dataset") { val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce the same tree on a larger toy dataset") { val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x)))) val df = spark.createDataFrame(data) testEquivalence(df, medDepthTreeSettings) } test("Local & distributed training produce same tree on a dataset of categorical features") { val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances()) // Create a map of categorical feature index to arity; each feature has arity nclasses val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3) // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its // categorical features val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2) testEquivalence(df, OptimizedTreeTests.allParamSettings) } test("Local & distributed training produce the same tree on a dataset of continuous features") { val sqlContext = spark.sqlContext import sqlContext.implicits._ // Use maxDepth = 5 and default params val params = medDepthTreeSettings val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext, nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2) .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray))) 
.toDF().cache() testEquivalence(data, params) } test("Local & distributed training produce the same tree on a dataset of constant features") { // Generate constant, continuous data val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1)))) val df = spark.createDataFrame(data) testEquivalence(df, OptimizedTreeTests.allParamSettings) } }
Example 45
Source File: VSoftmaxRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import scala.language.existentials class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ private val seed = 42 @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { super.beforeAll() multinomialDataset = { val nPoints = 50 val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) df.cache() println("softmax test data:") df.show(10, false) df } } test("test on multinomialDataset") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (standardization <- Seq(false, true)) { for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) { println() println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val trainer = new LogisticRegression() .setFamily("multinomial") .setStandardization(standardization) .setWeightCol("weight") .setRegParam(reg) .setFitIntercept(false) .setElasticNetParam(elasticNet) val model = trainer.fit(multinomialDataset) val vtrainer = new VSoftmaxRegression() .setColsPerBlock(2) .setRowsPerBlock(5) .setColPartitions(2) .setRowPartitions(3) .setWeightCol("weight") .setGeneratingFeatureMatrixBuffer(2) .setStandardization(standardization) .setRegParam(reg) .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(multinomialDataset) println(s"VSoftmaxRegression coefficientMatrix:\n" + s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" + s"ml.SoftmaxRegression coefficientMatrix:\n" + s"${model.coefficientMatrix}\n") assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps) } } } }
Example 46
Source File: DistributedVectorSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg.distributed import breeze.linalg.{DenseVector => BDV, norm => Bnorm} import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.ml.util.VUtils import org.apache.spark.mllib.util.MLlibTestSparkContext class DistributedVectorSuite extends SparkFunSuite with MLlibTestSparkContext { var BV1: BDV[Double] = null var BV2: BDV[Double] = null var BV3: BDV[Double] = null var BV4: BDV[Double] = null var DV1: DistributedVector = null var DV2: DistributedVector = null var DV3: DistributedVector = null var DV4: DistributedVector = null override def beforeAll(): Unit = { super.beforeAll() val v1: Array[Double] = Seq(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0).toArray val v2: Array[Double] = Seq(-1.0, -2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0).toArray val v3: Array[Double] = Seq(-1.0, -2.0, -3.0, 5.0, -6.0, 7.0, 8.0, 9.0).toArray val v4: Array[Double] = Seq(0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 8.0, 0.0).toArray BV1 = new BDV(v1) BV2 = new BDV(v2) BV3 = new BDV(v3) BV4 = new BDV(v4) val sizePerPart = 3 val numPartitions = VUtils.getNumBlocks(sizePerPart, v1.length) DV1 = VUtils.splitArrIntoDV(sc, v1, sizePerPart, numPartitions).persist() DV2 = VUtils.splitArrIntoDV(sc, v2, sizePerPart, numPartitions).persist() DV3 = VUtils.splitArrIntoDV(sc, v3, sizePerPart, numPartitions).persist() DV4 = VUtils.splitArrIntoDV(sc, v4, sizePerPart, numPartitions).persist() } test("toLocal") { val localDV1 = DV1.toLocal assert(localDV1 ~== Vectors.fromBreeze(BV1) relTol 1e-8) val localDV4 = DV4.compressed.toLocal assert(localDV4 ~== Vectors.fromBreeze(BV4) relTol 1e-8) } test("add") { val local1 = DV1.add(2.0).persist().toLocal val local2 = DV1.add(DV2).persist().toLocal assert(local1 ~== Vectors.fromBreeze(BV1 + 2.0) relTol 1e-8) assert(local2 ~== Vectors.fromBreeze(BV1 + BV2) relTol 1e-8) } test("scale") { val local1 = DV1.scale(2.0).persist().toLocal assert(local1 ~== Vectors.fromBreeze(BV1 * 2.0) relTol 1e-8) } test("addScaledVector") { val res = DV1.addScaledVector(3.0, DV2).persist().toLocal assert(res ~== Vectors.fromBreeze(BV1 + (BV2 * 3.0)) relTol 1e-8) } test("dot") { val dotVal = DV1.dot(DV2) val bDotVal = BV1.dot(BV2) assert(dotVal ~== bDotVal relTol 1e-8) } test("norm") { assert(DV1.norm ~== Bnorm(BV1) relTol 1e-8) } test("combine") { val combined = DistributedVectors.combine( (10.0, DV1), (100.0, DV2), (18.0, DV3) ).persist().toLocal val bCombined = (BV1 * 10.0) + (BV2 * 100.0) + (BV3 * 18.0) assert(combined ~== Vectors.fromBreeze(bCombined) relTol 1e-8) } test("zeros") { var res1 = VUtils.zipRDDWithPartitionIDAndCollect( DistributedVectors.zeros(sc, 3, 2, 5).values) var res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0))) assert(res1 === res2) res1 = VUtils.zipRDDWithPartitionIDAndCollect( DistributedVectors.zeros(sc, 3, 2, 7, 1.5).values) res2 = Array((0, Vectors.dense(0.0, 0.0, 0.0)), (1, Vectors.dense(0.0, 0.0, 0.0, 1.5))) assert(res1 === res2) } }
Example 47
Source File: VLinearRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import scala.language.existentials import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.DataFrame class VLinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ var datasetWithWeight: DataFrame = _ override def beforeAll(): Unit = { super.beforeAll() datasetWithWeight = sc.parallelize(Seq( Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse), Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)), Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)), Instance(29.0, 4.0, Vectors.dense(3.0, 13.0)) ), 2).toDF() } test("test on datasetWithWeight") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (fitIntercept <- Seq(false, true)) { for (standardization <- Seq(false, true)) { for ((reg, elasticNet)<- Seq((0.0, 0.0), (2.3, 0.0), (2.3, 0.5))) { println() println(s"# test ${b2s(fitIntercept)} intercept, ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val vtrainer = new VLinearRegression() .setColsPerBlock(1) .setRowsPerBlock(1) .setGeneratingFeatureMatrixBuffer(2) .setFitIntercept(fitIntercept) .setStandardization(standardization) .setRegParam(reg) .setWeightCol("weight") .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(datasetWithWeight) // Note that in ml.LinearRegression, when datasets numInstanse is small // solver l-bfgs and solver normal will generate slightly different result when reg not zero // because there std calculation result have multiple difference numInstance/(numInstance - 1) // here test keep consistent with l-bfgs solver val trainer = new LinearRegression() .setSolver("l-bfgs") // by default it may use noraml solver so here force set it. .setFitIntercept(fitIntercept) .setStandardization(standardization) .setRegParam(reg) .setWeightCol("weight") .setElasticNetParam(elasticNet) val model = trainer.fit(datasetWithWeight) logInfo(s"LinearRegression total iterations: ${model.summary.totalIterations}") println(s"VLinearRegression coefficients: ${vmodel.coefficients.toDense}, intercept: ${vmodel.intercept}\n" + s"LinearRegression coefficients: ${model.coefficients.toDense}, intercept: ${model.intercept}") def filterSmallValue(v: Vector) = { Vectors.dense(v.toArray.map(x => if (math.abs(x) < 1e-6) 0.0 else x)) } assert(filterSmallValue(vmodel.coefficients) ~== filterSmallValue(model.coefficients) relTol 1e-3) assert(vmodel.intercept ~== model.intercept relTol 1e-3) } } } } }
Example 48
Source File: ReebDiagram.scala From spark-tda with Apache License 2.0 | 5 votes |
import java.io.{File, PrintWriter} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.feature.{ReebDiagram, VectorAssembler} import org.apache.spark.sql.functions._ def computeReebDiagram( pathToTextFile: String, quantity: Int, linkThresholdRatio: Double, coreThresholdRatio: Double, topTreeRatio: Double) { def save(f: File)(func: PrintWriter => Unit) { val p = new PrintWriter(f) try { func(p) } finally { p.close() } } val filename = pathToTextFile.split("\\.")(0) val outputFilename = s"$filename-REEB-k${quantity}-l${linkThresholdRatio}-c${coreThresholdRatio}-i${topTreeRatio}.tsv" val points = sc.textFile(pathToTextFile) .map { line => line.trim.split("\\s+") } .zipWithIndex .map { case (row, i) => (i, row(0).toDouble, row(1).toDouble, 0) } .toDF("id", "x", "y", "cover_id") val cardinality = points.count val assembler = new VectorAssembler() .setInputCols(Array("x", "y")) .setOutputCol("feature") val features = assembler .transform(points) val reeb = new ReebDiagram() .setK(quantity) .setLinkThresholdRatio(linkThresholdRatio) .setCoreThresholdRatio(coreThresholdRatio) .setTopTreeSize((topTreeRatio * cardinality).toInt) .setTopTreeLeafSize(quantity) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("feature") .setOutputCol("cluster_id") val transformed = reeb .fit(features) .transform(features) val clusters = Map( transformed .select("cluster_id") .rdd .map(row => row.getLong(0)) .distinct .zipWithIndex .collect(): _*) val result = transformed .select("x", "y", "cluster_id") .rdd .map(row => (row.getDouble(0), row.getDouble(1), row.getLong(2))) .map { case (x, y, clusterId) => (x, y, clusters(clusterId) + 1)} .collect() save(new File(outputFilename)) { println(s"OUTPUT TO: ${outputFilename}") f => result.foreach{ case (x, y, ccid) => f.println(s"${x}\t${y}\t${ccid}") } } }
Example 49
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.{SparkSession, DataFrame} import org.apache.spark.sql.types.{ StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType } import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.PropSpec import com.holdenkarau.spark.testing.{ SharedSparkContext, DataframeGenerator, Column } abstract class FeaturePropSpec extends PropSpec with SharedSparkContext with DefaultReadWriteTest { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector] )) lazy val spark = SparkSession.builder().getOrCreate() def schema = StructType( List( StructField("integer", IntegerType), StructField("double", DoubleType), StructField("boolean", BooleanType), StructField("string", StringType) )) def integerGen = new Column("integer", Gen.choose(-100, 100)) def doubleGen = new Column("double", Gen.choose(-100.0, 100.0)) def stringGen = new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO")) def dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields( spark.sqlContext, schema)(integerGen, doubleGen, stringGen) def hasDistinctValues(df: DataFrame, columns: String*): Boolean = { columns.foldLeft(true) { (acc, col) => acc && df.select(col).distinct.count() > 1 } } }
Example 50
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector} import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class ReebDiagramTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") val cover = new Cover() .setExploding(true) .setInputCols("double", "integer") .setOutputCol("cover_id") property("argument topTreeSize must be positive") { intercept[IllegalArgumentException] { val reeb = new ReebDiagram() // .setIdCol("id") // .setCoverCol("cover_id") // .setFeaturesCol("vector") // .setOutputCol("cluster_id") .setTopTreeSize(0) } } property("placeholder") { val reeb = new ReebDiagram() .setK(15) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("vector") .setOutputCol("cluster_id") forAll(dataframeGen.arbitrary) { df => val assembled = assembler.transform(df) whenever( assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) { val transformed = cover .fit(assembled) .transform(assembled) val result = reeb .setTopTreeSize(1) .fit(transformed) .transform(transformed) // result.show() } } } }
Example 51
Source File: CoverTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class CoverTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") property("argument numSplits must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setNumSplits(0) } } property("argument overlapRatio must be positive") { intercept[IllegalArgumentException] { val cover = new Cover() .setInputCols("double") .setOutputCol("cover_ids") .setOverlapRatio(0.0) } } property("cover estimator changes nothing with the original dataframe") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { val covered = cover .fit(transformed) .transform(transformed) .drop("cover_ids") .except(transformed) .count() should be(0) } } } property("generated cover covers all range of specified columns") { val cover = new Cover() .setInputCols("double", "integer", "vector") .setOutputCol("cover_ids") val uncovered = udf { xs: Seq[Long] => xs.length == 0 } forAll(dataframeGen.arbitrary) { df => val transformed = assembler.transform(df) whenever( transformed.count() > 0 && hasDistinctValues(transformed, "double", "integer", "vector")) { cover .fit(transformed) .transform(transformed) .where(uncovered(col("cover_ids"))) .count() should be(0) } } } property("Cover is readable/writable") { val cover = new Cover() .setInputCols("double", "integer") .setOutputCol("cover_ids") testDefaultReadWrite(cover) } property("CoverModel is readable/writable") { val model = new CoverModel("myCoverModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCols("double", "integer") .setOutputCol("cover_ids") val newModel = testDefaultReadWrite(model) assert(newModel.min === model.min) assert(newModel.max === model.max) } }
Example 52
Source File: PartitionersTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class PartitionersTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTreesPartitioner can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val partitioner = new TopTreesPartitioner(TopTrees(IndexedSeq.empty[(Int, Tree)])) val vector = VectorEntry(0L, v) intercept[NoSuchElementException] { partitioner.getPartition((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val partitioner = new TopTreesPartitioner(TopTrees(indexedTrees)) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => partitioner.getPartition((index, entry)) } .toSet indices should contain theSameElementsAs (0 until partitioner.numPartitions) .toSet (0 until partitioner.numPartitions).toSet should contain theSameElementsAs indices intercept[IllegalArgumentException] { partitioner.getPartition(0) } } } }
Example 53
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import scala.reflect.ClassTag import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Gen.{choose, oneOf} import org.scalatest.PropSpec import org.apache.spark.ml.linalg.{ CosineDistance, EuclideanDistance, ManhattanDistance, JaccardDistance, HammingDistance } import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors} import com.holdenkarau.spark.testing.SharedSparkContext abstract class KNNPropSpec extends PropSpec with SharedSparkContext { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitrarySparseVector: Arbitrary[SparseVector] = Arbitrary { for (vec <- arbitrary[DenseVector]) yield vec.toSparse } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector], 1 -> arbitrary[SparseVector] )) private def arraysOfNM[T: ClassTag](numRows: Int, numCols: Int, gen: Gen[T]): Gen[Array[Array[T]]] = Gen.listOfN(numRows * numCols, gen).map { square => square.toArray.grouped(numCols).toArray } private def vectorsOfNM(numRows: Int, numCols: Int, gen: Gen[Double]): Gen[Array[DenseVector]] = for { arrays <- arraysOfNM(numRows, numCols, gen) } yield arrays.map(arr => new DenseVector(arr)) val treeGen = for { measure <- oneOf(CosineDistance, EuclideanDistance, ManhattanDistance, HammingDistance, JaccardDistance) numVectors <- choose(1, 100) vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0)) } yield vectors .scanLeft(Seq[Vector]())(_ :+ _) .tail .map( vs => VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10)) }
Example 54
Source File: IndicesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class IndicesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTrees can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val topTrees = TopTrees(IndexedSeq.empty[(Int, Tree)]) val vector = VectorEntry(0L, v) topTrees.get((coverId, vector)) shouldBe None topTrees.isDefinedAt((coverId, vector)) shouldBe false intercept[NoSuchElementException] { topTrees((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val topTrees = TopTrees(indexedTrees) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => topTrees((index, entry)) } .toSet indices should contain theSameElementsAs (0 until topTrees.numIndices) .toSet (0 until topTrees.numIndices).toSet should contain theSameElementsAs indices } } }
Example 55
Source File: TreesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class TreesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("VPTree can be constructed with empty data") { forAll { (v: Vector) => val tree = VPTree(IndexedSeq.empty[VectorWithId], EuclideanDistance, 0, 0) val vector = VectorEntry(0L, v) tree.iterator shouldBe empty tree.query(vector) shouldBe empty tree.numLeaves shouldBe 0 } } property("VPTree can be constructed with data not having any duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (-5 to 5).flatMap { i => (-5 to 5).map { j => VectorEntry(0L, Vectors.dense(i, j)) } } List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize => val tree = VPTree(data, EuclideanDistance, 1, 1, leafSize) tree.size shouldBe data.size tree.iterator.toIterable should contain theSameElementsAs data data.foreach(v => tree.query(v, 1).head._1 shouldBe v) tree .query(origin, 5) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, 0), Vectors.dense(1, 0), Vectors.dense(0, -1), Vectors.dense(0, 1), Vectors.dense(0, 0) ) tree .query(origin, 9) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, -1), Vectors.dense(-1, 0), Vectors.dense(-1, 1), Vectors.dense(0, -1), Vectors.dense(0, 0), Vectors.dense(0, 1), Vectors.dense(1, -1), Vectors.dense(1, 0), Vectors.dense(1, 1) ) tree.numLeaves shouldBe (tree.cardinality / leafSize.toDouble).ceil } } property("VPTree can be constructed with data having duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))) .map(VectorEntry(0L, _)) val tree = VPTree(data, EuclideanDistance, 6, 6) val knn = tree.query(origin, 5) tree.numLeaves shouldBe 2 knn.size shouldBe 5 knn.map(_._1.vector).toSet should contain theSameElementsAs Array( Vectors.dense(0.0, 1.0)) } }
Example 56
Source File: MleapNodeWrapper.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.tree.clustering import ml.bundle.ctree.Node import ml.combust.bundle.tree.cluster.NodeWrapper import ml.combust.mleap.core.clustering.ClusteringTreeNode import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm object MleapNodeWrapper extends NodeWrapper[ClusteringTreeNode] { override def node(n: ClusteringTreeNode): Node = { Node(index = n.index, norm = n.centerWithNorm.norm, values = n.centerWithNorm.vector.toArray.toSeq, numChildren = n.children.length) } override def children(n: ClusteringTreeNode): Array[ClusteringTreeNode] = n.children override def create(node: Node, children: Seq[ClusteringTreeNode]): ClusteringTreeNode = { ClusteringTreeNode(index = node.index, centerWithNorm = VectorWithNorm(Vectors.dense(node.values.toArray), node.norm), children = children.toArray) } }
Example 57
Source File: ElementwiseProductOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.ElementwiseProductModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors class ElementwiseProductOp extends MleapOp[ElementwiseProduct, ElementwiseProductModel] { override val Model: OpModel[MleapContext, ElementwiseProductModel] = new OpModel[MleapContext, ElementwiseProductModel] { override val klazz: Class[ElementwiseProductModel] = classOf[ElementwiseProductModel] override def opName: String = Bundle.BuiltinOps.feature.elementwise_product override def store(model: Model, obj: ElementwiseProductModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("scaling_vec", Value.vector(obj.scalingVec.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): ElementwiseProductModel = { ElementwiseProductModel(scalingVec = Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray)) } } override def model(node: ElementwiseProduct): ElementwiseProductModel = node.model }
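The op above only stores and restores the scaling vector; at transform time each input coordinate is multiplied by the matching entry of `scalingVec`. A minimal sketch of that element-wise product (names are illustrative):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object ElementwiseProductSketch {
  // Multiply the input coordinate-wise by the scaling vector.
  def elementwiseProduct(scalingVec: Vector, input: Vector): Vector =
    Vectors.dense(input.toArray.zip(scalingVec.toArray).map { case (x, w) => x * w })

  def main(args: Array[String]): Unit = {
    val scaling = Vectors.dense(0.0, 1.0, 2.0)
    println(elementwiseProduct(scaling, Vectors.dense(4.0, 5.0, 6.0))) // [0.0,5.0,12.0]
  }
}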
Example 58
Source File: BucketedRandomProjectionLSHOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.BucketedRandomProjectionLSH import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors class BucketedRandomProjectionLSHOp extends MleapOp[BucketedRandomProjectionLSH, BucketedRandomProjectionLSHModel] { override val Model: OpModel[MleapContext, BucketedRandomProjectionLSHModel] = new OpModel[MleapContext, BucketedRandomProjectionLSHModel] { override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel] override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh override def store(model: Model, obj: BucketedRandomProjectionLSHModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randomUnitVectors.map(v => Tensor.denseVector(v.toArray)))). withValue("bucket_length", Value.double(obj.bucketLength)). withValue("input_size", Value.int(obj.inputSize)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): BucketedRandomProjectionLSHModel = { val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense) BucketedRandomProjectionLSHModel(randomUnitVectors = ruv, bucketLength = model.value("bucket_length").getDouble, inputSize = model.value("input_size").getInt) } } override def model(node: BucketedRandomProjectionLSH): BucketedRandomProjectionLSHModel = node.model }
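At transform time the stored random unit vectors feed the usual bucketed random projection hash, one value per vector: floor(<x, v> / bucketLength). A short sketch of that hashing step under those assumptions; the vectors and bucket length below are illustrative.

import org.apache.spark.ml.linalg.{Vector, Vectors}

object RandomProjectionHashSketch {
  // One hash value per random unit vector: floor(<x, v> / bucketLength).
  def hash(x: Vector, randomUnitVectors: Seq[Vector], bucketLength: Double): Array[Double] =
    randomUnitVectors.map { v =>
      val dot = x.toArray.zip(v.toArray).map { case (a, b) => a * b }.sum
      math.floor(dot / bucketLength)
    }.toArray

  def main(args: Array[String]): Unit = {
    val units = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0))
    println(hash(Vectors.dense(3.2, -1.5), units, bucketLength = 2.0).mkString(", ")) // 1.0, -1.0
  }
}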
Example 59
Source File: MaxAbsScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.MaxAbsScalerModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.MaxAbsScaler import org.apache.spark.ml.linalg.Vectors class MaxAbsScalerOp extends MleapOp[MaxAbsScaler, MaxAbsScalerModel]{ override val Model: OpModel[MleapContext, MaxAbsScalerModel] = new OpModel[MleapContext, MaxAbsScalerModel] { override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel] override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler override def store(model: Model, obj: MaxAbsScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MaxAbsScalerModel = { MaxAbsScalerModel(maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray)) } } override def model(node: MaxAbsScaler): MaxAbsScalerModel = node.model }
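The stored `maxAbs` vector is applied by dividing every feature by its maximum absolute value, with zero entries treated as one to avoid division by zero (the same convention as the local transformer in Example 36). A brief sketch:

import org.apache.spark.ml.linalg.{Vector, Vectors}

object MaxAbsScaleSketch {
  // Scale each feature into [-1, 1]; a zero maxAbs entry is replaced by 1 to avoid division by zero.
  def scale(maxAbs: Vector, input: Vector): Vector = {
    val safeMax = maxAbs.toArray.map(m => if (m == 0.0) 1.0 else m)
    Vectors.dense(input.toArray.zip(safeMax).map { case (x, m) => x / m })
  }

  def main(args: Array[String]): Unit = {
    println(scale(Vectors.dense(4.0, 1.0, 10.0), Vectors.dense(2.0, 3.0, -5.0))) // [0.5,3.0,-0.5]
  }
}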
Example 60
Source File: IDFOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.IDFModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.IDF import org.apache.spark.ml.linalg.Vectors class IDFOp extends MleapOp[IDF, IDFModel] { override val Model: OpModel[MleapContext, IDFModel] = new OpModel[MleapContext, IDFModel] { override val klazz: Class[IDFModel] = classOf[IDFModel] override def opName: String = Bundle.BuiltinOps.feature.idf override def store(model: Model, obj: IDFModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("idf", Value.vector(obj.idf.toArray)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): IDFModel = { IDFModel(idf = Vectors.dense(model.value("idf").getTensor[Double].toArray)) } } override def model(node: IDF): IDFModel = node.model }
Example 61
Source File: StandardScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.mleap.core.feature.StandardScalerModel import ml.combust.mleap.runtime.transformer.feature.StandardScaler import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class StandardScalerOp extends MleapOp[StandardScaler, StandardScalerModel] { override val Model: OpModel[MleapContext, StandardScalerModel] = new OpModel[MleapContext, StandardScalerModel] { override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel] override def opName: String = Bundle.BuiltinOps.feature.standard_scaler override def store(model: Model, obj: StandardScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("mean", obj.mean.map(_.toArray).map(Value.vector[Double])). withValue("std", obj.std.map(_.toArray).map(Value.vector[Double])) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): StandardScalerModel = { val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense) val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense) StandardScalerModel(mean = mean, std = std) } } override def model(node: StandardScaler): StandardScalerModel = node.model }
Example 62
Source File: MinMaxScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.feature.MinMaxScalerModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors class MinMaxScalerOp extends MleapOp[MinMaxScaler, MinMaxScalerModel]{ override val Model: OpModel[MleapContext, MinMaxScalerModel] = new OpModel[MleapContext, MinMaxScalerModel] { override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel] override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler override def store(model: Model, obj: MinMaxScalerModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("min", Value.vector(obj.originalMin.toArray)). withValue("max", Value.vector(obj.originalMax.toArray)) .withValue("minValue", Value.double(obj.minValue)) .withValue("maxValue", Value.double(obj.maxValue)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MinMaxScalerModel = { val minValue = model.getValue("minValue").map(_.getDouble).getOrElse(0.0) val maxValue = model.getValue("maxValue").map(_.getDouble).getOrElse(1.0) MinMaxScalerModel(originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray), originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray), minValue = minValue, maxValue = maxValue ) } } override def model(node: MinMaxScaler): MinMaxScalerModel = node.model }
Example 63
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.clustering.GaussianMixtureModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.clustering.GaussianMixture import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian class GaussianMixtureOp extends MleapOp[GaussianMixture, GaussianMixtureModel] { override val Model: OpModel[MleapContext, GaussianMixtureModel] = new OpModel[MleapContext, GaussianMixtureModel] { override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel] override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture override def store(model: Model, obj: GaussianMixtureModel) (implicit context: BundleContext[MleapContext]): Model = { val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip model.withValue("means", Value.tensorList(means.map(m => Tensor.denseVector(m.toArray)))). withValue("covs", Value.tensorList(covs.map(c => DenseTensor(c.toArray, Seq(c.numRows, c.numCols))))). withValue("weights", Value.doubleList(obj.weights.toSeq)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): GaussianMixtureModel = { val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray)) val covs = model.value("covs").getTensorList[Double].map { values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray) } val gaussians = means.zip(covs).map { case (mean, cov) => new MultivariateGaussian(mean, cov) }.toArray val weights = model.value("weights").getDoubleList.toArray GaussianMixtureModel(gaussians, weights) } } override def model(node: GaussianMixture): GaussianMixtureModel = node.model }
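The load method above rebuilds each mixture component from a mean vector and a covariance matrix. A minimal sketch of constructing such a model directly, using only the constructors that appear in load(); the means, covariances, and weights are illustrative values, not taken from the example:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

// two 2-dimensional components with identity covariance (Matrices.dense is column-major)
val g0 = new MultivariateGaussian(Vectors.dense(0.0, 0.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))
val g1 = new MultivariateGaussian(Vectors.dense(5.0, 5.0), Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)))

// same shape of call as in load(): GaussianMixtureModel(gaussians, weights)
val gmm = GaussianMixtureModel(Array(g0, g1), Array(0.5, 0.5))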
Example 64
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.clustering.KMeansModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.clustering.KMeans import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors class KMeansOp extends MleapOp[KMeans, KMeansModel] { override val Model: OpModel[MleapContext, KMeansModel] = new OpModel[MleapContext, KMeansModel] { override val klazz: Class[KMeansModel] = classOf[KMeansModel] override def opName: String = Bundle.BuiltinOps.clustering.k_means override def store(model: Model, obj: KMeansModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.vector.toArray)))) .withValue("num_features", Value.long(obj.numFeatures)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): KMeansModel = { val numFeatures = model.value("num_features").getLong.toInt KMeansModel(model.value("cluster_centers").getTensorList[Double].map(t => Vectors.dense(t.toArray)), numFeatures) } } override def model(node: KMeans): KMeansModel = node.model }
Example 65
Source File: MultiLayerPerceptronClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.classification.MultiLayerPerceptronClassifier import org.apache.spark.ml.linalg.Vectors class MultiLayerPerceptronClassifierOp extends MleapOp[MultiLayerPerceptronClassifier, MultiLayerPerceptronClassifierModel] { override val Model: OpModel[MleapContext, MultiLayerPerceptronClassifierModel] = new OpModel[MleapContext, MultiLayerPerceptronClassifierModel] { override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier override val klazz: Class[MultiLayerPerceptronClassifierModel] = classOf[MultiLayerPerceptronClassifierModel] override def store(model: Model, obj: MultiLayerPerceptronClassifierModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("layers", Value.longList(obj.layers.map(_.toLong))). withValue("weights", Value.vector(obj.weights.toArray)). withValue("thresholds", obj.thresholds.map(Value.doubleList(_))) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): MultiLayerPerceptronClassifierModel = { MultiLayerPerceptronClassifierModel(layers = model.value("layers").getLongList.map(_.toInt), weights = Vectors.dense(model.value("weights").getTensor[Double].toArray), thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } } override def model(node: MultiLayerPerceptronClassifier): MultiLayerPerceptronClassifierModel = node.model }
Example 66
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel, ProbabilisticLogisticsRegressionModel} import ml.combust.mleap.runtime.transformer.classification.LogisticRegression import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.{Matrices, Vectors} class LogisticRegressionOp extends MleapOp[LogisticRegression, LogisticRegressionModel] { private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5 override val Model: OpModel[MleapContext, LogisticRegressionModel] = new OpModel[MleapContext, LogisticRegressionModel] { override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel] override def opName: String = Bundle.BuiltinOps.classification.logistic_regression override def store(model: Model, obj: LogisticRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) if(obj.isMultinomial) { val mm = obj.multinomialModel val cm = mm.coefficientMatrix m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))). withValue("intercept_vector", Value.vector(mm.interceptVector.toArray)). withValue("thresholds", mm.thresholds.map(_.toSeq).map(Value.doubleList)) } else { m.withValue("coefficients", Value.vector(obj.binaryModel.coefficients.toArray)). withValue("intercept", Value.double(obj.binaryModel.intercept)). withValue("threshold", Value.double(obj.binaryModel.threshold)) } } override def load(model: Model) (implicit context: BundleContext[MleapContext]): LogisticRegressionModel = { val numClasses = model.value("num_classes").getLong val lm = if(numClasses > 2) { val tensor = model.value("coefficient_matrix").getTensor[Double] val cm = Matrices.dense(numRows = tensor.dimensions.head, numCols = tensor.dimensions(1), tensor.toArray) ProbabilisticLogisticsRegressionModel(coefficientMatrix = cm, interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray), thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } else { // default threshold is 0.5 for both Spark and Scikit-learn val threshold = model.getValue("threshold") .map(value => value.getDouble) .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD) BinaryLogisticRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, threshold = threshold) } LogisticRegressionModel(lm) } } override def model(node: LogisticRegression): LogisticRegressionModel = node.model }
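A brief sketch of the multinomial branch that load() reconstructs, using the named parameters visible in the op above (the coefficient values are illustrative only):

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import ml.combust.mleap.core.classification.{LogisticRegressionModel, ProbabilisticLogisticsRegressionModel}

// 3 classes x 2 features, column-major layout as in Matrices.dense
val multinomial = ProbabilisticLogisticsRegressionModel(
  coefficientMatrix = Matrices.dense(3, 2, Array(0.1, 0.2, 0.3, 0.4, 0.5, 0.6)),
  interceptVector = Vectors.dense(0.0, 0.1, -0.1),
  thresholds = None)

// wrapped exactly as load() does for the multi-class case
val lr = LogisticRegressionModel(multinomial)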
Example 67
Source File: SupportVectorMachineOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.mleap.core.classification.SupportVectorMachineModel import ml.combust.mleap.runtime.transformer.classification.SupportVectorMachine import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class SupportVectorMachineOp extends MleapOp[SupportVectorMachine, SupportVectorMachineModel] { override val Model: OpModel[MleapContext, SupportVectorMachineModel] = new OpModel[MleapContext, SupportVectorMachineModel] { override val klazz: Class[SupportVectorMachineModel] = classOf[SupportVectorMachineModel] override def opName: String = Bundle.BuiltinOps.classification.support_vector_machine override def store(model: Model, obj: SupportVectorMachineModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("num_classes", Value.long(2)). withValue("thresholds", obj.thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): SupportVectorMachineModel = { if(model.value("num_classes").getLong != 2) { throw new IllegalArgumentException("MLeap only supports binary SVM") } SupportVectorMachineModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray)) } } override def model(node: SupportVectorMachine): SupportVectorMachineModel = node.model }
Example 68
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.Model import ml.combust.bundle.op.OpModel import ml.combust.mleap.runtime.transformer.classification.NaiveBayesClassifier import ml.combust.mleap.core.classification.NaiveBayesModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.{Matrices, Vectors} class NaiveBayesClassifierOp extends MleapOp[NaiveBayesClassifier, NaiveBayesModel]{ override val Model: OpModel[MleapContext, NaiveBayesModel] = new OpModel[MleapContext, NaiveBayesModel]{ override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel] override def opName: String = Bundle.BuiltinOps.classification.naive_bayes override def store(model: Model, obj: NaiveBayesModel)(implicit context: BundleContext[MleapContext]): Model = { model.withValue("num_features", Value.long(obj.numFeatures)). withValue("num_classes", Value.long(obj.numClasses)). withValue("pi", Value.vector(obj.pi.toArray)). withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))). withValue("model_type", Value.string(obj.modelType.toString)). withValue("thresholds", obj.thresholds.map(Value.doubleList(_))) } override def load(model: Model)(implicit context: BundleContext[MleapContext]): NaiveBayesModel = { val theta = model.value("theta").getTensor[Double] val modelType = NaiveBayesModel.forName(model.value("model_type").getString) val numClasses = model.value("num_classes").getLong.toInt val thresholds = model.getValue("thresholds").map(_.getDoubleList.toArray) require(thresholds.isEmpty || thresholds.get.length == numClasses, "NaiveBayesModel loaded with non-matching numClasses and thresholds.length. " + s" numClasses=$numClasses, but thresholds has length ${thresholds.get.length}") new NaiveBayesModel(numFeatures = model.value("num_features").getLong.toInt, numClasses = numClasses, pi = Vectors.dense(model.value("pi").getTensor[Double].toArray), theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray), modelType = modelType, thresholds = thresholds) } } override def model(node: NaiveBayesClassifier): NaiveBayesModel = node.model }
Example 69
Source File: GeneralizedLinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel.{Family, FamilyAndLink, Link} import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.regression.GeneralizedLinearRegression import org.apache.spark.ml.linalg.Vectors class GeneralizedLinearRegressionOp extends MleapOp[GeneralizedLinearRegression, GeneralizedLinearRegressionModel] { override val Model: OpModel[MleapContext, GeneralizedLinearRegressionModel] = new OpModel[MleapContext, GeneralizedLinearRegressionModel] { override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression override def store(model: Model, obj: GeneralizedLinearRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("family", Value.string(obj.fal.family.name)). withValue("link", Value.string(obj.fal.link.name)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): GeneralizedLinearRegressionModel = { val family = Family.fromName(model.value("family").getString) val link = model.getValue("link").map(v => Link.fromName(v.getString)).getOrElse(family.defaultLink) GeneralizedLinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, fal = new FamilyAndLink(family, link) ) } } override def model(node: GeneralizedLinearRegression): GeneralizedLinearRegressionModel = node.model }
Example 70
Source File: AFTSurvivalRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel import ml.combust.mleap.runtime.MleapContext import ml.combust.mleap.runtime.transformer.regression.AFTSurvivalRegression import org.apache.spark.ml.linalg.Vectors class AFTSurvivalRegressionOp extends MleapOp[AFTSurvivalRegression, AFTSurvivalRegressionModel] { override val Model: OpModel[MleapContext, AFTSurvivalRegressionModel] = new OpModel[MleapContext, AFTSurvivalRegressionModel] { override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression override def store(model: Model, obj: AFTSurvivalRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("quantile_probabilities", Value.doubleList(obj.quantileProbabilities)). withValue("scale", Value.double(obj.scale)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): AFTSurvivalRegressionModel = { AFTSurvivalRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, quantileProbabilities = model.value("quantile_probabilities").getDoubleList.toArray, scale = model.value("scale").getDouble) } } override def model(node: AFTSurvivalRegression): AFTSurvivalRegressionModel = node.model }
Example 71
Source File: LinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.runtime.transformer.regression.LinearRegression import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.bundle.ops.MleapOp import ml.combust.mleap.runtime.MleapContext import org.apache.spark.ml.linalg.Vectors class LinearRegressionOp extends MleapOp[LinearRegression, LinearRegressionModel] { override val Model: OpModel[MleapContext, LinearRegressionModel] = new OpModel[MleapContext, LinearRegressionModel] { override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.linear_regression override def store(model: Model, obj: LinearRegressionModel) (implicit context: BundleContext[MleapContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)) } override def load(model: Model) (implicit context: BundleContext[MleapContext]): LinearRegressionModel = { LinearRegressionModel(coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) } } override def model(node: LinearRegression): LinearRegressionModel = node.model }
Example 72
Source File: IDFSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.IDFModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IDFSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = IDF(shape = NodeShape.feature(), model = IDFModel(Vectors.dense(Array(1.0, 2.0, 3.0)))) assert(transformer.schema.fields == Seq(StructField("input", TensorType.Double()), StructField("output", TensorType.Double()))) } } }
Example 73
Source File: StandardScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import java.io.File import java.net.URI import ml.combust.bundle.BundleFile import ml.combust.bundle.serializer.SerializationFormat import ml.combust.mleap.core.feature.StandardScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.test.TestUtil import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import resource.managed import ml.combust.mleap.runtime.MleapSupport._ class StandardScalerSpec extends FunSpec { val means = Some(Vectors.dense(Array(50.0, 20.0, 30.0))) val std = Some(Vectors.dense(Array(5.0, 1.0, 3.0))) val transformer = StandardScaler(shape = NodeShape.feature(), model = StandardScalerModel(std, means)) describe("input/output schema") { it("has the correct inputs and outputs") { assert(transformer.schema.fields == Seq(StructField("input", TensorType.Double(3)), StructField("output", TensorType.Double(3)))) } } describe("serialization") { it("serializes std as well as mean correctly") { val uri = new URI(s"jar:file:${TestUtil.baseDir}/standard-scaler.json.zip") for (file <- managed(BundleFile(uri))) { transformer.writeBundle.name("bundle") .format(SerializationFormat.Json) .save(file) } val file = new File(s"${TestUtil.baseDir}/standard-scaler.json.zip") val scaler = (for (bf <- managed(BundleFile(file))) yield { bf.loadMleapBundle().get.root }).tried.get.asInstanceOf[StandardScaler] assert(transformer.model.std sameElements scaler.model.std) assert(transformer.model.mean sameElements scaler.model.mean) } } }
Example 74
Source File: MaxAbsScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.MaxAbsScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MaxAbsScalerSpec extends FunSpec{ val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val maxAbsScaler = MaxAbsScaler( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"), model = MaxAbsScalerModel(Vectors.dense(Array(10.0, 20.0, 40.0)))) describe("#transform") { it("scales the input data by maximum value vector") { val frame2 = maxAbsScaler.transform(frame).get val data = frame2.dataset.toArray val norm = data(0).getTensor[Double](1) assert(norm(0) == 0.0) assert(norm(1) == 1.0) assert(norm(2) == 0.5) } describe("with invalid input column") { val maxAbsScaler2 = maxAbsScaler.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(maxAbsScaler2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(maxAbsScaler.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_normalized", TensorType.Double(3)))) } } }
Example 75
Source File: MinMaxScalerSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import java.io.File import ml.combust.bundle.BundleFile import ml.combust.mleap.core.feature.MinMaxScalerModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.runtime.transformer.Pipeline import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import resource.managed import ml.combust.mleap.runtime.MleapSupport._ class MinMaxScalerSpec extends FunSpec{ val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val minMaxScaler = MinMaxScaler( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_normalized"), model = MinMaxScalerModel(Vectors.dense(Array(0.0, 0.0, 0.0)), Vectors.dense(Array(10.0, 20.0, 40.0)))) describe("#transform") { it("scales the input data between min / max value vectors") { val frame2 = minMaxScaler.transform(frame).get val data = frame2.dataset.toArray val norm = data(0).getTensor[Double](1) assert(norm(0) == 0.0) assert(norm(1) == 1.0) assert(norm(2) == 0.5) } describe("with invalid input column") { val minMaxScaler2 = minMaxScaler.copy(shape = NodeShape.feature(inputCol = "bad_feature")) it("returns a Failure") { assert(minMaxScaler2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(minMaxScaler.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_normalized", TensorType.Double(3)))) } } describe("min max scaler with defaults for min/max still works") { it ("loads correctly in mleap") { val file = new File(getClass.getResource("/min_max_scaler_tf.zip").toURI) val pipeline = (for (bf <- managed(BundleFile(file))) yield { bf.loadMleapBundle().get.root }).tried.get.asInstanceOf[Pipeline] assert(pipeline.model.transformers.size == 2) } } }
Example 76
Source File: ElementWiseProductSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.ElementwiseProductModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class ElementWiseProductSpec extends FunSpec { val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.0, 20.0, 20.0)))) val frame = DefaultLeapFrame(schema, dataset) val ewp = ElementwiseProduct(shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_norm"), model = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 0.5)))) describe("#transform") { it("multiplies each input vector by a provided weight vector") { val frame2 = ewp.transform(frame).get val data = frame2.dataset(0).getTensor[Double](1) assert(data.toArray sameElements Array(0.0, 20.0, 10.0)) } describe("with invalid input column") { val ewp2 = ewp.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(ewp2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(ewp.schema.fields == Seq(StructField("test_vec", TensorType.Double(3)), StructField("test_norm", TensorType.Double(3)))) } } }
Example 77
Source File: PcaSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.feature import ml.combust.mleap.core.feature.PcaModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.scalatest.FunSpec class PcaSpec extends FunSpec { val schema = StructType(Seq(StructField("test_vec", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(2.0, 1.0, 0.0)))) val frame = DefaultLeapFrame(schema, dataset) val pc = new DenseMatrix(3, 2, Array(1d, -1, 2, 0, -3, 1)) val input = Vectors.dense(Array(2d, 1, 0)) val pca = Pca( shape = NodeShape.feature(inputCol = "test_vec", outputCol = "test_pca"), model = PcaModel(pc)) describe("#transform") { it("extracts the principal components from the input column") { val frame2 = pca.transform(frame).get val data = frame2.dataset(0).getTensor[Double](1).toArray assert(data sameElements Array[Double](1, -3)) } describe("with invalid input column") { val pca2 = pca.copy(shape = NodeShape.feature(inputCol = "bad_input")) it("returns a Failure") { assert(pca2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(pca.schema.fields == Seq(StructField("test_vec", TensorType.Double()), StructField("test_pca", TensorType.Double()))) } } }
Example 78
Source File: KMeansSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.clustering import ml.combust.mleap.core.clustering.KMeansModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class KMeansSpec extends FunSpec { val v1 = Vectors.dense(Array(1.0, 2.0, 55.0)) val v2 = Vectors.dense(Array(11.0, 200.0, 55.0)) val v3 = Vectors.dense(Array(100.0, 22.0, 55.0)) val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(DenseTensor(Array(2.0, 5.0, 34.0), Seq(3))), Row(DenseTensor(Array(20.0, 230.0, 34.0), Seq(3))), Row(DenseTensor(Array(111.0, 20.0, 56.0), Seq(3)))) val frame = DefaultLeapFrame(schema, dataset) val km = KMeans(shape = NodeShape.basicCluster(), model = KMeansModel(Seq(v1, v2, v3), 3)) describe("#transform") { it("uses the k-means to find closest cluster") { val frame2 = km.transform(frame).get val data = frame2.dataset.toArray assert(data(0).getInt(1) == 0) assert(data(1).getInt(1) == 1) assert(data(2).getInt(1) == 2) } describe("with invalid features column") { val km2 = km.copy(shape = NodeShape.basicCluster(featuresCol = "bad_features")) it("returns a Failure") { assert(km2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(km.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 79
Source File: BisectingKMeansSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.clustering import ml.combust.mleap.core.clustering.{BisectingKMeansModel, ClusteringTreeNode} import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm import org.scalatest.FunSpec class BisectingKMeansSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = BisectingKMeans(shape = NodeShape.basicCluster(), model = new BisectingKMeansModel(ClusteringTreeNode(23, VectorWithNorm(Vectors.dense(1, 2, 3)) , Array()))) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 80
Source File: PipelineSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer import ml.combust.mleap.core.feature.VectorAssemblerModel import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.transformer.feature.VectorAssembler import ml.combust.mleap.runtime.transformer.regression.LinearRegression import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PipelineSpec extends FunSpec { describe("input/output schema") { it("has inputs or outputs of its transformers") { val vectorAssembler = VectorAssembler( shape = NodeShape().withInput("input0", "feature1"). withInput("input1", "feature2"). withInput("input2", "feature3"). withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape(), ScalarShape()))) val regression = LinearRegression(shape = NodeShape().withInput("features", "features"). withOutput("prediction", "prediction"), model = LinearRegressionModel(Vectors.dense(1.0, 2.0, 3.0), 4.0)) val pipeline = Pipeline(uid = "root_pipeline", shape = NodeShape(), PipelineModel(Seq( Pipeline(uid = "child_pipeline_1", shape = NodeShape(), PipelineModel(Seq(vectorAssembler))), Pipeline(uid = "child_pipeline_2", shape = NodeShape(), PipelineModel(Seq(regression)))))) assert(pipeline.schema.fields == Seq( StructField("feature1", ScalarType.Double), StructField("feature2", ScalarType.Double), StructField("feature3", ScalarType.Double), StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.inputSchema.fields == Seq( StructField("feature1", ScalarType.Double), StructField("feature2", ScalarType.Double), StructField("feature3", ScalarType.Double))) assert(pipeline.outputSchema.fields == Seq( StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.strictOutputSchema.fields == Seq( StructField("prediction", ScalarType.Double.nonNullable))) assert(pipeline.intermediateSchema.fields == Seq( StructField("features", TensorType.Double(3)))) } } }
Example 81
Source File: LogisticRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, LogisticRegressionModel} import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class LogisticRegressionSpec extends FunSpec { val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(0.5, -0.5, 1.0)))) val frame = DefaultLeapFrame(schema, dataset) val logisticRegression = LogisticRegression(shape = NodeShape.probabilisticClassifier(), model = LogisticRegressionModel(BinaryLogisticRegressionModel(coefficients = Vectors.dense(Array(1.0, 1.0, 2.0)), intercept = -0.2, threshold = 0.75))) describe("LogisticRegression") { describe("#transform") { it("executes the logistic regression model and outputs the prediction") { val frame2 = logisticRegression.transform(frame).get val prediction = frame2.dataset(0).getDouble(1) assert(prediction == 1.0) } describe("with probability column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability"))) it("executes the logistic regression model and outputs the prediction/probability") { val frame2 = logisticRegression2.transform(frame).get val data = frame2.dataset.toArray val probability = data(0).getTensor[Double](1)(1) val prediction = data(0).getDouble(2) assert(prediction == 1.0) assert(probability > 0.84) assert(probability < 0.86) } } describe("with invalid features column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(featuresCol = "bad_features")) it("returns a Failure") { assert(logisticRegression2.transform(frame).isFailure) } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(logisticRegression.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with probability column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with rawPrediction column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with both probability and rawPrediction column") { val logisticRegression2 = logisticRegression.copy(shape = NodeShape.probabilisticClassifier( rawPredictionCol = Some("rp"), probabilityCol = Some("p"))) assert(logisticRegression2.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("p", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } } }
Example 82
Source File: SupportVectorMachineSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.SupportVectorMachineModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class SupportVectorMachineSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with probability column") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(probabilityCol = Some("probability")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with rawPrediction column") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType(BasicType.Double, Seq(2))), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with both probability and rawPrediction columns") { val transformer = SupportVectorMachine(shape = NodeShape.probabilisticClassifier( rawPredictionCol = Some("rp"), probabilityCol = Some("probability")), model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 83
Source File: OneVsRestSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.{BinaryLogisticRegressionModel, OneVsRestModel} import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class OneVsRestSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs without probability column") { val transformer = OneVsRest(shape = NodeShape.basicClassifier(), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prediction", ScalarType.Double))) } it("has the correct inputs and outputs with probability column") { val transformer = OneVsRest(shape = NodeShape().withInput("features", "features"). withOutput("probability", "prob"). withOutput("prediction", "prediction"), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prob", ScalarType.Double), StructField("prediction", ScalarType.Double))) } it("has the correct inputs and outputs with raw prediction column") { val transformer = OneVsRest(shape = NodeShape().withInput("features", "features"). withOutput("probability", "prob"). withOutput("raw_prediction", "raw"). withOutput("prediction", "prediction"), model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(2)), StructField("prob", ScalarType.Double), StructField("raw", TensorType.Double(1)), StructField("prediction", ScalarType.Double))) } } }
Example 84
Source File: MultiLayerPerceptronClassifierSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import ml.combust.mleap.core.classification.MultiLayerPerceptronClassifierModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MultiLayerPerceptronClassifierSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = MultiLayerPerceptronClassifier(shape = NodeShape.basicClassifier(), model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1)))) assert(transformer.schema.fields == Seq(StructField("features", TensorType(BasicType.Double, Seq(3))), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 85
Source File: LinearSVCSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.classification import org.scalatest.FunSpec import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import ml.combust.mleap.core.classification.LinearSVCModel class LinearSVCSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = LinearSVC(shape = NodeShape.basicClassifier(), model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with prediction column") { val transformer = LinearSVC(shape = NodeShape.probabilisticClassifier(rawPredictionCol = Some("rp"),predictionCol = "pred"), model = new LinearSVCModel(Vectors.dense(1, 2, 3), 2)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("rp", TensorType.Double(2)), StructField("pred", ScalarType.Double.nonNullable))) } } }
Example 86
Source File: GeneralizedLinearRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.GeneralizedLinearRegressionModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GeneralizedLinearRegressionSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs with prediction column only") { val transformer = GeneralizedLinearRegression(shape = NodeShape.regression(), model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } it("has the correct inputs and outputs with prediction column as well as linkPrediction column") { val transformer = GeneralizedLinearRegression(shape = NodeShape.regression(). withOutput("link_prediction", "lp"), model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable), StructField("lp", ScalarType.Double.nonNullable))) } } }
Example 87
Source File: LinearRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.core.types._ import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class LinearRegressionSpec extends FunSpec { val schema = StructType(Seq(StructField("features", TensorType(BasicType.Double)))).get val dataset = Seq(Row(Tensor.denseVector(Array(20.0, 10.0, 5.0)))) val frame = DefaultLeapFrame(schema, dataset) val linearRegression = LinearRegression(shape = NodeShape.regression(), model = LinearRegressionModel(coefficients = Vectors.dense(Array(1.0, 0.5, 5.0)), intercept = 73.0)) describe("LinearRegression") { describe("#transform") { it("executes the linear regression model and outputs a prediction") { val frame2 = linearRegression.transform(frame).get val prediction = frame2.dataset(0).getDouble(1) assert(prediction == 123.0) } describe("with invalid features input") { it("returns a Failure") { val frame2 = linearRegression.copy(shape = NodeShape.regression(featuresCol = "bad_features")).transform(frame) assert(frame2.isFailure) } } } } describe("input/output schema") { it("has the correct inputs and outputs") { assert(linearRegression.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 88
Source File: AFTSurvivalRegressionSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.transformer.regression import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class AFTSurvivalRegressionSpec extends FunSpec { describe("input/output schema") { it("has the correct inputs and outputs") { val transformer = AFTSurvivalRegression(shape = NodeShape.regression() .withOutput("quantiles", "quantiles"), model = new AFTSurvivalRegressionModel(Vectors.dense(1, 3, 4), 23, Array(1, 2, 3, 4, 5), 5)) assert(transformer.schema.fields == Seq(StructField("features", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable), StructField("quantiles", TensorType.Double(5)))) } } }
Example 89
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
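A minimal usage sketch of the model above, relying only on the constructor shown (indices, optional namedIndices, inputSize); the concrete indices and input values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.VectorSlicerModel

val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 3)
// dense input: the positions in allIndices are copied in order
val sliced = slicer(Vectors.dense(1.0, 2.0, 3.0))  // Vectors.dense(1.0, 3.0)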
Example 90
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
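A short usage sketch of this model; the values mirror the ElementWiseProductSpec earlier in this listing, so the expected output can be checked by hand:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ElementwiseProductModel

val ewp = ElementwiseProductModel(Vectors.dense(0.5, 1.0, 0.5))
// each component is multiplied by the corresponding entry of scalingVec
val scaled = ewp(Vectors.dense(0.0, 20.0, 20.0))  // Vectors.dense(0.0, 20.0, 10.0)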
Example 91
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
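A short usage sketch mirroring the MaxAbsScalerSpec earlier in this listing: each component is divided by the corresponding maximum absolute value and clamped to [-1, 1]:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.MaxAbsScalerModel

val scaler = MaxAbsScalerModel(Vectors.dense(10.0, 20.0, 40.0))
val scaled = scaler(Vectors.dense(0.0, 20.0, 20.0))  // Vectors.dense(0.0, 1.0, 0.5)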
Example 92
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala") case class ChiSqSelectorModel(filterIndices: Seq[Int], inputSize: Int) extends Model { def apply(features: Vector): Vector = { features match { case SparseVector(size, indices, values) => val newSize = filterIndices.length val newValues = mutable.ArrayBuilder.make[Double] val newIndices = mutable.ArrayBuilder.make[Int] var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 while (i < indices.length && j < filterIndices.length) { indicesIdx = indices(i) filterIndicesIdx = filterIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) j += 1 i += 1 } else { if (indicesIdx > filterIndicesIdx) { j += 1 } else { i += 1 } } } // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size) Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(values) => val values = features.toArray Vectors.dense(filterIndices.map(i => values(i)).toArray) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get }
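A minimal sketch of applying the selector above to a dense vector; the filterIndices and input values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ChiSqSelectorModel

val selector = ChiSqSelectorModel(filterIndices = Seq(0, 2), inputSize = 3)
// the dense branch keeps only the positions listed in filterIndices
val selected = selector(Vectors.dense(5.0, 6.0, 7.0))  // Vectors.dense(5.0, 7.0)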
Example 93
Source File: FeatureHasherModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.core.util.Platform import ml.combust.mleap.core.util.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable object FeatureHasherModel { val seed = HashingTermFrequencyModel.seed def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes2(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("FeatureHasher with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala") case class FeatureHasherModel(numFeatures: Int = 1 << 18, categoricalCols: Seq[String], inputNames: Seq[String], inputTypes: Seq[DataType] ) extends Model { assert(inputTypes.forall(dt ⇒ dt.shape.isScalar), "must provide scalar shapes as inputs") val schema = inputNames.zip(inputTypes) val realFields = schema.filter(t ⇒ t._2.base match { case BasicType.Short if !categoricalCols.contains(t._1) ⇒ true case BasicType.Double if !categoricalCols.contains(t._1) ⇒ true case BasicType.Float if !categoricalCols.contains(t._1) ⇒ true case BasicType.Int if !categoricalCols.contains(t._1) ⇒ true case BasicType.Long if !categoricalCols.contains(t._1) ⇒ true case _ ⇒ false }).toMap.keys.toSeq def getDouble(x: Any): Double = { x match { case n: java.lang.Number ⇒ n.doubleValue() // will throw ClassCastException if it cannot be cast, as would row.getDouble case other ⇒ other.asInstanceOf[Double] } } def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } def apply(things: Seq[Any]): Vector = { val map = new mutable.OpenHashMap[Int, Double]() schema.zip(things).foreach { case (sc, item) ⇒ if (item != null) { val (rawIdx, value) = if (realFields.contains(sc._1)) { // numeric values are kept as is, with vector index based on hash of "column_name" val value = getDouble(item) val hash = FeatureHasherModel.murmur3(sc._1) (hash, value) } else { // string, boolean and numeric values that are in catCols are treated as categorical, // with an indicator value of 1.0 and vector index based on hash of "column_name=value" val value = item.toString val fieldName = s"${sc._1}=$value" val hash = FeatureHasherModel.murmur3(fieldName) (hash, 1.0) } val idx = nonNegativeMod(rawIdx, numFeatures) map.+=((idx, map.getOrElse(idx, 0.0) + value)) } } Vectors.sparse(numFeatures, map.toSeq) } override def inputSchema: StructType = { val inputFields = inputTypes.zipWithIndex.map { case (dtype, i) => StructField(s"input$i", dtype) } StructType(inputFields).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
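A minimal usage sketch of the hasher above. It uses only types that appear elsewhere in these examples (ScalarType.Double); the column names, numFeatures, and values are made up for illustration:

import ml.combust.mleap.core.feature.FeatureHasherModel
import ml.combust.mleap.core.types.ScalarType

// "category_code" is listed as categorical, so it is hashed as "category_code=3.0" with value 1.0;
// "amount" keeps its numeric value at the index hashed from the column name
val hasher = FeatureHasherModel(
  numFeatures = 1 << 8,
  categoricalCols = Seq("category_code"),
  inputNames = Seq("amount", "category_code"),
  inputTypes = Seq(ScalarType.Double, ScalarType.Double))

val hashed = hasher(Seq(12.5, 3.0))  // sparse vector of size 256 with at most two non-zero entries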
Example 94
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.mleap.VectorUtil._ import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} def apply(vector: Vector): Vector = { val scale = maxValue - minValue // 0 in sparse vector will probably be rescaled to non-zero val values = vector.copy.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values(i) = raw * scale + minValue } i += 1 } Vectors.dense(values) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get }
Example 95
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
Example 96
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(features: Vector): Vector = { val norm = Vectors.norm(features, pNorm) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. features match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. features } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
Example 97
Source File: HashingTermFrequencyModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import ml.combust.mleap.core.util.Murmur3_x86_32._ import ml.combust.mleap.core.util.Platform import scala.collection.mutable object HashingTermFrequencyModel { val seed = 42 def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/core/src/main/scala/org/apache/spark/util/Utils.scala") def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } override def inputSchema: StructType = { StructType(StructField("input" -> ListType(BasicType.String))).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
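The companion object above exposes the murmur3 hash used to map terms to buckets. A small sketch of how a term lands in a feature index; the 2^18 feature count mirrors the usual default and is an assumption here:

import ml.combust.mleap.core.feature.HashingTermFrequencyModel

val numFeatures = 1 << 18
val hash = HashingTermFrequencyModel.murmur3("spark")   // raw 32-bit hash of the term

// same arithmetic as nonNegativeMod above: fold the hash into [0, numFeatures)
val rawMod = hash % numFeatures
val bucket = rawMod + (if (rawMod < 0) numFeatures else 0)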
Example 98
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(vector: Vector): Vector = { if (mean.nonEmpty) { val shift = mean.get.toArray val values = vector match { // specially handle DenseVector because its toArray does not clone already case d: DenseVector => d.values.clone() case v: SparseVector => v.toArray } val size = values.length if (std.nonEmpty) { val stdDev = std.get var i = 0 while (i < size) { values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0 i += 1 } } else { var i = 0 while (i < size) { values(i) -= shift(i) i += 1 } } Vectors.dense(values) } else if (std.nonEmpty) { val stdDev = std.get vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while(i < size) { values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0) i += 1 } Vectors.dense(values) case SparseVector(size, indices, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0) i += 1 } Vectors.sparse(size, indices, values) } } else { throw new IllegalStateException("need to scale with mean and/or with stdev") } } override def inputSchema: StructType = { StructType("input" -> TensorType.Double(size)).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get }
Example 99
Source File: CountVectorizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala") case class CountVectorizerModel(vocabulary: Array[String], binary: Boolean, minTf: Double) extends Model { val dict: Map[String, Int] = vocabulary.zipWithIndex.toMap def apply(document: Seq[String]): Vector = { val termCounts = mutable.Map[Int, Double]() var tokenCount = 0L document.foreach { term => dict.get(term) match { case Some(index) => termCounts += (index -> termCounts.get(index).map(_ + 1).getOrElse(1)) case None => // ignore terms not found in dictionary } tokenCount += 1 } val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf val effectiveCounts = if(binary) { termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq } else { termCounts.filter(_._2 >= effectiveMinTF).toSeq } Vectors.sparse(dict.size, effectiveCounts) } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(dict.size)).get }
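A short usage sketch of the count vectorizer above; the vocabulary and settings are made up for illustration:

import ml.combust.mleap.core.feature.CountVectorizerModel

val cv = CountVectorizerModel(
  vocabulary = Array("apple", "banana", "cherry"),
  binary = false,
  minTf = 1.0)

// "apple" appears twice, "banana" once, "kiwi" is not in the vocabulary
cv(Seq("apple", "banana", "apple", "kiwi"))   // -> sparse (3, [0, 1], [2.0, 1.0])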
Example 100
Source File: OneHotEncoderModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} def apply(labels: Array[Double]): Array[Vector] = { if (labels.length != categorySizes.length) { throw new IllegalArgumentException(s"invalid input size: ${labels.length}, must be ${categorySizes.length}") } labels.zipWithIndex.map { case (label: Double, colIdx: Int) ⇒ encoder(label, colIdx) } } private def encoder(label: Double, colIdx: Int): Vector = { val labelInt = label.toInt if(label != labelInt) { throw new IllegalArgumentException(s"invalid label: $label, must be integer") } val origCategorySize = categorySizes(colIdx) val idx = if (label >= 0 && label < origCategorySize) { label } else { if (keepInvalid) { origCategorySize } else { if (label < 0) { throw new IllegalArgumentException(s"Negative value: $label. Input can't be negative. To handle invalid values, set Param handleInvalid to ${HandleInvalid.Keep}") } else { throw new IllegalArgumentException(s"Unseen value: $label. To handle unseen values, set Param handleInvalid to ${HandleInvalid.Keep}") } } } val size = configedCategorySizes(colIdx) if (idx < size) { Vectors.sparse(size, Array(idx.toInt), oneValue) } else { Vectors.sparse(size, emptyIndices, emptyValues) } } override def inputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (_, i) => StructField(s"input$i", ScalarType.Double.setNullable(false)) } StructType(f).get } override def outputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (size, i) => StructField(s"output$i", TensorType.Double(size)) } StructType(f).get } }
Example 101
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
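A quick sketch of applying the IDF weights element-wise; the idf vector here is arbitrary:

import ml.combust.mleap.core.feature.IDFModel
import org.apache.spark.ml.linalg.Vectors

val idf = IDFModel(Vectors.dense(1.0, 2.0, 0.5))

idf(Vectors.dense(3.0, 4.0, 8.0))               // -> [3.0, 8.0, 4.0]
idf(Vectors.sparse(3, Array(1), Array(10.0)))   // -> sparse [0.0, 20.0, 0.0]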
Example 102
Source File: MinHashLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.{Vector, Vectors} object MinHashLSHModel { val HASH_PRIME = 2038074743 } case class MinHashLSHModel(randomCoefficients: Seq[(Int, Int)], inputSize: Int) extends LSHModel{ def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { require(features.numNonzeros > 0, "Must have at least 1 non zero entry.") val elemsList = features.toSparse.indices.toList val hashValues = randomCoefficients.map { case (a, b) => elemsList.map { elem: Int => ((1 + elem) * a + b) % MinHashLSHModel.HASH_PRIME }.min.toDouble } // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet val intersectionSize = xSet.intersect(ySet).size.toDouble val unionSize = xSet.size + ySet.size - intersectionSize assert(unionSize > 0, "The union of two input sets must have at least 1 elements") 1 - intersectionSize / unionSize } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. // TODO: This hashDistance function requires more discussion in SPARK-18454 x.zip(y).map(vectorPair => vectorPair._1.toArray.zip(vectorPair._2.toArray).count(pair => pair._1 != pair._2) ).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
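An illustrative call of the MinHash model above; the random coefficients are hand-picked rather than sampled, purely to show the shapes involved:

import ml.combust.mleap.core.feature.MinHashLSHModel
import org.apache.spark.ml.linalg.Vectors

val lsh = MinHashLSHModel(randomCoefficients = Seq((1, 2), (3, 4)), inputSize = 5)

// hash of the index set {0, 3}: one min-hash value per coefficient pair, shape (2, 1)
val hashes = lsh.predict(Vectors.sparse(5, Array(0, 3), Array(1.0, 1.0)))

// Jaccard distance between the index sets of two sparse vectors
val d = lsh.keyDistance(
  Vectors.sparse(5, Array(0, 3), Array(1.0, 1.0)),
  Vectors.sparse(5, Array(0, 1), Array(1.0, 1.0)))   // 1 - 1/3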
Example 103
Source File: VectorAssemblerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.{DenseTensor, SparseTensor} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def apply(vv: Seq[Any]): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case tensor: DenseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[DenseTensor[Double]] dTensor.values.indices.foreach { i => val v = dTensor.values(i) if(v != 0.0) { indices += cur + i values += v } } cur += dTensor.values.length case tensor: SparseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[SparseTensor[Double]] var idx = 0 dTensor.indices.map(_.head).foreach { i => val v = dTensor.values(idx) if(v != 0.0) { indices += cur + i values += v } idx += 1 } cur += dTensor.dimensions.head case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case v: java.math.BigDecimal => val d = v.doubleValue() if (d != 0.0) { indices += cur values += d } cur += 1 case Some(v: Double) => if(v != 0.0) { indices += cur values += v } cur += 1 } Vectors.sparse(cur, indices.result(), values.result()).compressed } override def inputSchema: StructType = { val inputFields = inputShapes.zipWithIndex.map { case (shape, i) => StructField(s"input$i", DataType(BasicType.Double, shape)) } StructType(inputFields).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(outputSize)).get }
Example 104
Source File: BinarizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala") case class BinarizerModel(threshold: Double, inputShape: DataShape) extends Model { assert(inputShape.isScalar || inputShape.isTensor, "Must provide a tensor or scalar shape") def apply(value: Double): Double = { if (value > threshold) 1.0 else 0.0 } def apply(value: Vector): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] value.foreachActive { (index, value) => if (value > threshold) { indices += index values += 1.0 } } Vectors.sparse(value.size, indices.result(), values.result()).compressed } override def inputSchema: StructType = { StructType("input" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } override def outputSchema: StructType = { StructType("output" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } }
Example 105
Source File: InteractionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.Tensor import ml.combust.mleap.core.util.VectorConverters._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def foreachNonzeroOutput(v: Any, f: (Int, Double) => Unit): Unit = { val value = v match { case tensor: Tensor[_] => tensor.asInstanceOf[Tensor[Double]]: Vector case _ => v } value match { case d: Double => assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") val numOutputCols = numFeatures.head if (numOutputCols > 1) { assert( d >= 0.0 && d == d.toInt && d < numOutputCols, s"Values from column must be indices, but got $d.") f(d.toInt, 1.0) } else { f(0, d) } case vec: Vector => assert(numFeatures.length == vec.size, s"Vector column size was ${vec.size}, expected ${numFeatures.length}") vec.foreachActive { (i, v) => val numOutputCols = numFeatures(i) if (numOutputCols > 1) { assert( v >= 0.0 && v == v.toInt && v < numOutputCols, s"Values from column must be indices, but got $v.") f(outputOffsets(i) + v.toInt, 1.0) } else { f(outputOffsets(i), v) } } case null => throw new IllegalArgumentException("Values to interact cannot be null.") case o => throw new IllegalArgumentException(s"$o of type ${o.getClass.getName} is not supported.") } } }
Example 106
Source File: DCTModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} case class DCTModel(inverse: Boolean, inputSize: Int) extends Model { def apply(features: Vector): Vector = { val result = features.toArray.clone() val jTransformer = new DoubleDCT_1D(result.length) if (inverse) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
Example 107
Source File: BucketedRandomProjectionLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{Vector, Vectors} case class BucketedRandomProjectionLSHModel(randomUnitVectors: Seq[Vector], bucketLength: Double, inputSize: Int) extends LSHModel { def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { val hashValues: Seq[Double] = randomUnitVectors.map({ randUnitVector => Math.floor(BLAS.dot(features, randUnitVector) / bucketLength) }) // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { Math.sqrt(Vectors.sqdist(x, y)) } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
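A small sketch of bucketing a vector with the model above; the unit vectors and bucket length are illustrative:

import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel
import org.apache.spark.ml.linalg.Vectors

val lsh = BucketedRandomProjectionLSHModel(
  randomUnitVectors = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0)),
  bucketLength = 2.0,
  inputSize = 2)

lsh.predict(Vectors.dense(3.0, 5.0))   // floor(3/2), floor(5/2) -> tensor [[1.0], [2.0]]
lsh.keyDistance(Vectors.dense(0.0, 0.0), Vectors.dense(3.0, 4.0))   // Euclidean: 5.0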
Example 108
Source File: PolynomialFeaturesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.sklearn import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} case class PolynomialFeaturesModel(combinations: String) extends Model { private val pattern = "x(\\d+)(?:[\\^](\\d+))?".r private val polynomials = extractPolynomials(combinations) private val indices = polynomials.flatMap(poly => poly.terms).map(term => term.index).toSet private def extractPolynomials(combinations: String): List[Polynomial] = { combinations.split(",") .map(combination => extractPolynomial(combination)) .toList } private def extractPolynomial(polynomial: String): Polynomial = { Polynomial(pattern.findAllIn(polynomial).matchData .map(matcher => {Term(matcher.group(1).toInt, Option(matcher.group(2)).getOrElse("1").toInt)}) .toList ) } def getPolyValue(poly: Polynomial, features: Vector): Double = { poly.terms.map(term => scala.math.pow(features(term.index), term.power)).product } def apply(features: Vector): Vector = { Vectors.dense(polynomials.map(poly => getPolyValue(poly, features)).toArray) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(indices.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(polynomials.size)).get } case class Term(index: Int, power: Int) case class Polynomial(terms: List[Term])
Example 109
Source File: GaussianMixtureModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.Utils._ import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian object GaussianMixtureModel { @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala") def computeProbabilities(features: DenseVector, dists: Array[MultivariateGaussian], weights: Array[Double]): Array[Double] = { val p = weights.zip(dists).map { case (weight, dist) => EPSILON + weight * dist.pdf(features) } val pSum = p.sum var i = 0 while (i < weights.length) { p(i) /= pSum i += 1 } p } } case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian], weights: Array[Double]) extends Model { val numClusters = gaussians.length val numFeatures: Int = weights.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { predictionFromProbability(predictProbability(features)) } def predictWithProbability(features: Vector): (Int, Double) = { val probability = predictProbability(features) val index = probability.argmax (index, probability(index)) } def predictionFromProbability(probabilities: Vector): Int = { probabilities.argmax } def predictProbability(features: Vector): Vector = { val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights) Vectors.dense(probs) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable, "probability" -> TensorType.Double(numClusters)).get }
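A hedged sketch of scoring a point against two components; the mixture parameters are made up, and MultivariateGaussian and Matrices are assumed to come from Spark's ml.stat.distribution and ml.linalg packages as imported in the model above:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

val identity2 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0))
val gmm = GaussianMixtureModel(
  gaussians = Array(
    new MultivariateGaussian(Vectors.dense(0.0, 0.0), identity2),
    new MultivariateGaussian(Vectors.dense(5.0, 5.0), identity2)),
  weights = Array(0.5, 0.5))

gmm.predict(Vectors.dense(4.8, 5.2))                  // closest to the second component -> 1
gmm.predictWithProbability(Vectors.dense(0.1, -0.3))  // (0, probability close to 1.0)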
Example 110
Source File: Node.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala") final case class InternalNode(left: Node, right: Node, split: Split) extends Node { override def predictImpl(features: Vector): LeafNode = { if(split.shouldGoLeft(features)) { left.predictImpl(features) } else { right.predictImpl(features) } } }
Example 111
Source File: OneVsRestModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} def predictAll(features: Vector): (Double, Vector, Double) = { val predArray = Array.fill[Double](classifiers.length)(0.0) val (prediction, probability) = classifiers.zipWithIndex.map { case (c:ProbabilisticClassificationModel, i) => val raw = c.predictRaw(features) predArray(i) = raw(1) val probability = c.rawToProbabilityInPlace(raw)(1) (i.toDouble, probability) case (c,i) => val raw = c.predict(features) predArray(i) = raw (i.toDouble,raw) }.maxBy(_._2) (probability, Vectors.dense(predArray), prediction) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("probability" -> ScalarType.Double, "raw_prediction" -> TensorType.Double(classifiers.length), "prediction" -> ScalarType.Double).get }
Example 112
Source File: ClassificationModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} val numClasses: Int val numFeatures: Int def thresholds: Option[Array[Double]] = None def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features)) def predictWithProbability(features: Vector): (Double, Double) = { val probabilities = predictProbabilities(features) val index = probabilityToPredictionIndex(probabilities) (index.toDouble, probabilities(index)) } def predictProbabilities(features: Vector): Vector = { val raw = predictRaw(features) rawToProbabilityInPlace(raw) raw } def rawToProbability(raw: Vector): Vector = { val probabilities = raw.copy rawToProbabilityInPlace(probabilities) } def rawToPrediction(raw: Vector): Double = { thresholds match { case Some(t) => probabilityToPrediction(rawToProbability(raw)) case None => raw.argmax } } def probabilityToPrediction(probability: Vector): Double = { probabilityToPredictionIndex(probability).toDouble } def probabilityToPredictionIndex(probability: Vector): Int = { thresholds match { case Some(ts) => val scaledProbability: Array[Double] = probability.toArray.zip(ts).map { case (p, t) => if (t == 0.0) Double.PositiveInfinity else p / t } Vectors.dense(scaledProbability).argmax case None => probability.argmax } } def rawToProbabilityInPlace(raw: Vector): Vector override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "probability" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
Example 113
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.regression.DecisionTreeRegressionModel import ml.combust.mleap.core.tree.TreeEnsemble import ml.combust.mleap.core.tree.loss.LogLoss import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def margin(features: Vector): Double = { val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray) BLAS.dot(treePredictions, treeWeightsVector) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => dv.values(0) = loss.computeProbability(dv.values(0)) dv.values(1) = 1.0 - dv.values(0) dv case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector") } } override def predictRaw(features: Vector): Vector = { val prediction: Double = margin(features) Vectors.dense(Array(-prediction, prediction)) } }
Example 114
Source File: MultiLayerPerceptronClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.ann.FeedForwardTopology import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} def decodeLabel(output: Vector): Double = { output.argmax.toDouble } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala") case class MultiLayerPerceptronClassifierModel(layers: Seq[Int], weights: Vector, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel { val numFeatures: Int = layers.head private val mlpModel = FeedForwardTopology .multiLayerPerceptron(layers.toArray) .model(weights) override def predictRaw(features: Vector): Vector = { mlpModel.predictRaw(features) } override def rawToProbabilityInPlace(raw: Vector): Vector = { mlpModel.raw2ProbabilityInPlace(raw) } override val numClasses: Int = layers.last }
Example 115
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS case class SupportVectorMachineModel(coefficients: Vector, intercept: Double, override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds)) extends ProbabilisticClassificationModel with Serializable { private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept override val numClasses: Int = 2 override val numFeatures: Int = coefficients.size override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(Array(-m, m)) } override def rawToProbabilityInPlace(raw: Vector): Vector = raw }
Example 116
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.TreeEnsemble import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel], override val treeWeights: Seq[Double], numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with TreeEnsemble with Serializable { override def predictRaw(raw: Vector): Vector = { val votes = Array.fill[Double](numClasses)(0.0) trees.view.foreach { tree => val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray val total = classCounts.sum if (total != 0) { var i = 0 while (i < numClasses) { votes(i) += classCounts(i) / total i += 1 } } } Vectors.dense(votes) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 117
Source File: LinearSVCModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS object LinearSVCModel { val defaultThreshold = 0.0 } @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala") case class LinearSVCModel(coefficients: Vector, intercept: Double, threshold: Double = LinearSVCModel.defaultThreshold ) extends ClassificationModel { val numClasses: Int = 2 val numFeatures: Int = coefficients.size private val margin: Vector => Double = features => { BLAS.dot(features, coefficients) + intercept } override def predict(features: Vector): Double = { if (margin(features) > threshold) 1.0 else 0.0 } override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(-m, m) } def rawToPrediction(rawPrediction: Vector): Double = { if (rawPrediction(1) > threshold) 1.0 else 0.0 } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
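A minimal sketch of the linear SVC decision rule above with made-up coefficients:

import ml.combust.mleap.core.classification.LinearSVCModel
import org.apache.spark.ml.linalg.Vectors

val svc = LinearSVCModel(coefficients = Vectors.dense(1.0, -1.0), intercept = 0.5)

svc.predict(Vectors.dense(2.0, 1.0))      // margin = 2.0 - 1.0 + 0.5 = 1.5 > 0 -> 1.0
svc.predictRaw(Vectors.dense(2.0, 1.0))   // -> [-1.5, 1.5]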
Example 118
Source File: VectorConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.util import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors} import scala.language.implicitConversions trait VectorConverters { implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match { case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size)) case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)), values = vector.values, dimensions = Seq(vector.size)) } implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match { case tensor: DenseTensor[_] => Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => Vectors.sparse(tensor.dimensions.product, tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match { case matrix: DenseMatrix => DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols)) case matrix: SparseMatrix => val indices = matrix.rowIndices.zip(matrix.colPtrs).map { case (r, c) => Seq(r, c) }.toSeq SparseTensor(indices = indices, values = matrix.values, dimensions = Seq(matrix.numRows, matrix.numCols)) } implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match { case tensor: DenseTensor[_] => Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1), cols.toArray, rows.toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match { case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size)) case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size)) } implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match { case tensor: DenseTensor[_] => new BDV(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => new BSV(tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]], tensor.dimensions.product) } } object VectorConverters extends VectorConverters
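The implicit conversions above kick in on type ascription; a small sketch of round-tripping a Spark vector through an MLeap tensor (values are arbitrary):

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Vector, Vectors}

val asTensor: Tensor[Double] = Vectors.dense(1.0, 2.0, 3.0)   // spark -> mleap
val backToVector: Vector = asTensor                           // mleap -> spark

val sparseTensor: Tensor[Double] = Vectors.sparse(4, Array(1, 3), Array(9.0, 7.0))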
Example 119
Source File: LinalgUtils.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.linalg import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.{BLAS, VectorWithNorm} val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) if (precisionBound1 < precision) { sqDist = sumSquaredNorm - 2.0 * BLAS.dot(v1, v2) } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { val dotValue = BLAS.dot(v1, v2) sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { sqDist = Vectors.sqdist(v1, v2) } } else { sqDist = Vectors.sqdist(v1, v2) } sqDist } def log1pExp(x: Double): Double = { if (x > 0) { x + math.log1p(math.exp(-x)) } else { math.log1p(math.exp(x)) } } }
Example 120
Source File: AFTSurvivalRegressionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala") case class AFTSurvivalRegressionModel(coefficients: Vector, intercept: Double, quantileProbabilities: Array[Double], scale: Double) extends Model { def apply(features: Vector): Double = predict(features) def predictWithQuantiles(features: Vector): (Double, Vector) = { val quantiles = predictQuantiles(features) (predict(features), quantiles) } def predictQuantiles(features: Vector): Vector = { // scale parameter for the Weibull distribution of lifetime val lambda = math.exp(BLAS.dot(coefficients, features) + intercept) // shape parameter for the Weibull distribution of lifetime val k = 1 / scale val quantiles = quantileProbabilities.map { q => lambda * math.exp(math.log(-math.log(1 - q)) / k) } Vectors.dense(quantiles) } def predict(features: Vector): Double = { math.exp(BLAS.dot(coefficients, features) + intercept) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(coefficients.size)).get override def outputSchema: StructType = { StructType("prediction" -> ScalarType.Double.nonNullable, "quantiles" -> TensorType.Double(quantileProbabilities.length)).get } }
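A usage sketch of the AFT model above; coefficients, intercept, scale, and quantile probabilities are invented for illustration:

import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel
import org.apache.spark.ml.linalg.Vectors

val aft = AFTSurvivalRegressionModel(
  coefficients = Vectors.dense(0.1, -0.2),
  intercept = 1.0,
  quantileProbabilities = Array(0.5, 0.9),
  scale = 1.0)

aft(Vectors.dense(2.0, 1.0))                       // exp(0.1*2 - 0.2*1 + 1.0) ≈ 2.72
aft.predictWithQuantiles(Vectors.dense(2.0, 1.0))  // (prediction, median and 90th-percentile lifetimes)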
Example 121
Source File: MinMaxScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import org.apache.spark.ml.util.TestingUtils._ class MinMaxScalerModelSpec extends FunSpec{ describe("min max scaler model") { val scaler = MinMaxScalerModel(Vectors.dense(Array(1.0, 0.0, 5.0, 10.0)), Vectors.dense(Array(15.0, 10.0, 15.0, 20.0))) it("scales vector based on min/max range"){ val inputVector = Vectors.dense(15.0, 5.0, 5.0, 19.0) val expectedVector = Vectors.dense(1.0, 0.5, 0.0, 0.9) assert(scaler(inputVector) ~= expectedVector relTol 1E-9) } it("has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4)))) } it("has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4)))) } } }
Example 122
Source File: ElementwiseProductModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class ElementwiseProductModelSpec extends FunSpec{ describe("elementwise product model") { val scaler = ElementwiseProductModel(Vectors.dense(Array(0.5, 1.0, 1.0))) it("multiplies each input vector by a provided weight vector"){ val inputArray = Array(15.0, 10.0, 10.0) val expectedVector = Array(7.5, 10.0, 10.0) assert(scaler(Vectors.dense(inputArray)).toArray.sameElements(expectedVector)) } it("has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } }
Example 123
Source File: PcaModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.{DenseMatrix, Matrices, Vectors} import org.scalatest.FunSpec class PcaModelSpec extends FunSpec { describe("pca model") { val pc = new DenseMatrix(3, 2, Array[Double](1, -1, 2, 0, -3, 1)) val pca = PcaModel(pc) it("uses the principal components matrix to transform a vector to a lower-dimensional vector") { val input = Vectors.dense(Array[Double](2, 1, 0)) assert(pca(input).toArray sameElements Array[Double](1, -3)) } it("has the right input schema") { assert(pca.inputSchema.fields == Seq(StructField("input", TensorType.Double()))) } it("has the right output schema") { assert(pca.outputSchema.fields == Seq(StructField("output", TensorType.Double()))) } } }
Example 124
Source File: MaxAbsScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec import org.apache.spark.ml.util.TestingUtils._ class MaxAbsScalerModelSpec extends FunSpec { describe("Max Abs Scaler Model") { val scaler = MaxAbsScalerModel(Vectors.dense(Array(20.0, 10.0, 10.0, 20.0))) it("Scales the vector based on absolute max value"){ val inputVector = Vectors.dense(15.0, -5.0, 5.0, 19.0) val expectedVector = Vectors.dense(0.75, -0.5, 0.5, 0.95) assert(scaler(inputVector) ~= expectedVector relTol 1E-9) } it("Has the right input schema") { assert(scaler.inputSchema.fields == Seq(StructField("input", TensorType.Double(4)))) } it("Has the right output schema") { assert(scaler.outputSchema.fields == Seq(StructField("output", TensorType.Double(4)))) } } }
Example 125
Source File: BinarizerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class BinarizerModelSpec extends FunSpec { describe("binarizer with several inputs"){ val binarizer = BinarizerModel(0.3, TensorShape(3)) it("Makes a value 0 or 1 based on the threshold") { val features = Vectors.dense(Array(0.1, 0.4, 0.3)) val binFeatures = binarizer(features).toArray assert(binFeatures(0) == 0.0) assert(binFeatures(1) == 1.0) assert(binFeatures(2) == 0.0) } it("Has the right input schema") { assert(binarizer.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("Has the right output schema") { assert(binarizer.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } describe("binarizer with one input") { val binarizer = BinarizerModel(0.3, ScalarShape()) it("Has the right input schema") { assert(binarizer.inputSchema.fields == Seq(StructField("input", ScalarType.Double.nonNullable))) } it("Has the right output schema") { assert(binarizer.outputSchema.fields == Seq(StructField("output", ScalarType.Double.nonNullable))) } } }
Example 126
Source File: DCTModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class DCTModelSpec extends FunSpec { describe("dct model") { val model = DCTModel(false, 3) describe("issue167") { it("should not modify input features") { val expected = Array(123.4, 23.4, 56.7) val features = Vectors.dense(Array(123.4, 23.4, 56.7)) val dctFeatures = model(features) assert(features.toArray.sameElements(expected)) assert(!features.toArray.sameElements(dctFeatures.toArray)) assert(features.toArray != dctFeatures.toArray) } } describe("input/output schema") { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } } }
Example 127
Source File: WordToVectorModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{BasicType, ListType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalactic.TolerantNumerics import org.scalatest.FunSpec class WordToVectorModelSpec extends FunSpec { implicit val doubleEquality = TolerantNumerics.tolerantDoubleEquality(0.000001) describe("word to vector model") { val model = WordToVectorModel(Map("test" -> 1), Array(12)) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", ListType(BasicType.String)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(1)))) } } describe("WordToVectorKernel") { describe("for name") { it("returns the kernel for string") { assert(WordToVectorKernel.forName("default") == WordToVectorKernel.Default) assert(WordToVectorKernel.forName("sqrt") == WordToVectorKernel.Sqrt) } } } describe("Sqrt kernel") { it("produces results using the sqrt kernel (division by sqrt(dot(vec, vec)))") { val hello = Vectors.dense(-0.02743354, 0.13925314, -0.41874424, 0.05635237, -1.01364303, 0.13555442, -0.36437142, 0.10494551, 1.25634718, 0.74919909, -0.75405639, 0.34798685, -0.33082211, -1.83296537, 1.8524611 , 0.16053002, 0.05308712, -0.61047131, -2.04251647, -0.6457383 , -0.06899478, -1.06984603, 1.81890905, -1.57762015, -1.14214861, -0.37704349, -1.13758969, -1.11241293, -0.01736556, 0.55350637, 1.29117298, 0.6780861 , 0.72507775, 0.38882053, -1.13152575) val there = Vectors.dense(0.05639598, -0.0189869 , 0.01236993, 0.00477022, -0.10707449, 0.02502576, 0.0702049 , 0.07715208, 0.03785434, 0.06749821, 0.0028507 , 0.03143736, -0.07800865, -0.066576 , 0.05038944, 0.04129622, 0.05770208, -0.09861612, -0.02329824, -0.03803944, -0.01226865, -0.03243028, 0.05924392, -0.07248155, -0.03818463, 0.03131858, -0.03253553, 0.04506788, -0.02503723, -0.03580079, 0.05802456, -0.00171577, -0.07222789, 0.01021192, 0.01579604) val `{make}` = Vectors.dense(1.69664776, -0.9033435 , -1.13164949, 1.94182444, -0.53111398, 2.28728724, 1.39580894, 1.38314795, -1.03503716, 1.0247947 , -2.175174 , 1.62514234, -0.64084077, -0.20218629, -0.0694286 , 0.37854579, -2.70390058, -2.27423668, -2.79813218, -0.46218753, 0.77630186, -0.82613772, 1.18320072, -2.93088889, 0.6440177 , -0.02956525, -1.51469374, -2.94850779, -0.89843947, -0.16953184, -1.4054004 , -1.22051024, 0.41841957, 0.26196802, 3.39272285) val wordVectors = Array(hello, there, `{make}`).flatMap(_.toArray) val model = WordToVectorModel(Map("hello" -> 0, "there" -> 1, "{make}" -> 2), wordVectors, kernel = WordToVectorKernel.Sqrt) val resultHello = model(Seq("hello")) val expectedHello = Vectors.dense(-0.00489383, 0.02484115, -0.07469912, 0.01005261, -0.18082216, 0.02418134, -0.06499964, 0.01872106, 0.22411777, 0.13364843, -0.13451492, 0.06207682, -0.05901483, -0.32697977, 0.33045758, 0.02863669, 0.00947013, -0.108901 , -0.36436126, -0.11519223, -0.01230787, -0.19084813, 0.32447228, -0.28142914, -0.20374607, -0.06726019, -0.20293281, -0.19844157, -0.00309781, 0.09873912, 0.23033029, 0.1209627 , 0.12934546, 0.06936107, -0.20185107) val resultSentence = model(Seq("hello", "there", "{make}", "qqq")) val expectedSentence = Vectors.dense(0.13878191, -0.06297886, -0.1236953 , 0.16108668, -0.13284827, 0.19686932, 0.0885994 , 0.12588461, 0.02084325, 0.14810168, -0.23535359, 0.16121693, -0.08441966, -0.16903109, 0.14745265, 0.04667632, -0.20855054, -0.23993334, -0.39118211, -0.09216406, 0.05589835, -0.15509237, 0.24620885, 
-0.36842539, -0.04313309, -0.03018265, -0.21592611, -0.32297428, -0.07566708, 0.02800181, -0.00452011, -0.04376236, 0.08615666, 0.05316085, 0.18312679) for ((a, b) <- resultHello.toArray.zip(expectedHello.toArray)) { assert(a === b) } for ((a, b) <- resultSentence.toArray.zip(expectedSentence.toArray)) { assert(a === b) } } } }
Example 128
Source File: VectorAssemblerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.math.BigDecimal import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class VectorAssemblerModelSpec extends FunSpec { val assembler = VectorAssemblerModel(Seq( ScalarShape(), ScalarShape(), TensorShape(2), TensorShape(5))) describe("#apply") { it("assembles doubles and vectors into a new vector") { val expectedArray = Array(45.0, 76.8, 23.0, 45.6, 0.0, 22.3, 45.6, 0.0, 99.3) assert(assembler(Array(45.0, new BigDecimal(76.8), Vectors.dense(Array(23.0, 45.6)), Vectors.sparse(5, Array(1, 2, 4), Array(22.3, 45.6, 99.3)))).toArray.sameElements(expectedArray)) } } describe("input/output schema") { it("has the right input schema") { assert(assembler.inputSchema.fields == Seq( StructField("input0", ScalarType.Double), StructField("input1", ScalarType.Double), StructField("input2", TensorType.Double(2)), StructField("input3", TensorType.Double(5)))) } it("has the right output schema") { assert(assembler.outputSchema.fields == Seq(StructField("output", TensorType.Double(9)))) } } }
Example 129
Source File: InteractionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class InteractionModelSpec extends FunSpec { describe("with all numeric inputs") { val encoderSpec: Array[Array[Int]] = Array(Array(1), Array(1, 1)) val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2))) it("produces the expected interaction vector") { val features = Seq(2.toDouble, Vectors.dense(3, 4)) assert(model(features).toArray.toSeq == Seq(6, 8)) } it("has the right inputs") { assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double), StructField("input1", TensorType.Double(2)))) } it("has the right outputs") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(2)))) } } describe("with one nominal input") { val encoderSpec: Array[Array[Int]] = Array(Array(4), Array(1, 1)) val model = InteractionModel(encoderSpec, Seq(ScalarShape(), TensorShape(2))) it("produce the expected interaction vector") { val features = Seq(2.toDouble, Vectors.dense(3, 4)) assert(model(features).toArray.toSeq == Seq(0, 0, 0, 0, 3, 4, 0, 0)) } it("has the right inputs") { assert(model.inputSchema.fields == Seq(StructField("input0", ScalarType.Double), StructField("input1", TensorType.Double(2)))) } it("has the right outputs") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(8)))) } } }
Example 130
Source File: StandardScalerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class StandardScalerModelSpec extends FunSpec { describe("standard scaler with dense data") { describe("with mean") { val scaler = StandardScalerModel(None, Some(Vectors.dense(Array(50.0, 20.0, 30.0)))) it("scales based off of the mean") { val expectedVector = Array(5.0, 5.0, 3.0) assert(scaler(Vectors.dense(Array(55.0, 25.0, 33.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } describe("with stdev") { val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), None) it("scales based off the standard deviation") { val expectedVector = Array(1.6, .4375, 1.0) assert(scaler(Vectors.dense(Array(4.0, 3.5, 10.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } describe("with mean and stdev") { val scaler = StandardScalerModel(Some(Vectors.dense(Array(2.5, 8.0, 10.0))), Some(Vectors.dense(Array(50.0, 20.0, 30.0)))) it("scales based off the mean and standard deviation") { val expectedVector = Array(1.6, .4375, 1.0) assert(scaler(Vectors.dense(Array(54.0, 23.5, 40.0))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 3) } } describe("standard scaler with sparse data") { describe("with mean") { val scaler = StandardScalerModel(None, Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100)))) it("scales based off of the mean") { val expectedVector = Array(0.0, 5.0, 5.0, 0.0, 3.0) assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 50, 103))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } describe("with stdev") { val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(20, 45, 100))), None) it("scales based off the standard deviation") { val expectedVector = Array(0.0, 1.25, 2.2, 0.0, 1.02) assert(scaler(Vectors.sparse(5, Array(1, 2, 4), Array(25, 99, 102))).toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } describe("with mean and stdev") { val scaler = StandardScalerModel(Some(Vectors.sparse(5, Array(1, 2, 4), Array(2.5, 8.0, 10.0))), Some(Vectors.sparse(5, Array(1, 2, 4), Array(50.0, 20.0, 30.0)))) it("scales based off the mean and standard deviation") { val expectedVector = Array(0.0, 1.6, .4375, 0.0, 1.0) val actual = scaler(Vectors.sparse(5, Array(1, 2, 4), Array(54.0, 23.5, 40.0))) assert(actual.toArray.sameElements(expectedVector)) } it should behave like aModelWithSchema(scaler, 5) } } def aModelWithSchema(model: StandardScalerModel, tensorSize: Integer) = { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(tensorSize)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(tensorSize)))) } } }
Example 131
Source File: IDFModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IDFModelSpec extends FunSpec { describe("idf model") { val model = IDFModel(Vectors.dense(Array(1.0, 2.0))) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double()))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double()))) } } }
Example 132
Source File: NormalizerModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class NormalizerModelSpec extends FunSpec { describe("normalizer model") { val normalizer = NormalizerModel(20.0, 3) it("normalizes the feature vector using the p normalization value") { val features = Vectors.dense(Array(0.0, 20.0, 40.0)) val norm = normalizer(features).toArray assert(norm(0) < 0.0001 && norm(0) > -0.0001) assert(norm(1) < 0.5001 && norm(1) > 0.49999) assert(norm(2) < 1.0001 && norm(2) > 0.99999) } it("has the right input schema") { assert(normalizer.inputSchema.fields == Seq(StructField("input", TensorType.Double(3)))) } it("has the right output schema") { assert(normalizer.outputSchema.fields == Seq(StructField("output", TensorType.Double(3)))) } } }
Example 133
Source File: PolynomialExpansionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PolynomialExpansionModelSpec extends FunSpec { describe("polynomial expansion model") { val model = PolynomialExpansionModel(2, 2) it("performs polynomial expansion on an input vector") { val inputArray = Array(2.0,3.0) val expectedVector = Array(2.0, 4.0, 3.0, 6.0, 9.0) assert(model(Vectors.dense(inputArray)).toArray.sameElements(expectedVector)) } it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(5)))) } } }
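The expected vector corresponds to the degree-2 monomials of (x1, x2) in the order implied by the assertion: x1, x1^2, x2, x1*x2, x2^2. A quick check of that ordering for the input (2.0, 3.0), done with plain arithmetic rather than the model's implementation:

object Degree2ExpansionSketch extends App {
  // Degree-2 expansion in the order the spec expects: x1, x1^2, x2, x1*x2, x2^2.
  val (x1, x2) = (2.0, 3.0)
  val expanded = Array(x1, x1 * x1, x2, x1 * x2, x2 * x2)
  println(expanded.mkString(", "))  // 2.0, 4.0, 3.0, 6.0, 9.0
}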
Example 134
Source File: PolynomialFeaturesModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.sklearn import ml.combust.mleap.core.types.{StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class PolynomialFeaturesModelSpec extends FunSpec { val model = new PolynomialFeaturesModel("[x0,x1,x0^2,x0 x1,x1^2,x0^3,x0^2 x1,x0 x1^2,x1^3]") describe("sklearn polynomial features") { it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("input", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("output", TensorType.Double(9)))) } it("calculates the polynomial features based off given combinations") { val result = model(Vectors.dense(3, 4)) assert(result == Vectors.dense(3.0, 4.0, 9.0, 12.0, 16.0, 27.0, 36.0, 48.0, 64.0)) } } }
Example 135
Source File: KMeansModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class KMeansModelSpec extends FunSpec { val v1 = Vectors.dense(Array(1.0, 2.0, 55.0)) val v2 = Vectors.dense(Array(11.0, 200.0, 55.0)) val v3 = Vectors.dense(Array(100.0, 22.0, 55.0)) val km = KMeansModel(Array(v1, v2, v3), 3) describe("#apply") { it("finds the closest cluster") { assert(km(Vectors.dense(Array(2.0, 5.0, 34.0))) == 0) assert(km(Vectors.dense(Array(20.0, 230.0, 34.0))) == 1) assert(km(Vectors.dense(Array(111.0, 20.0, 56.0))) == 2) } } describe("input/output schema") { it("has the right input schema") { assert(km.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(km.outputSchema.fields == Seq(StructField("prediction", ScalarType.Int.nonNullable))) } } }
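The cluster assertions follow from nearest-center assignment; since every center shares the third coordinate 55.0, the choice is decided by the first two coordinates. A standalone sketch of that lookup using squared Euclidean distance (names are illustrative):

import org.apache.spark.ml.linalg.Vectors

object ClosestCenterSketch extends App {
  val centers = Array(
    Vectors.dense(1.0, 2.0, 55.0),
    Vectors.dense(11.0, 200.0, 55.0),
    Vectors.dense(100.0, 22.0, 55.0))

  def sqDist(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

  // The predicted cluster is the index of the nearest center.
  val point = Vectors.dense(2.0, 5.0, 34.0)
  val prediction = centers.map(c => sqDist(c.toArray, point.toArray)).zipWithIndex.minBy(_._1)._2
  println(prediction)  // 0
}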
Example 136
Source File: BisectingKMeansModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.linalg.mleap.VectorWithNorm import org.scalatest.FunSpec class BisectingKMeansModelSpec extends FunSpec { describe("bisecting kmeans model") { val model = new BisectingKMeansModel(ClusteringTreeNode(23, VectorWithNorm(Vectors.dense(1, 2, 3)) , Array())) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Int.nonNullable))) } } }
Example 137
Source File: NodeSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import org.scalatest.FunSpec import org.apache.spark.ml.linalg.Vectors class InternalNodeSpec extends FunSpec { describe("#typeName") { it("is InternalNode") { } } describe("#predictImpl") { val leftNode = LeafNode(0.45) val rightNode = LeafNode(0.33) val features = Vectors.dense(Array(0.3)) describe("when split goes left") { it("returns the left node") { val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.4)) assert(node.predictImpl(features) == leftNode) } } describe("when split goes right") { it("returns the right node") { val node = InternalNode(leftNode, rightNode, ContinuousSplit(0, 0.2)) assert(node.predictImpl(features) == rightNode) } } } } class LeafNodeSpec extends FunSpec { describe("#predictImpl") { it("returns itself") { val node = LeafNode(0.45) assert(node.predictImpl(Vectors.dense(Array(0.67))) == node) } } }
Example 138
Source File: MultiLayerPerceptronClassifierModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class MultiLayerPerceptronClassifierModelSpec extends FunSpec { describe("multi layer perceptron classifier model") { val model = new MultiLayerPerceptronClassifierModel(Seq(3, 1), Vectors.dense(Array(1.9, 2.2, 4, 1))) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(1)), StructField("probability", TensorType.Double(1)), StructField("prediction", ScalarType.Double.nonNullable) )) } } }
Example 139
Source File: GBTClassifierModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{BasicType, ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GBTClassifierModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(-0.1, 2, goLeft = true) val classifier = GBTClassifierModel(trees = Seq(tree1, tree2, tree3), treeWeights = Seq(0.5, 2.0, 1.0), numFeatures = 3) describe("#apply") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) it("predicts the class based on the features") { assert(classifier(features) == 1.0) } } describe("input/output schema") { it("has the right input schema") { assert(classifier.inputSchema.fields == Seq(StructField("features", TensorType(BasicType.Double, Seq(3))))) } it("has the right output schema") { assert(classifier.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable) )) } } }
Example 140
Source File: OneVsRestModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class OneVsRestModelSpec extends FunSpec { describe("one vs rest model") { val model = new OneVsRestModel(Array( BinaryLogisticRegressionModel(Vectors.dense(1.0, 2.0), 0.7, 0.4)), 2) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(2)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq( StructField("probability", ScalarType.Double), StructField("raw_prediction", TensorType.Double(1)), StructField("prediction", ScalarType.Double) )) } } }
Example 141
Source File: LogisticRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.scalatest.FunSpec class LogisticRegressionModelSpec extends FunSpec { describe("BinaryLogisticRegression") { val weights = Vectors.dense(1.0, 2.0, 4.0) val intercept = 0.7 describe("issue210: Logistic function not being applied") { val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("applies the logistic function for prediction") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == 1.0) } } describe("issue386:Wrong Binary LogisticRegression predictions") { val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("compare binary logisticRegression prediction with the transform api predictions") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features)))) assert(lr.predict(features) == 1.0) } it("compare binary logisticRegression prediction with rawToPrediction() results") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features))) assert(lr.predict(features) == 1.0) } } describe("issue386:Binary LogisticRegression predictions with 1.0 threshold"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 1.0) it("binary logisticRegression prediction equals zero for 1.0 threshold") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.probabilityToPrediction(lr.rawToProbability(lr.predictRaw(features)))) assert(lr.predict(features) == 0.0) } } describe("issue386:Binary LogisticRegression predictions with 0.0 threshold"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 0.0) it("binary logisticRegression prediction equals 1 for zero threshold") { val features = Vectors.dense(-1.0, 1.0, -0.5) assert(lr.predict(features) == lr.rawToPrediction(lr.predictRaw(features))) assert(lr.predict(features) == 1.0) } } describe("input/output schema"){ val lr = BinaryLogisticRegressionModel(weights, intercept, 0.4) it("has the right input schema") { assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(lr.outputSchema.fields == Seq( StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable) )) } } } describe("ProbabilisticLogisticsRegressionModel") { val weights = Matrices.dense(3, 3, Array(1, 2, 3, 1, 2, 3, 1, 2, 3)) val intercept = Vectors.dense(1, 2, 3) val lr = ProbabilisticLogisticsRegressionModel(weights, intercept, None) describe("input/output schema"){ it("has the right input schema") { assert(lr.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(lr.outputSchema.fields == Seq( StructField("raw_prediction", TensorType.Double(3)), StructField("probability", TensorType.Double(3)), StructField("prediction", ScalarType.Double.nonNullable) )) } } } }
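For the features (-1.0, 1.0, -0.5) used throughout, the margin is 1.0*(-1.0) + 2.0*1.0 + 4.0*(-0.5) + 0.7 = -0.3 and the logistic probability is about 0.426, which explains the three threshold cases (above 0.4 and 0.0, below 1.0). A minimal sketch of that computation; the simple probability > threshold comparison illustrates the behaviour the spec asserts and is not the library's code:

import org.apache.spark.ml.linalg.Vectors

object LogisticThresholdSketch extends App {
  val weights = Vectors.dense(1.0, 2.0, 4.0)
  val intercept = 0.7
  val features = Vectors.dense(-1.0, 1.0, -0.5)

  val margin = weights.toArray.zip(features.toArray).map { case (w, x) => w * x }.sum + intercept
  val probability = 1.0 / (1.0 + math.exp(-margin))  // logistic function, ~0.426 here
  println(probability)
  println(if (probability > 0.4) 1.0 else 0.0)  // 1.0, matching the threshold = 0.4 cases
}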
Example 142
Source File: SupportVectorMachineModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class SupportVectorMachineModelSpec extends FunSpec { describe("svm model") { val model = new SupportVectorMachineModel(Vectors.dense(1, 2, 3), 2) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("raw_prediction", TensorType.Double(2)), StructField("probability", TensorType.Double(2)), StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 143
Source File: IsotonicRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class IsotonicRegressionModelSpec extends FunSpec { val regression = IsotonicRegressionModel(boundaries = Array(0.0, 4.0, 5.0, 7.0, 8.0), predictions = Seq(100.0, 200.0, 300.0, 400.0, 500.0), isotonic = true, featureIndex = Some(2)) describe("#apply") { it("applies the isotonic regression to a feature vector") { assert(regression(4.0) == 200.0) assert(regression(4.5) == 250.0) assert(regression(Vectors.dense(Array(1.0, 2.3, 7.2))) == 420.0) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double()))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
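The expected values follow from piecewise-linear interpolation between the boundary/prediction pairs, with featureIndex = Some(2) selecting 7.2 from the input vector: 4.5 falls halfway between boundaries 4.0 and 5.0 (so 250.0), and 7.2 lies a fifth of the way from 7.0 to 8.0 (so 420.0). A simplified standalone sketch of that lookup; boundary handling in the real model is more involved and the names are illustrative:

object IsotonicInterpolationSketch extends App {
  val boundaries = Array(0.0, 4.0, 5.0, 7.0, 8.0)
  val predictions = Array(100.0, 200.0, 300.0, 400.0, 500.0)

  // Linear interpolation between the two surrounding boundaries.
  def predict(x: Double): Double = {
    val i = boundaries.lastIndexWhere(_ <= x)
    if (i == boundaries.length - 1) predictions.last
    else {
      val frac = (x - boundaries(i)) / (boundaries(i + 1) - boundaries(i))
      predictions(i) + frac * (predictions(i + 1) - predictions(i))
    }
  }

  println(predict(4.5))  // 250.0
  println(predict(7.2))  // ≈ 420.0 -- featureIndex = 2 selects 7.2 from the vector
}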
Example 144
Source File: AFTSurvivalRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class AFTSurvivalRegressionModelSpec extends FunSpec { describe("AFT survival regression model") { val model = new AFTSurvivalRegressionModel(Vectors.dense(1, 2, 3), 2, Array(4, 5, 6, 7), 3) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features",TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable), StructField("quantiles", TensorType.Double(4)))) } } }
Example 145
Source File: LinearRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.scalatest.FunSpec import org.apache.spark.ml.linalg.Vectors class LinearRegressionModelSpec extends FunSpec { val linearRegression = LinearRegressionModel(Vectors.dense(Array(0.5, 0.75, 0.25)), .33) describe("#apply") { it("applies the linear regression to a feature vector") { assert(linearRegression(Vectors.dense(Array(1.0, 0.5, 1.0))) == 1.455) } } describe("input/output schema") { it("has the right input schema") { assert(linearRegression.inputSchema.fields == Seq(StructField("features", TensorType.Double(3)))) } it("has the right output schema") { assert(linearRegression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
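The 1.455 expectation is simply the dot product of the coefficients with the features plus the intercept: 0.5*1.0 + 0.75*0.5 + 0.25*1.0 + 0.33. A minimal sketch of that arithmetic (object name illustrative):

import org.apache.spark.ml.linalg.Vectors

object LinearRegressionSketch extends App {
  val coefficients = Vectors.dense(0.5, 0.75, 0.25)
  val intercept = 0.33
  val features = Vectors.dense(1.0, 0.5, 1.0)
  // prediction = coefficients . features + intercept
  val prediction = coefficients.toArray.zip(features.toArray).map { case (c, x) => c * x }.sum + intercept
  println(prediction)  // ≈ 1.455
}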
Example 146
Source File: RandomForestRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class RandomForestRegressionModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true) val regression = RandomForestRegressionModel(Seq(tree1, tree2, tree3), 5) describe("#predict") { it("uses the forest to make a prediction") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) assert(tree1.predict(features) == 0.5) assert(tree2.predict(features) == 0.75) assert(tree3.predict(features) == 0.1) assert(regression.predict(features) == (0.5 + 0.75 + 0.1) / 3) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 147
Source File: DecisionTreeRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.tree.{ContinuousSplit, InternalNode, LeafNode} import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class DecisionTreeRegressionModelSpec extends FunSpec { val node = InternalNode(LeafNode(Seq(0.78)), LeafNode(Seq(0.34)), ContinuousSplit(0, 0.5)) val regression = DecisionTreeRegressionModel(node, 5) describe("#predict") { it("returns the prediction for the decision tree") { val features = Vectors.dense(Array(0.3, 1.0, 43.23, -21.2, 66.7)) assert(regression.predict(features) == 0.78) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
Example 148
Source File: GBTRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.test.TestUtil import ml.combust.mleap.core.types.{ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GBTRegressionModelSpec extends FunSpec { val tree1 = TestUtil.buildDecisionTreeRegression(0.5, 0, goLeft = true) val tree2 = TestUtil.buildDecisionTreeRegression(0.75, 1, goLeft = false) val tree3 = TestUtil.buildDecisionTreeRegression(0.1, 2, goLeft = true) val regression = GBTRegressionModel(Seq(tree1, tree2, tree3), Seq(0.5, 2.0, 1.0), 5) describe("#apply") { val features = Vectors.dense(Array(0.2, 0.8, 0.4)) it("predicts the value based on the features") { assert(tree1.predict(features) == 0.5) assert(tree2.predict(features) == 0.75) assert(tree3.predict(features) == 0.1) assert(regression.predict(features) == (0.5 * 0.5 + 0.75 * 2.0 + 0.1 * 1.0)) } } describe("input/output schema") { it("has the right input schema") { assert(regression.inputSchema.fields == Seq(StructField("features", TensorType.Double(5)))) } it("has the right output schema") { assert(regression.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable))) } } }
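Unlike the random forest spec above, which averages the per-tree predictions, the GBT regression expectation combines them with the tree weights: 0.5*0.5 + 0.75*2.0 + 0.1*1.0 = 1.85. A short check of that arithmetic (object name illustrative):

object GBTWeightedSumSketch extends App {
  val treePredictions = Seq(0.5, 0.75, 0.1)
  val treeWeights = Seq(0.5, 2.0, 1.0)
  // GBT regression prediction: weighted sum of the individual tree predictions.
  val prediction = treePredictions.zip(treeWeights).map { case (p, w) => p * w }.sum
  println(prediction)  // ≈ 1.85
}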
Example 149
Source File: GeneralizedLinearRegressionModelSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.types.{ScalarShape, ScalarType, StructField, TensorType} import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSpec class GeneralizedLinearRegressionModelSpec extends FunSpec { describe("generalized linear regression model") { val model = new GeneralizedLinearRegressionModel(Vectors.dense(1, 2, 3), 23, null) it("has the right input schema") { assert(model.inputSchema.fields == Seq(StructField("features",TensorType.Double(3)))) } it("has the right output schema") { assert(model.outputSchema.fields == Seq(StructField("prediction", ScalarType.Double.nonNullable), StructField("link_prediction", ScalarType.Double.nonNullable))) } } }
Example 150
Source File: LinearSVCOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.bundle.ops import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.{Bundle, Model, NodeShape, Value} import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.linalg.Vectors class LinearSVCOp extends SimpleSparkOp[LinearSVCModel] { override val Model: OpModel[SparkBundleContext, LinearSVCModel] = new OpModel[SparkBundleContext, LinearSVCModel] { override val klazz: Class[LinearSVCModel] = classOf[LinearSVCModel] override def opName: String = Bundle.BuiltinOps.classification.linear_svc override def store(model: Model, obj: LinearSVCModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) // Set the rest of the parameters m.withValue("coefficients", Value.vector(obj.coefficients.toArray)) .withValue("intercept", Value.double(obj.intercept)) .withValue("threshold", Value.double(obj.getThreshold)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LinearSVCModel = { new LinearSVCModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble ).setThreshold(model.value("threshold").getDouble) } } override def sparkInputs(obj: LinearSVCModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LinearSVCModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "prediction" -> obj.predictionCol) } override def sparkLoad(uid: String, shape: NodeShape, model: LinearSVCModel): LinearSVCModel = { new LinearSVCModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept).setThreshold(model.getThreshold) } }
Example 151
Source File: ElementwiseProductOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param class ElementwiseProductOp extends SimpleSparkOp[ElementwiseProduct] { override val Model: OpModel[SparkBundleContext, ElementwiseProduct] = new OpModel[SparkBundleContext, ElementwiseProduct] { override val klazz: Class[ElementwiseProduct] = classOf[ElementwiseProduct] override def opName: String = Bundle.BuiltinOps.feature.elementwise_product override def store(model: Model, obj: ElementwiseProduct) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("scaling_vec", Value.vector(obj.getScalingVec.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): ElementwiseProduct = { new ElementwiseProduct(uid = "").setScalingVec(Vectors.dense(model.value("scaling_vec").getTensor[Double].toArray)) } } override def sparkLoad(uid: String, shape: NodeShape, model: ElementwiseProduct): ElementwiseProduct = { new ElementwiseProduct(uid = uid).setScalingVec(model.getScalingVec) } override def sparkInputs(obj: ElementwiseProduct): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: ElementwiseProduct): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol ) } }
Example 152
Source File: BucketedRandomProjectionLSHOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import ml.combust.mleap.core.types.TensorShape import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.bundle._ import org.apache.spark.ml.feature.BucketedRandomProjectionLSHModel import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.mleap.TypeConverters.sparkToMleapDataShape class BucketedRandomProjectionLSHOp extends SimpleSparkOp[BucketedRandomProjectionLSHModel] { override val Model: OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] = new OpModel[SparkBundleContext, BucketedRandomProjectionLSHModel] { override val klazz: Class[BucketedRandomProjectionLSHModel] = classOf[BucketedRandomProjectionLSHModel] override def opName: String = Bundle.BuiltinOps.feature.bucketed_random_projection_lsh override def store(model: Model, obj: BucketedRandomProjectionLSHModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val dataset = context.context.dataset.get val inputShape = sparkToMleapDataShape(dataset.schema(obj.getInputCol), dataset).asInstanceOf[TensorShape] model.withValue("random_unit_vectors", Value.tensorList[Double](obj.randUnitVectors.map(_.toArray).map(Tensor.denseVector))). withValue("bucket_length", Value.double(obj.getBucketLength)) .withValue("input_size", Value.int(inputShape.dimensions.get(0))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): BucketedRandomProjectionLSHModel = { val ruv = model.value("random_unit_vectors").getTensorList[Double].map(_.toArray).map(Vectors.dense) val m = new BucketedRandomProjectionLSHModel(uid = "", randUnitVectors = ruv.toArray) m.set(m.bucketLength, model.value("bucket_length").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: BucketedRandomProjectionLSHModel): BucketedRandomProjectionLSHModel = { val m = new BucketedRandomProjectionLSHModel(uid = uid, randUnitVectors = model.randUnitVectors) m.set(m.bucketLength, model.getBucketLength) } override def sparkInputs(obj: BucketedRandomProjectionLSHModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: BucketedRandomProjectionLSHModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 153
Source File: MaxAbsScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.Vectors class MaxAbsScalerOp extends SimpleSparkOp[MaxAbsScalerModel]{ override val Model: OpModel[SparkBundleContext, MaxAbsScalerModel] = new OpModel[SparkBundleContext, MaxAbsScalerModel] { override val klazz: Class[MaxAbsScalerModel] = classOf[MaxAbsScalerModel] override def opName: String = Bundle.BuiltinOps.feature.max_abs_scaler override def store(model: Model, obj: MaxAbsScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("maxAbs", Value.vector(obj.maxAbs.toArray)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MaxAbsScalerModel = { new MaxAbsScalerModel(uid = "", maxAbs = Vectors.dense(model.value("maxAbs").getTensor[Double].toArray)) } } override def sparkLoad(uid: String, shape: NodeShape, model: MaxAbsScalerModel): MaxAbsScalerModel = { new MaxAbsScalerModel(uid = uid, maxAbs = model.maxAbs) } override def sparkInputs(obj: MaxAbsScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: MaxAbsScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 154
Source File: StandardScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.linalg.Vectors class StandardScalerOp extends SimpleSparkOp[StandardScalerModel] { override val Model: OpModel[SparkBundleContext, StandardScalerModel] = new OpModel[SparkBundleContext, StandardScalerModel] { override val klazz: Class[StandardScalerModel] = classOf[StandardScalerModel] override def opName: String = Bundle.BuiltinOps.feature.standard_scaler override def store(model: Model, obj: StandardScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val mean = if(obj.getWithMean) Some(obj.mean.toArray) else None val std = if(obj.getWithStd) Some(obj.std.toArray) else None model.withValue("mean", mean.map(Value.vector[Double])). withValue("std", std.map(Value.vector[Double])) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): StandardScalerModel = { val std = model.getValue("std").map(_.getTensor[Double].toArray).map(Vectors.dense) val mean = model.getValue("mean").map(_.getTensor[Double].toArray).map(Vectors.dense) val size = std.map(_.size).orElse(mean.map(_.size)).get val m = new StandardScalerModel(uid = "", std = std.getOrElse(Vectors.sparse(size, Array(), Array())), mean = mean.getOrElse(Vectors.sparse(size, Array(), Array()))) if (std.isEmpty) { m.set(m.withStd, false)} else {m.set(m.withStd, true)} if (mean.isEmpty) { m.set(m.withMean, false)} else {m.set(m.withMean, true)} m } } override def sparkLoad(uid: String, shape: NodeShape, model: StandardScalerModel): StandardScalerModel = { val m = new StandardScalerModel(uid = uid, std = model.std, mean = model.mean) if (model.isDefined(model.withMean)) { m.set(m.withMean, model.getWithMean) } if (model.isDefined(model.withStd)) { m.set(m.withStd, model.getWithStd) } m } override def sparkInputs(obj: StandardScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: StandardScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
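When the loader above finds no stored "mean" or "std", it substitutes an empty sparse vector of the matching size and flips withMean/withStd accordingly; an empty sparse vector is simply an all-zeros vector. A small sketch of that placeholder (object name illustrative):

import org.apache.spark.ml.linalg.Vectors

object EmptySparsePlaceholderSketch extends App {
  // A sparse vector with no active indices materializes as all zeros of the given size.
  val placeholder = Vectors.sparse(3, Array.empty[Int], Array.empty[Double])
  println(placeholder.toArray.mkString(", "))  // 0.0, 0.0, 0.0
}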
Example 155
Source File: MinMaxScalerOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.feature.MinMaxScalerModel import org.apache.spark.ml.linalg.Vectors class MinMaxScalerOp extends SimpleSparkOp[MinMaxScalerModel] { override val Model: OpModel[SparkBundleContext, MinMaxScalerModel] = new OpModel[SparkBundleContext, MinMaxScalerModel] { override val klazz: Class[MinMaxScalerModel] = classOf[MinMaxScalerModel] override def opName: String = Bundle.BuiltinOps.feature.min_max_scaler override def store(model: Model, obj: MinMaxScalerModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("min", Value.vector(obj.originalMin.toArray)). withValue("max", Value.vector(obj.originalMax.toArray)) .withValue("minValue", Value.double(obj.getMin)) .withValue("maxValue", Value.double(obj.getMax)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MinMaxScalerModel = { new MinMaxScalerModel(uid = "", originalMin = Vectors.dense(model.value("min").getTensor[Double].toArray), originalMax = Vectors.dense(model.value("max").getTensor[Double].toArray)) .setMin(model.getValue("minValue").map(_.getDouble).getOrElse(0.0)) .setMax(model.getValue("maxValue").map(_.getDouble).getOrElse(1.0)) } } override def sparkLoad(uid: String, shape: NodeShape, model: MinMaxScalerModel): MinMaxScalerModel = { val m = new MinMaxScalerModel(uid = uid, originalMin = model.originalMin, originalMax = model.originalMax) if (model.isDefined(model.max)) { m.setMax(model.getMax)} if (model.isDefined(model.min)) { m.setMin(model.getMin)} m } override def sparkInputs(obj: MinMaxScalerModel): Seq[ParamSpec] = { Seq("input" -> obj.inputCol) } override def sparkOutputs(obj: MinMaxScalerModel): Seq[SimpleParamSpec] = { Seq("output" -> obj.outputCol) } }
Example 156
Source File: GaussianMixtureOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.clustering.GaussianMixtureModel import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian class GaussianMixtureOp extends SimpleSparkOp[GaussianMixtureModel] { override val Model: OpModel[SparkBundleContext, GaussianMixtureModel] = new OpModel[SparkBundleContext, GaussianMixtureModel] { override val klazz: Class[GaussianMixtureModel] = classOf[GaussianMixtureModel] override def opName: String = Bundle.BuiltinOps.clustering.gaussian_mixture override def store(model: Model, obj: GaussianMixtureModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val (rows, cols) = obj.gaussians.headOption. map(g => (g.cov.numRows, g.cov.numCols)). getOrElse((-1, -1)) val (means, covs) = obj.gaussians.map(g => (g.mean, g.cov)).unzip model.withValue("means", Value.tensorList(means.map(_.toArray).map(Tensor.denseVector))). withValue("covs", Value.tensorList(covs.map(m => DenseTensor(m.toArray, Seq(m.numRows, m.numCols))))). withValue("weights", Value.doubleList(obj.weights.toSeq)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): GaussianMixtureModel = { val means = model.value("means").getTensorList[Double].map(values => Vectors.dense(values.toArray)) val covs = model.value("covs").getTensorList[Double].map(values => Matrices.dense(values.dimensions.head, values.dimensions(1), values.toArray)) val gaussians = means.zip(covs).map { case (mean, cov) => new MultivariateGaussian(mean, cov) }.toArray val weights = model.value("weights").getDoubleList.toArray new GaussianMixtureModel(uid = "", gaussians = gaussians, weights = weights) } } override def sparkLoad(uid: String, shape: NodeShape, model: GaussianMixtureModel): GaussianMixtureModel = { new GaussianMixtureModel(uid = uid, weights = model.weights, gaussians = model.gaussians) } override def sparkInputs(obj: GaussianMixtureModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: GaussianMixtureModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "probability" -> obj.probabilityCol) } }
Example 157
Source File: MultiLayerPerceptronClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel import org.apache.spark.ml.linalg.Vectors class MultiLayerPerceptronClassifierOp extends SimpleSparkOp[MultilayerPerceptronClassificationModel] { override val Model: OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] = new OpModel[SparkBundleContext, MultilayerPerceptronClassificationModel] { override def opName: String = Bundle.BuiltinOps.classification.multi_layer_perceptron_classifier override val klazz: Class[MultilayerPerceptronClassificationModel] = classOf[MultilayerPerceptronClassificationModel] override def store(model: Model, obj: MultilayerPerceptronClassificationModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("layers", Value.longList(obj.layers.map(_.toLong))). withValue("weights", Value.vector(obj.weights.toArray)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): MultilayerPerceptronClassificationModel = { val m = new MultilayerPerceptronClassificationModel(uid = "", layers = model.value("layers").getLongList.map(_.toInt).toArray, weights = Vectors.dense(model.value("weights").getTensor[Double].toArray)) model.getValue("thresholds"). map(t => m.setThresholds(t.getDoubleList.toArray)). getOrElse(m) } } override def sparkLoad(uid: String, shape: NodeShape, model: MultilayerPerceptronClassificationModel): MultilayerPerceptronClassificationModel = { val m = new MultilayerPerceptronClassificationModel(uid = uid,layers = model.layers, weights = model.weights) if (model.isSet(model.thresholds)) m.setThresholds(model.getThresholds) m } override def sparkInputs(obj: MultilayerPerceptronClassificationModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: MultilayerPerceptronClassificationModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 158
Source File: LogisticRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.op.OpModel import ml.combust.bundle.dsl._ import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrices, Vectors} class LogisticRegressionOp extends SimpleSparkOp[LogisticRegressionModel] { private final val LOGISTIC_REGRESSION_DEFAULT_THRESHOLD = 0.5 override val Model: OpModel[SparkBundleContext, LogisticRegressionModel] = new OpModel[SparkBundleContext, LogisticRegressionModel] { override val klazz: Class[LogisticRegressionModel] = classOf[LogisticRegressionModel] override def opName: String = Bundle.BuiltinOps.classification.logistic_regression override def store(model: Model, obj: LogisticRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val m = model.withValue("num_classes", Value.long(obj.numClasses)) if(obj.numClasses > 2) { val cm = obj.coefficientMatrix val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None m.withValue("coefficient_matrix", Value.tensor[Double](DenseTensor(cm.toArray, Seq(cm.numRows, cm.numCols)))). withValue("intercept_vector", Value.vector(obj.interceptVector.toArray)). withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)) } else { m.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("threshold", Value.double(obj.getThreshold)) } } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LogisticRegressionModel = { val numClasses = model.value("num_classes").getLong val r = if(numClasses > 2) { val cmTensor = model.value("coefficient_matrix").getTensor[Double] val coefficientMatrix = Matrices.dense(cmTensor.dimensions.head, cmTensor.dimensions(1), cmTensor.toArray) val lr = new LogisticRegressionModel(uid = "", coefficientMatrix = coefficientMatrix, interceptVector = Vectors.dense(model.value("intercept_vector").getTensor[Double].toArray), numClasses = numClasses.toInt, isMultinomial = true) model.getValue("thresholds"). map(t => lr.setThresholds(t.getDoubleList.toArray)). 
getOrElse(lr) } else { val lr = new LogisticRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) // default threshold is 0.5 for both Spark and Scikit-learn val threshold = model.getValue("threshold") .map(value => value.getDouble) .getOrElse(LOGISTIC_REGRESSION_DEFAULT_THRESHOLD) lr.setThreshold(threshold) } r } } override def sparkLoad(uid: String, shape: NodeShape, model: LogisticRegressionModel): LogisticRegressionModel = { val numClasses = model.numClasses val r = if (numClasses > 2) { val lr = new LogisticRegressionModel(uid = uid, coefficientMatrix = model.coefficientMatrix, interceptVector = model.interceptVector, numClasses = numClasses, isMultinomial = true) if(model.isDefined(model.thresholds)) { lr.setThresholds(model.getThresholds) } lr } else { val lr = new LogisticRegressionModel(uid = uid, coefficientMatrix = model.coefficientMatrix, interceptVector = model.interceptVector, numClasses = numClasses, isMultinomial = false) if(model.isDefined(model.threshold)) { lr.setThreshold(model.getThreshold) } lr } r } override def sparkInputs(obj: LogisticRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LogisticRegressionModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 159
Source File: NaiveBayesClassifierOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.classification import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.DenseTensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.classification.NaiveBayesModel import org.apache.spark.ml.linalg.{Matrices, Vectors} class NaiveBayesClassifierOp extends SimpleSparkOp[NaiveBayesModel] { override val Model: OpModel[SparkBundleContext, NaiveBayesModel] = new OpModel[SparkBundleContext, NaiveBayesModel] { override val klazz: Class[NaiveBayesModel] = classOf[NaiveBayesModel] override def opName: String = Bundle.BuiltinOps.classification.naive_bayes override def store(model: Model, obj: NaiveBayesModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None model.withValue("num_features", Value.long(obj.numFeatures)). withValue("num_classes", Value.long(obj.numClasses)). withValue("pi", Value.vector(obj.pi.toArray)). withValue("theta", Value.tensor(DenseTensor(obj.theta.toArray, Seq(obj.theta.numRows, obj.theta.numCols)))). withValue("model_type", Value.string(obj.getModelType)). withValue("thresholds", thresholds.map(Value.doubleList(_))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): NaiveBayesModel = { val theta = model.value("theta").getTensor[Double] val nb = new NaiveBayesModel(uid = "", pi = Vectors.dense(model.value("pi").getTensor[Double].toArray), theta = Matrices.dense(theta.dimensions.head, theta.dimensions(1), theta.toArray)) val modelType = model.value("model_type").getString model.getValue("thresholds").map(t => nb.setThresholds(t.getDoubleList.toArray)) nb.set(nb.modelType, modelType) } } override def sparkLoad(uid: String, shape: NodeShape, model: NaiveBayesModel): NaiveBayesModel = { val r = new NaiveBayesModel(uid = uid, pi = model.pi, theta = model.theta) if (model.isDefined(model.thresholds)) { r.setThresholds(model.getThresholds) } if (model.isDefined(model.modelType)) { r.set(r.modelType, model.getModelType)} r } override def sparkInputs(obj: NaiveBayesModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: NaiveBayesModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "probability" -> obj.probabilityCol, "prediction" -> obj.predictionCol) } }
Example 160
Source File: GeneralizedLinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.OpModel import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GeneralizedLinearRegressionModel class GeneralizedLinearRegressionOp extends SimpleSparkOp[GeneralizedLinearRegressionModel] { override val Model: OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] = new OpModel[SparkBundleContext, GeneralizedLinearRegressionModel] { override val klazz: Class[GeneralizedLinearRegressionModel] = classOf[GeneralizedLinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.generalized_linear_regression override def store(model: Model, obj: GeneralizedLinearRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { val modelWithoutLink = model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("family", Value.string(obj.getFamily)) if (obj.isDefined(obj.link)) { modelWithoutLink.withValue("link", Value.string(obj.getLink)) } else { modelWithoutLink } } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): GeneralizedLinearRegressionModel = { val m = new GeneralizedLinearRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) m.set(m.family, model.value("family").getString) for (link <- model.getValue("link")) { m.set(m.link, link.getString) } m } } override def sparkLoad(uid: String, shape: NodeShape, model: GeneralizedLinearRegressionModel): GeneralizedLinearRegressionModel = { val m = new GeneralizedLinearRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept) m.set(m.family, model.getFamily) if (model.isSet(model.link)) m.set(m.link, model.getLink) m } override def sparkInputs(obj: GeneralizedLinearRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: GeneralizedLinearRegressionModel): Seq[SimpleParamSpec] = { Seq("link_prediction" -> obj.linkPredictionCol, "prediction" -> obj.predictionCol) } }
Example 161
Source File: AFTSurvivalRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.AFTSurvivalRegressionModel class AFTSurvivalRegressionOp extends SimpleSparkOp[AFTSurvivalRegressionModel] { override val Model: OpModel[SparkBundleContext, AFTSurvivalRegressionModel] = new OpModel[SparkBundleContext, AFTSurvivalRegressionModel] { override val klazz: Class[AFTSurvivalRegressionModel] = classOf[AFTSurvivalRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.aft_survival_regression override def store(model: Model, obj: AFTSurvivalRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)). withValue("quantile_probabilities", Value.doubleList(obj.getQuantileProbabilities)). withValue("scale", Value.double(obj.scale)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble, scale = model.value("scale").getDouble). setQuantileProbabilities(model.value("quantile_probabilities").getDoubleList.toArray) } } override def sparkLoad(uid: String, shape: NodeShape, model: AFTSurvivalRegressionModel): AFTSurvivalRegressionModel = { new AFTSurvivalRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept, scale = model.scale).setQuantileProbabilities(model.getQuantileProbabilities) } override def sparkInputs(obj: AFTSurvivalRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: AFTSurvivalRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "quantiles" -> obj.quantilesCol) } }
Example 162
Source File: LinearRegressionOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.regression import ml.combust.bundle.BundleContext import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.bundle.dsl._ import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.param.Param import org.apache.spark.ml.regression.LinearRegressionModel class LinearRegressionOp extends SimpleSparkOp[LinearRegressionModel] { override val Model: OpModel[SparkBundleContext, LinearRegressionModel] = new OpModel[SparkBundleContext, LinearRegressionModel] { override val klazz: Class[LinearRegressionModel] = classOf[LinearRegressionModel] override def opName: String = Bundle.BuiltinOps.regression.linear_regression override def store(model: Model, obj: LinearRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("coefficients", Value.vector(obj.coefficients.toArray)). withValue("intercept", Value.double(obj.intercept)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): LinearRegressionModel = { new LinearRegressionModel(uid = "", coefficients = Vectors.dense(model.value("coefficients").getTensor[Double].toArray), intercept = model.value("intercept").getDouble) } } override def sparkLoad(uid: String, shape: NodeShape, model: LinearRegressionModel): LinearRegressionModel = { new LinearRegressionModel(uid = uid, coefficients = model.coefficients, intercept = model.intercept) } override def sparkInputs(obj: LinearRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: LinearRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 163
Source File: LinearSVCParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.parity import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame class LinearSVCParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline() .setStages(Array( new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LinearSVCModel("linear_svc", Vectors.dense(0.44, 0.77), 0.66).setThreshold(0.5).setFeaturesCol("features"))) .fit(dataset) // The string order type is ignored, because once the transformer is built based on some order type, we need to serialize only the string to index map // but not the order in which it has to index. This value we can ignore while we check the transformer values. override val unserializedParams: Set[String] = Set("stringOrderType") }
Example 164
Source File: LogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.{StringIndexer, VectorAssembler} import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.DataFrame import org.apache.spark.ml.linalg.Vectors class LogisticRegressionParitySpec extends SparkParityBase { override val dataset: DataFrame = baseDataset.select("fico_score_group_fnl", "dti") override val sparkTransformer: Transformer = new Pipeline().setStages(Array(new StringIndexer(). setInputCol("fico_score_group_fnl"). setOutputCol("fico_index"), new VectorAssembler(). setInputCols(Array("fico_index", "dti")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficients = Vectors.dense(0.44, 0.77), intercept = 0.66).setThreshold(0.7).setFeaturesCol("features"))).fit(dataset) override val unserializedParams = Set("stringOrderType") }
Example 165
Source File: MultinomialLogisticRegressionParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.classification import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{DoubleType, IntegerType, StructType} class MultinomialLogisticRegressionParitySpec extends SparkParityBase { val labels = Seq(0.0, 1.0, 2.0, 0.0, 1.0, 2.0) val ages = Seq(15, 30, 40, 50, 15, 80) val heights = Seq(175, 190, 155, 160, 170, 180) val weights = Seq(67, 100, 57, 56, 56, 88) val rows = spark.sparkContext.parallelize(Seq.tabulate(6) { i => Row(labels(i), ages(i), heights(i), weights(i)) }) val schema = new StructType().add("label", DoubleType, nullable = false) .add("age", IntegerType, nullable = false) .add("height", IntegerType, nullable = false) .add("weight", IntegerType, nullable = false) override val dataset: DataFrame = spark.sqlContext.createDataFrame(rows, schema) override val sparkTransformer: Transformer = new Pipeline().setStages(Array( new VectorAssembler(). setInputCols(Array("age", "height", "weight")). setOutputCol("features"), new LogisticRegressionModel(uid = "logr", coefficientMatrix = Matrices.dense(3, 3, Array(-1.3920551604166562, -0.13119545493644366, 1.5232506153530998, 0.3129112131192873, -0.21959056436528473, -0.09332064875400257, -0.24696506013528507, 0.6122879917796569, -0.36532293164437174)), interceptVector = Vectors.dense(0.4965574044951358, -2.1486146169780063, 1.6520572124828703), numClasses = 3, isMultinomial = true))).fit(dataset) }
Example 166
Source File: ExecuteTransformSpec.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.executor import ml.combust.mleap.core.feature.VectorAssemblerModel import ml.combust.mleap.core.regression.LinearRegressionModel import ml.combust.mleap.runtime.frame.{DefaultLeapFrame, Row} import ml.combust.mleap.runtime.transformer.{Pipeline, PipelineModel} import ml.combust.mleap.runtime.transformer.feature.VectorAssembler import ml.combust.mleap.runtime.transformer.regression.LinearRegression import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.linalg.Vectors import org.scalatest.{FunSpec, Matchers} import ml.combust.mleap.core.types._ import org.scalatest.concurrent.ScalaFutures import scala.concurrent.ExecutionContext.Implicits.global import scala.concurrent.Future import scala.util.{Success, Try} class ExecuteTransformSpec extends FunSpec with ScalaFutures with Matchers { describe("execute transform") { val pipeline = Pipeline("pipeline", NodeShape(), PipelineModel(Seq( VectorAssembler(shape = NodeShape().withInput("input0", "first_double"). withInput("input1", "second_double"). withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))), LinearRegression(shape = NodeShape.regression(), model = LinearRegressionModel(Vectors.dense(2.0, 2.0), 5.0))))) val input = DefaultLeapFrame(StructType(Seq(StructField("first_double", ScalarType.Double), StructField("second_double" -> ScalarType.Double))).get, Seq(Row(20.0, 10.0))) it("transforms successfully a leap frame in strict mode") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction")), SelectMode.Strict)). flatMap(Future.fromTry) whenReady(result) { frame => { val data = frame.collect().head assert(frame.schema.fields.length == 2) assert(frame.schema.indexOf("features").get == 0) assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0))) assert(data.getDouble(1) == 65.0) } } } it("transforms successfully a leap frame with default options") { val result = ExecuteTransform(pipeline, input, TransformOptions.default).flatMap(Future.fromTry) whenReady(result) { frame => assert(frame.schema.hasField("prediction")) } } it("throws exception when transforming and selecting a missing field in strict mode") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Strict)). flatMap(Future.fromTry) whenReady(result.failed) { ex => ex shouldBe a [IllegalArgumentException] } } it("transforms successfully a leap frame in relaxed mode, ignoring unknown fields") { val result = ExecuteTransform(pipeline, input, TransformOptions(Some(Seq("features", "prediction", "does-not-exist")), SelectMode.Relaxed)). flatMap(Future.fromTry) whenReady(result) { frame => { val data = frame.collect().head assert(frame.schema.fields.length == 2) assert(frame.schema.indexOf("features").get == 0) assert(data.getTensor(0) == Tensor.denseVector(Array(20.0, 10.0))) assert(data.getDouble(1) == 65.0) } } } it("throws exception when transforming throws exception") { val invalidPipeline = Pipeline("pipeline", NodeShape(), PipelineModel(Seq( VectorAssembler(shape = NodeShape().withInput("input0", "first_double"). withInput("input1", "second_double"). 
withStandardOutput("features"), model = VectorAssemblerModel(Seq(ScalarShape(), ScalarShape()))), LinearRegression(shape = NodeShape.regression(), // missing coefficient for LR model = LinearRegressionModel(Vectors.dense(2.0), 5.0))))) val result = ExecuteTransform(invalidPipeline, input, TransformOptions.default).flatMap(Future.fromTry) whenReady(result.failed) { ex => ex shouldBe a [IllegalArgumentException] } } } }
Example 167
Source File: SpillTreeSpec.scala From spark-knn with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.knn import org.apache.spark.ml.knn.KNN.RowWithVector import org.apache.spark.ml.linalg.Vectors import org.scalatest.funspec.AnyFunSpec import org.scalatest.matchers.should.Matchers class SpillTreeSpec extends AnyFunSpec with Matchers { describe("SpillTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = SpillTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("with equidistant points on a circle") { val n = 12 val points = (1 to n).map { i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null) } val leafSize = n / 4 describe("built with tau = 0.0") { val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.0) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } it("can return more than min leaf size") { val k = leafSize + 5 points.foreach(v => tree.query(v.vector, k).size shouldBe k) } } describe("built with tau = 0.5") { val tree = SpillTree.build(points, leafSize = leafSize, tau = 0.5) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } it("works for every point to identify itself") { points.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v) } it("has consistent size and iterator") { def check(tree: Tree): Unit = { tree match { case t: SpillTree => t.iterator.size shouldBe t.size check(t.leftChild) check(t.rightChild) case _ => } } check(tree) } } } } describe("HybridTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = HybridTree.build(IndexedSeq.empty[RowWithVector], tau = 0.0) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("with equidistant points on a circle") { val n = 12 val points = (1 to n).map { i => new RowWithVector(Vectors.dense(math.sin(2 * math.Pi * i / n), math.cos(2 * math.Pi * i / n)), null) } val leafSize = n / 4 val tree = HybridTree.build(points, leafSize = leafSize, tau = 0.5) it("should have correct size") { tree.size shouldBe points.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs points } } } }
Example 168
Source File: MetricTreeSpec.scala From spark-knn with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.knn import org.apache.spark.ml.knn.KNN.{RowWithVector, VectorWithNorm} import org.apache.spark.ml.linalg.Vectors import org.scalatest.funspec.AnyFunSpec import org.scalatest.matchers.should.Matchers class MetricTreeSpec extends AnyFunSpec with Matchers { describe("MetricTree") { val origin = Vectors.dense(0, 0) describe("can be constructed with empty data") { val tree = MetricTree.build(IndexedSeq.empty[RowWithVector]) it("iterator should be empty") { tree.iterator shouldBe empty } it("should return empty when queried") { tree.query(origin) shouldBe empty } it("should have zero leaf") { tree.leafCount shouldBe 0 } } describe("without duplicates") { val data = (-5 to 5).flatMap(i => (-5 to 5).map(j => new RowWithVector(Vectors.dense(i, j), null))) List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize => describe(s"with leafSize of $leafSize") { val tree = MetricTree.build(data, leafSize) it("should have correct size") { tree.size shouldBe data.size } it("should return an iterator that goes through all data points") { tree.iterator.toIterable should contain theSameElementsAs data } it("should return vector itself for those in input set") { data.foreach(v => tree.query(v.vector, 1).head._1 shouldBe v) } it("should return nearest neighbors correctly") { tree.query(origin, 5).map(_._1.vector.vector) should contain theSameElementsAs Set( Vectors.dense(-1, 0), Vectors.dense(1, 0), Vectors.dense(0, -1), Vectors.dense(0, 1), Vectors.dense(0, 0) ) tree.query(origin, 9).map(_._1.vector.vector) should contain theSameElementsAs (-1 to 1).flatMap(i => (-1 to 1).map(j => Vectors.dense(i, j))) } it("should have correct number of leaves") { tree.leafCount shouldBe (tree.size / leafSize.toDouble).ceil } it("all points should fall with radius of pivot") { def check(tree: Tree): Unit = { tree.iterator.foreach(_.vector.fastDistance(tree.pivot) <= tree.radius) tree match { case t: MetricTree => check(t.leftChild) check(t.rightChild) case _ => } } check(tree) } } } } describe("with duplicates") { val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))).map(new RowWithVector(_, null)) val tree = MetricTree.build(data) it("should have 2 leaves") { tree.leafCount shouldBe 2 } it("should return all available duplicated candidates") { val res = tree.query(origin, 5).map(_._1.vector.vector) res.size shouldBe 5 res.toSet should contain theSameElementsAs Array(Vectors.dense(0.0, 1.0)) } } describe("for other corner cases") { it("queryCost should work on Empty") { Empty.distance(new KNNCandidates(new VectorWithNorm(origin), 1)) shouldBe 0 Empty.distance(new VectorWithNorm(origin)) shouldBe 0 } } } }
Example 169
Source File: MLPipelineTrackerIT.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.ml import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.Matchers import com.hortonworks.spark.atlas._ import com.hortonworks.spark.atlas.types._ import com.hortonworks.spark.atlas.TestUtils._ class MLPipelineTrackerIT extends BaseResourceIT with Matchers with WithHiveSupport { private val atlasClient = new RestAtlasClient(atlasClientConf) def clusterName: String = atlasClientConf.get(AtlasClientConf.CLUSTER_NAME) def getTableEntity(tableName: String): SACAtlasEntityWithDependencies = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) internal.sparkTableToEntity(tableDefinition, clusterName, Some(dbDefinition)) } // Enable it to run integrated test. it("pipeline and pipeline model") { val uri = "hdfs://" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, modelDirEntity)) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) atlasClient.createEntitiesWithDependencies(Seq(pipelineDirEntity, pipelineEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) atlasClient.createEntitiesWithDependencies(Seq(modelDirEntity, modelEntity)) val tableEntities1 = getTableEntity("chris1") val tableEntities2 = getTableEntity("chris2") atlasClient.createEntitiesWithDependencies(tableEntities1) atlasClient.createEntitiesWithDependencies(tableEntities2) } }
Example 170
Source File: MLAtlasEntityUtilsSuite.scala From spark-atlas-connector with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.atlas.types import java.io.File import org.apache.atlas.{AtlasClient, AtlasConstants} import org.apache.atlas.model.instance.AtlasEntity import org.apache.commons.io.FileUtils import org.apache.spark.ml.Pipeline import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.types.{IntegerType, StringType, StructType} import org.scalatest.{FunSuite, Matchers} import com.hortonworks.spark.atlas.TestUtils._ import com.hortonworks.spark.atlas.{AtlasUtils, WithHiveSupport} class MLAtlasEntityUtilsSuite extends FunSuite with Matchers with WithHiveSupport { def getTableEntity(tableName: String): AtlasEntity = { val dbDefinition = createDB("db1", "hdfs:///test/db/db1") val sd = createStorageFormat() val schema = new StructType() .add("user", StringType, false) .add("age", IntegerType, true) val tableDefinition = createTable("db1", s"$tableName", schema, sd) val tableEntities = internal.sparkTableToEntity( tableDefinition, AtlasConstants.DEFAULT_CLUSTER_NAME, Some(dbDefinition)) val tableEntity = tableEntities.entity tableEntity } test("pipeline, pipeline model, fit and transform") { val uri = "/" val pipelineDir = "tmp/pipeline" val modelDir = "tmp/model" val pipelineDirEntity = internal.mlDirectoryToEntity(uri, pipelineDir) pipelineDirEntity.entity.getAttribute("uri") should be (uri) pipelineDirEntity.entity.getAttribute("directory") should be (pipelineDir) pipelineDirEntity.dependencies.length should be (0) val modelDirEntity = internal.mlDirectoryToEntity(uri, modelDir) modelDirEntity.entity.getAttribute("uri") should be (uri) modelDirEntity.entity.getAttribute("directory") should be (modelDir) modelDirEntity.dependencies.length should be (0) val df = sparkSession.createDataFrame(Seq( (1, Vectors.dense(0.0, 1.0, 4.0), 1.0), (2, Vectors.dense(1.0, 0.0, 4.0), 2.0), (3, Vectors.dense(1.0, 0.0, 5.0), 3.0), (4, Vectors.dense(0.0, 0.0, 5.0), 4.0) )).toDF("id", "features", "label") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("features_scaled") .setMin(0.0) .setMax(3.0) val pipeline = new Pipeline().setStages(Array(scaler)) val model = pipeline.fit(df) pipeline.write.overwrite().save(pipelineDir) val pipelineEntity = internal.mlPipelineToEntity(pipeline.uid, pipelineDirEntity) pipelineEntity.entity.getTypeName should be (metadata.ML_PIPELINE_TYPE_STRING) pipelineEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be ( pipeline.uid) pipelineEntity.entity.getAttribute("name") should be (pipeline.uid) pipelineEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(pipelineDirEntity.entity, useGuid = false)) pipelineEntity.dependencies should be (Seq(pipelineDirEntity)) val modelEntity = internal.mlModelToEntity(model.uid, modelDirEntity) val modelUid = model.uid.replaceAll("pipeline", "model") modelEntity.entity.getTypeName should be (metadata.ML_MODEL_TYPE_STRING) modelEntity.entity.getAttribute(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME) should be (modelUid) modelEntity.entity.getAttribute("name") should be (modelUid) modelEntity.entity.getRelationshipAttribute("directory") should be ( AtlasUtils.entityToReference(modelDirEntity.entity, useGuid = false)) modelEntity.dependencies should be (Seq(modelDirEntity)) FileUtils.deleteDirectory(new File("tmp")) } }
Example 171
Source File: RBFKernel.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.numerics.{exp, inf} import org.apache.spark.ml.linalg.{Vector, Vectors} class RBFKernel(private var sigma: Double, private val lower: Double = 1e-6, private val upper: Double = inf) extends TrainDatasetBearingKernel with NoiselessKernel with SameOnDiagonalKernel { def this() = this(1) override def setHyperparameters(value: BDV[Double]): RBFKernel.this.type = { sigma = value(0) this } override def getHyperparameters: BDV[Double] = BDV[Double](sigma) override def numberOfHyperparameters: Int = 1 private def getSigma() = sigma private var squaredDistances: Option[BDM[Double]] = None override def hyperparameterBoundaries: (BDV[Double], BDV[Double]) = { (BDV[Double](lower), BDV[Double](upper)) } override def setTrainingVectors(vectors: Array[Vector]): this.type = { super.setTrainingVectors(vectors) val sqd = BDM.zeros[Double](vectors.length, vectors.length) for (i <- vectors.indices; j <- 0 to i) { val dist = Vectors.sqdist(vectors(i), vectors(j)) sqd(i, j) = dist sqd(j, i) = dist } squaredDistances = Some(sqd) this } override def trainingKernel(): BDM[Double] = { val result = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def trainingKernelAndDerivative(): (BDM[Double], Array[BDM[Double]]) = { val sqd = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) val kernel = trainingKernel() val derivative = sqd *:* kernel derivative /= cube(getSigma()) (kernel, Array(derivative)) } override def crossKernel(test: Array[Vector]): BDM[Double] = { val train = getTrainingVectors val result = BDM.zeros[Double](test.length, train.length) for (i <- test.indices; j <- train.indices) result(i, j) = Vectors.sqdist(test(i), train(j)) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def selfKernel(test: Vector): Double = 1d private def sqr(x: Double) = x * x private def cube(x: Double) = x * x * x override def toString = f"RBFKernel(sigma=$sigma%1.1e)" }
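The kernel above evaluates k(x_i, x_j) = exp(-||x_i - x_j||^2 / (2 * sigma^2)), using Vectors.sqdist for the squared Euclidean distance. A standalone sketch of a single kernel entry (plain Scala, reusing nothing from the class):

import org.apache.spark.ml.linalg.Vectors

val x = Vectors.dense(1.0, 2.0)
val y = Vectors.dense(2.0, 3.0)
val sigma = math.sqrt(0.2)
// exp(-||x - y||^2 / (2 * sigma^2))
val k = math.exp(-Vectors.sqdist(x, y) / (2.0 * sigma * sigma))
// k is approximately 6.7379e-3, the value expected for these points in the RBFKernelTest example further below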
Example 172
Source File: Scaling.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.util

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD

private[ml] trait Scaling {
  def scale(data: RDD[LabeledPoint]) = {
    val x = data.map(x => DenseVector(x.features.toArray)).cache()
    val y = data.map(_.label)
    val n = x.count().toDouble
    val mean = x.reduce(_ + _) / n
    val centered = x.map(_ - mean).cache()
    val variance = centered.map(xx => xx *:* xx).reduce(_ + _) / n
    x.unpersist()
    val varianceNoZeroes = variance.map(v => if (v > 0d) v else 1d)
    val scaled = centered.map(_ /:/ sqrt(varianceNoZeroes)).map(_.toArray).map(Vectors.dense).zip(y).map {
      case (f, y) => LabeledPoint(y, f)
    }.cache()
    if (scaled.count() > 0) // ensure scaled is materialized
      centered.unpersist()
    scaled
  }
}
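The scaling performed here is plain per-feature standardization, x' = (x - mean) / sqrt(variance), with zero-variance features passed through unchanged. A toy sketch of that arithmetic on a single feature column (illustrative only, not part of the trait):

val column = Seq(2.0, 4.0, 6.0)
val mean = column.sum / column.size
val variance = column.map(v => (v - mean) * (v - mean)).sum / column.size
val standardized = column.map(v => (v - mean) / math.sqrt(variance))
// mean = 4.0, variance = 8/3, standardized is approximately Seq(-1.2247, 0.0, 1.2247)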
Example 173
Source File: MNIST.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.examples import org.apache.spark.ml.classification.GaussianProcessClassifier import org.apache.spark.ml.commons.kernel.RBFKernel import org.apache.spark.ml.commons.util.Scaling import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit} import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession object MNIST extends App with Scaling { val name = "MNIST" val spark = SparkSession.builder().appName(name).master(s"local[${args(0)}]").getOrCreate() val path = args(1) val parallelism = args(0).toInt * 4 val forExpert = args(2).toInt val activeSet = args(3).toInt import spark.sqlContext.implicits._ val dataset = (scale _ andThen labels201 _) (spark.read.format("csv").load(path).rdd.map(row => { val features = Vectors.dense((1 until row.length).map("_c" + _).map(row.getAs[String]).map(_.toDouble).toArray) val label = row.getAs[String]("_c0").toDouble LabeledPoint(label, features) }).cache()).toDF.repartition(parallelism).cache() val gp = new GaussianProcessClassifier() .setDatasetSizeForExpert(forExpert) .setActiveSetSize(activeSet) .setKernel(() => new RBFKernel(10)) .setTol(1e-3) val cv = new TrainValidationSplit() .setEstimator(gp) .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy")) .setEstimatorParamMaps(new ParamGridBuilder().build()) .setTrainRatio(0.8) println("Accuracy: " + cv.fit(dataset).validationMetrics.toList) def labels201(data: RDD[LabeledPoint]) : RDD[LabeledPoint] = { val old2new = data.map(_.label).distinct().collect().zipWithIndex.toMap data.map(lp => LabeledPoint(old2new(lp.label), lp.features)) } }
Example 174
Source File: Iris.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification.examples import org.apache.spark.ml.classification.{GaussianProcessClassifier, OneVsRest} import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} import org.apache.spark.sql.SparkSession object Iris extends App { val name = "Iris" val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate() import spark.sqlContext.implicits._ val name2indx = Map("Iris-versicolor" -> 0, "Iris-setosa" -> 1, "Iris-virginica" -> 2) val dataset = spark.read.format("csv").load("data/iris.csv").rdd.map(row => { val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3") .map(col => row.getAs[String](col).toDouble)) val label = name2indx(row.getAs[String]("_c4")) LabeledPoint(label, features) }).toDF val gp = new GaussianProcessClassifier().setDatasetSizeForExpert(20).setActiveSetSize(30) val ovr = new OneVsRest().setClassifier(gp) val cv = new CrossValidator() .setEstimator(ovr) .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy")) .setEstimatorParamMaps(new ParamGridBuilder().build()) .setNumFolds(10) println("Accuracy: " + cv.fit(dataset).avgMetrics.toList) }
Example 175
Source File: PerformanceBenchmark.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.benchmark import breeze.linalg.{sum, DenseMatrix => BDM, DenseVector => BDV, _} import breeze.numerics.sin import org.apache.spark.ml.commons.kernel.RBFKernel import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression import org.apache.spark.sql.SparkSession import scala.util.Random object PerformanceBenchmark extends App { val spark = SparkSession.builder() .appName("bench") .master(s"local[${args(0)}]").getOrCreate() import spark.sqlContext.implicits._ val sampleSize = args(2).toInt val nFeatures = 3 val parallelism = args(0).toInt * 4 val expertSampleSize = args(1).toInt val instancesRDD = spark.sparkContext.parallelize(0 until parallelism).flatMap(index => { val random = new Random(13 * index) val X = BDM.create(sampleSize/parallelism, nFeatures, Array.fill(sampleSize * nFeatures/parallelism)(random.nextDouble())) val Y = sin(sum(X(*, ::)) / 1000d).toArray (0 until X.rows).map{ i=> val x = X(i, ::) val y = Y(i) LabeledPoint(y, Vectors.dense(x.t.toArray)) } }) val instances = instancesRDD.toDF.cache() instances.count() val gp = new GaussianProcessRegression() .setKernel(() => new RBFKernel(0.1)) .setDatasetSizeForExpert(expertSampleSize) .setActiveSetSize(expertSampleSize) .setSeed(13) .setSigma2(1e-3) time(gp.fit(instances)) def time[T](f: => T): T = { val start = System.currentTimeMillis() val result = f println("TIME: " + (System.currentTimeMillis() - start)) result } }
Example 176
Source File: Synthetics.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.examples import breeze.linalg._ import breeze.numerics._ import org.apache.spark.ml.commons.KMeansActiveSetProvider import org.apache.spark.ml.commons.kernel.{RBFKernel, WhiteNoiseKernel, _} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression object Synthetics extends App with GPExample { import spark.sqlContext.implicits._ override def name = "Synthetics" val noiseVar = 0.01 val g = breeze.stats.distributions.Gaussian(0, math.sqrt(noiseVar)) val X = linspace(0, 1, length = 2000).toDenseMatrix val Y = sin(X).toArray.map(y => y + g.sample()) val instances = spark.sparkContext.parallelize(X.toArray.zip(Y).map { case(v, y) => LabeledPoint(y, Vectors.dense(Array(v)))}).toDF val gp = new GaussianProcessRegression() .setKernel(() => 1*new RBFKernel(0.1, 1e-6, 10) + WhiteNoiseKernel(0.5, 0, 1)) .setDatasetSizeForExpert(100) .setActiveSetProvider(new KMeansActiveSetProvider()) .setActiveSetSize(100) .setSeed(13) .setSigma2(1e-3) cv(gp, instances, 0.11) }
Example 177
Source File: Airfoil.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression.examples import org.apache.spark.ml.commons.kernel.{ARDRBFKernel, _} import org.apache.spark.ml.commons.util.Scaling import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.GaussianProcessRegression object Airfoil extends App with GPExample with Scaling { import spark.sqlContext.implicits._ override def name = "Airfoil" val airfoil = readSCV("data/airfoil.csv") val scaled = scale(airfoil).toDF val gp = new GaussianProcessRegression() .setActiveSetSize(1000) .setSigma2(1e-4) .setKernel(() => 1 * new ARDRBFKernel(5) + 1.const * new EyeKernel) cv(gp, scaled, 2.1) def readSCV(path : String) = { spark.read.format("csv").load(path).rdd.map(row => { val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3", "_c4") .map(col => row.getAs[String](col).toDouble)) LabeledPoint(row.getAs[String]("_c5").toDouble, features) }) } }
Example 178
Source File: ARDRBFKernelTest.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{all, DenseMatrix => BDM, DenseVector => BDV} import breeze.numerics.abs import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite class ARDRBFKernelTest extends FunSuite { private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense) private def computationalDerivative(beta: BDV[Double], h: Double): BDM[Double] = { val left = new ARDRBFKernel(beta - h) val right = new ARDRBFKernel(beta + h) left.setTrainingVectors(dataset) right.setTrainingVectors(dataset) (right.trainingKernel() - left.trainingKernel()) / (2 * h) } test("being called after `setTrainingVector`," + " `derivative` should return the correct kernel matrix derivative") { val beta = BDV[Double](0.2, 0.3) val ard = new ARDRBFKernel(beta) ard.setTrainingVectors(dataset) val analytical = ard.trainingKernelAndDerivative()._2.reduce(_ + _) val computational = computationalDerivative(beta, 1e-3) assert(all(abs(analytical - computational) <:< 1e-3)) } }
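The test validates the analytical gradient against a central finite difference, (K(beta + h) - K(beta - h)) / (2h), whose error shrinks as O(h^2). A generic one-dimensional sketch of the same style of check (plain Scala, unrelated to the kernel classes):

// Central-difference check of d/dx exp(-x): analytical vs. numerical.
val f: Double => Double = x => math.exp(-x)
val dfAnalytical: Double => Double = x => -math.exp(-x)
val h = 1e-3
val x0 = 0.7
val dfNumerical = (f(x0 + h) - f(x0 - h)) / (2 * h)
assert(math.abs(dfNumerical - dfAnalytical(x0)) < 1e-6)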
Example 179
Source File: RBFKernelTest.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix, DenseVector, all} import breeze.numerics.abs import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite class RBFKernelTest extends FunSuite { test("Calling `trainingKernel` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernel() } } test("Calling `derivative` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernelAndDerivative() } } private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense) test("being called after `setTrainingVector`," + " `trainingKernel` should return the correct kernel matrix") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset) val correctKernelMatrix = DenseMatrix((1.000000e+00, 6.737947e-03, 3.053624e-45), (6.737947e-03, 1.000000e+00, 7.187782e-28), (3.053624e-45, 7.187782e-28, 1.000000e+00)) assert(all(abs(rbf.trainingKernel() - correctKernelMatrix) <:< 1e-4)) } private def computationalDerivative(sigma: Double, h: Double) = { val rbfLeft = new RBFKernel(sigma - h) val rbfRight = new RBFKernel(sigma + h) rbfLeft.setTrainingVectors(dataset) rbfRight.setTrainingVectors(dataset) (rbfRight.trainingKernel() - rbfLeft.trainingKernel()) / (2 * h) } test("being called after `setTrainingVector`," + " `derivative` should return the correct kernel matrix derivative") { val rbf = new RBFKernel(0.2) rbf.setTrainingVectors(dataset) val analytical = rbf.trainingKernelAndDerivative()._2(0) val computational = computationalDerivative(0.2, 1e-3) assert(all(abs(analytical - computational) <:< 1e-3)) } test("crossKernel returns correct kernel") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset.take(1)) val correctCrossKernel = DenseMatrix((6.737947e-03, 3.053624e-45)) assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } test("crossKernel returns correct kernel if called on a single vector") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset(0)) val correctCrossKernel = DenseVector(6.737947e-03, 3.053624e-45).t assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } }
Example 180
Source File: SparkVector.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.vector

import org.apache.spark.ml.linalg.{Vector, Vectors}

object SparkVector {

  def main(args: Array[String]): Unit = {
    // Create a dense vector (1.0, 0.0, 2.0).
    val dVectorOne: Vector = Vectors.dense(1.0, 0.0, 2.0)
    println("dVectorOne:" + dVectorOne)

    // Create a sparse vector (1.0, 0.0, 2.0, 3.0) by specifying the indices
    // and values of its nonzero entries.
    val sVectorOne: Vector = Vectors.sparse(4, Array(0, 2, 3), Array(1.0, 2.0, 3.0))

    // Create the same sparse vector (1.0, 0.0, 2.0, 3.0) by specifying its
    // nonzero entries as (index, value) pairs.
    val sVectorTwo: Vector = Vectors.sparse(4, Seq((0, 1.0), (2, 2.0), (3, 3.0)))

    println("sVectorOne:" + sVectorOne)
    println("sVectorTwo:" + sVectorTwo)

    val sVectorOneMax = sVectorOne.argmax
    val sVectorOneNumNonZeros = sVectorOne.numNonzeros
    val sVectorOneSize = sVectorOne.size
    val sVectorOneArray = sVectorOne.toArray

    println("sVectorOneMax:" + sVectorOneMax)
    println("sVectorOneNumNonZeros:" + sVectorOneNumNonZeros)
    println("sVectorOneSize:" + sVectorOneSize)
    println("sVectorOneArray:" + sVectorOneArray.mkString(", "))

    val dVectorOneToSparse = dVectorOne.toSparse
    println("dVectorOneToSparse:" + dVectorOneToSparse)
  }
}
Example 181
Source File: MLUserDefinedType.scala From spark-testing-base with Apache License 2.0 | 5 votes |
package com.holdenkarau.spark.testing import org.apache.spark.sql.types.DataType import org.apache.spark.ml.linalg.SQLDataTypes.{MatrixType, VectorType} import org.apache.spark.ml.linalg.{DenseMatrix, Vectors} import org.scalacheck.{Arbitrary, Gen} object MLUserDefinedType { def unapply(dataType: DataType): Option[Gen[Any]] = dataType match { case MatrixType => { val dense = for { rows <- Gen.choose(0, 20) cols <- Gen.choose(0, 20) values <- Gen.containerOfN[Array, Double](rows * cols, Arbitrary.arbitrary[Double]) } yield new DenseMatrix(rows, cols, values) val sparse = dense.map(_.toSparse) Some(Gen.oneOf(dense, sparse)) } case VectorType => { val dense = Arbitrary.arbitrary[Array[Double]].map(Vectors.dense) val sparse = for { indices <- Gen.nonEmptyContainerOf[Set, Int](Gen.choose(0, Int.MaxValue - 1)) values <- Gen.listOfN(indices.size, Arbitrary.arbitrary[Double]) } yield Vectors.sparse(indices.max + 1, indices.toSeq.zip(values)) Some(Gen.oneOf(dense, sparse)) } case _ => None } }
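A hypothetical usage sketch (not from spark-testing-base) showing how the extractor above could supply a ScalaCheck generator of arbitrary ml Vectors for the VectorType UDT; the sampling loop and printed fields are illustrative assumptions:

import org.apache.spark.ml.linalg.{SQLDataTypes, Vector}
import org.scalacheck.Gen

// Obtain the generator that MLUserDefinedType associates with VectorType.
val vectorGen: Gen[Any] = SQLDataTypes.VectorType match {
  case MLUserDefinedType(gen) => gen
}
// Sample a few vectors and inspect them (Gen.sample may return None, hence flatten).
val samples: Seq[Vector] = Seq.fill(3)(vectorGen.sample).flatten.map(_.asInstanceOf[Vector])
samples.foreach(v => println(s"size=${v.size} nonZeros=${v.numNonzeros}"))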
Example 182
Source File: LogisticRegressionWorkload.scala From spark-bench with Apache License 2.0 | 5 votes |
package com.ibm.sparktc.sparkbench.workload.ml import com.ibm.sparktc.sparkbench.utils.GeneralFunctions._ import com.ibm.sparktc.sparkbench.utils.SaveModes import com.ibm.sparktc.sparkbench.workload.{Workload, WorkloadDefaults} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator => BCE} import org.apache.spark.sql.{DataFrame, Row, SparkSession} // ¯\_(ツ)_/¯ // the logic for this workload came from: // https://github.com/szilard/benchm-ml/blob/master/1-linear/5-spark.txt // ¯\_(ツ)_/¯ case class LogisticRegressionResult( name: String, appid: String, start_time: Long, input: String, train_count: Long, train_time: Long, test_file: String, test_count: Long, test_time: Long, load_time: Long, count_time: Long, total_runtime: Long, area_under_roc: Double ) object LogisticRegressionWorkload extends WorkloadDefaults { val name = "lr-bml" def apply(m: Map[String, Any]) = new LogisticRegressionWorkload( input = Some(getOrThrow(m, "input").asInstanceOf[String]), output = getOrDefault[Option[String]](m, "workloadresultsoutputdir", None), saveMode = getOrDefault[String](m, "save-mode", SaveModes.error), testFile = getOrThrow(m, "testfile").asInstanceOf[String], numPartitions = getOrDefault[Int](m, "numpartitions", 32), cacheEnabled = getOrDefault[Boolean](m, "cacheenabled", true) ) } case class LogisticRegressionWorkload( input: Option[String], output: Option[String], saveMode: String, testFile: String, numPartitions: Int, cacheEnabled: Boolean ) extends Workload { private[ml] def load(filename: String)(implicit spark: SparkSession): DataFrame = { import spark.implicits._ spark.sparkContext.textFile(filename) .map { line => val vv = line.split(',').map(_.toDouble) val label = vv(0) val features = Vectors.dense(vv.slice(1, vv.length)).toSparse (label, features) }.toDF("label", "features") } private[ml] def ld(fn: String)(implicit spark: SparkSession) = time { val ds = load(fn)(spark).repartition(numPartitions) if (cacheEnabled) ds.cache ds } override def doWorkload(df: Option[DataFrame], spark: SparkSession): DataFrame = { val startTime = System.currentTimeMillis val (ltrainTime, d_train) = ld(s"${input.get}")(spark) val (ltestTime, d_test) = ld(s"$testFile")(spark) val (countTime, (trainCount, testCount)) = time { (d_train.count(), d_test.count()) } val (trainTime, model) = time(new LogisticRegression().setTol(1e-4).fit(d_train)) val (testTime, areaUnderROC) = time(new BCE().setMetricName("areaUnderROC").evaluate(model.transform(d_test))) val loadTime = ltrainTime + ltestTime //spark.createDataFrame(Seq(SleepResult("sleep", timestamp, t))) spark.createDataFrame(Seq(LogisticRegressionResult( name = "lr-bml", appid = spark.sparkContext.applicationId, startTime, input.get, train_count = trainCount, trainTime, testFile, test_count = testCount, testTime, loadTime, countTime, loadTime + trainTime + testTime, areaUnderROC ))) } }
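For clarity, the private load method above turns each CSV line into a (label, sparse features) pair, treating column 0 as the label. A hand-worked sketch of that parse for a single line (illustrative only, not part of the workload):

import org.apache.spark.ml.linalg.Vectors

val line = "1.0,0.0,2.5,0.0"
val vv = line.split(',').map(_.toDouble)
val label = vv(0)
val features = Vectors.dense(vv.slice(1, vv.length)).toSparse
// label == 1.0, features == (3,[1],[2.5])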
Example 183
Source File: MultivariateGaussianSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat.distribution import org.apache.spark.ml.SparkMLFunSuite import org.apache.spark.ml.linalg.{Matrices, Vectors} import org.apache.spark.ml.util.TestingUtils._ class MultivariateGaussianSuite extends SparkMLFunSuite { test("univariate") { val x1 = Vectors.dense(0.0) val x2 = Vectors.dense(1.5) val mu = Vectors.dense(0.0) val sigma1 = Matrices.dense(1, 1, Array(1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.39894 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.12952 absTol 1E-5) val sigma2 = Matrices.dense(1, 1, Array(4.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.19947 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.15057 absTol 1E-5) } test("multivariate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma1 = Matrices.dense(2, 2, Array(1.0, 0.0, 0.0, 1.0)) val dist1 = new MultivariateGaussian(mu, sigma1) assert(dist1.pdf(x1) ~== 0.15915 absTol 1E-5) assert(dist1.pdf(x2) ~== 0.05855 absTol 1E-5) val sigma2 = Matrices.dense(2, 2, Array(4.0, -1.0, -1.0, 2.0)) val dist2 = new MultivariateGaussian(mu, sigma2) assert(dist2.pdf(x1) ~== 0.060155 absTol 1E-5) assert(dist2.pdf(x2) ~== 0.033971 absTol 1E-5) } test("multivariate degenerate") { val x1 = Vectors.dense(0.0, 0.0) val x2 = Vectors.dense(1.0, 1.0) val mu = Vectors.dense(0.0, 0.0) val sigma = Matrices.dense(2, 2, Array(1.0, 1.0, 1.0, 1.0)) val dist = new MultivariateGaussian(mu, sigma) assert(dist.pdf(x1) ~== 0.11254 absTol 1E-5) assert(dist.pdf(x2) ~== 0.068259 absTol 1E-5) } test("SPARK-11302") { val x = Vectors.dense(629, 640, 1.7188, 618.19) val mu = Vectors.dense( 1055.3910505836575, 1070.489299610895, 1.39020554474708, 1040.5907503867697) val sigma = Matrices.dense(4, 4, Array( 166769.00466698944, 169336.6705268059, 12.820670788921873, 164243.93314092053, 169336.6705268059, 172041.5670061245, 21.62590020524533, 166678.01075856484, 12.820670788921873, 21.62590020524533, 0.872524191943962, 4.283255814732373, 164243.93314092053, 166678.01075856484, 4.283255814732373, 161848.9196719207)) val dist = new MultivariateGaussian(mu, sigma) // Agrees with R's dmvnorm: 7.154782e-05 assert(dist.pdf(x) ~== 7.154782224045512E-5 absTol 1E-9) } }
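As a quick sanity check on the first expected value in the univariate test, the standard normal density at x = 0 is 1 / sqrt(2 * pi) (illustrative, not part of the suite):

val standardNormalAtZero = 1.0 / math.sqrt(2.0 * math.Pi)
// 0.3989422804014327, matching the 0.39894 expected above within the 1e-5 tolerance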
Example 184
Source File: AFTSurvivalRegressionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.regression.AFTSurvivalRegression // $example off$ import org.apache.spark.sql.SparkSession object AFTSurvivalRegressionExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("AFTSurvivalRegressionExample") .getOrCreate() // $example on$ val training = spark.createDataFrame(Seq( (1.218, 1.0, Vectors.dense(1.560, -0.605)), (2.949, 0.0, Vectors.dense(0.346, 2.158)), (3.627, 0.0, Vectors.dense(1.380, 0.231)), (0.273, 1.0, Vectors.dense(0.520, 1.151)), (4.199, 0.0, Vectors.dense(0.795, -0.226)) )).toDF("label", "censor", "features") val quantileProbabilities = Array(0.3, 0.6) val aft = new AFTSurvivalRegression() .setQuantileProbabilities(quantileProbabilities) .setQuantilesCol("quantiles") val model = aft.fit(training) // Print the coefficients, intercept and scale parameter for AFT survival regression println(s"Coefficients: ${model.coefficients}") println(s"Intercept: ${model.intercept}") println(s"Scale: ${model.scale}") model.transform(training).show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 185
Source File: NormalizerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object NormalizerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("NormalizerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.5, -1.0)), (1, Vectors.dense(2.0, 1.0, 1.0)), (2, Vectors.dense(4.0, 10.0, 2.0)) )).toDF("id", "features") // Normalize each Vector using $L^1$ norm. val normalizer = new Normalizer() .setInputCol("features") .setOutputCol("normFeatures") .setP(1.0) val l1NormData = normalizer.transform(dataFrame) println("Normalized using L^1 norm") l1NormData.show() // Normalize each Vector using $L^\infty$ norm. val lInfNormData = normalizer.transform(dataFrame, normalizer.p -> Double.PositiveInfinity) println("Normalized using L^inf norm") lInfNormData.show() // $example off$ spark.stop() } } // scalastyle:on println
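For reference, L^1 normalization divides each component by the sum of absolute values; the first row above, (1.0, 0.5, -1.0), has an L^1 norm of 2.5 and normalizes to (0.4, 0.2, -0.4). The same arithmetic by hand (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val v = Vectors.dense(1.0, 0.5, -1.0)
val l1Norm = v.toArray.map(math.abs).sum                   // 2.5
val normalized = Vectors.dense(v.toArray.map(_ / l1Norm))  // [0.4, 0.2, -0.4]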
Example 186
Source File: VectorSlicerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import java.util.Arrays import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.feature.VectorSlicer import org.apache.spark.ml.linalg.Vectors import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType // $example off$ import org.apache.spark.sql.SparkSession object VectorSlicerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorSlicerExample") .getOrCreate() // $example on$ val data = Arrays.asList( Row(Vectors.sparse(3, Seq((0, -2.0), (1, 2.3)))), Row(Vectors.dense(-2.0, 2.3, 0.0)) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f1", "f2", "f3").map(defaultAttr.withName) val attrGroup = new AttributeGroup("userFeatures", attrs.asInstanceOf[Array[Attribute]]) val dataset = spark.createDataFrame(data, StructType(Array(attrGroup.toStructField()))) val slicer = new VectorSlicer().setInputCol("userFeatures").setOutputCol("features") slicer.setIndices(Array(1)).setNames(Array("f3")) // or slicer.setIndices(Array(1, 2)), or slicer.setNames(Array("f2", "f3")) val output = slicer.transform(dataset) output.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 187
Source File: ChiSqSelectorExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.ChiSqSelector import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object ChiSqSelectorExample { def main(args: Array[String]) { val spark = SparkSession .builder .appName("ChiSqSelectorExample") .getOrCreate() import spark.implicits._ // $example on$ val data = Seq( (7, Vectors.dense(0.0, 0.0, 18.0, 1.0), 1.0), (8, Vectors.dense(0.0, 1.0, 12.0, 0.0), 0.0), (9, Vectors.dense(1.0, 0.0, 15.0, 0.1), 0.0) ) val df = spark.createDataset(data).toDF("id", "features", "clicked") val selector = new ChiSqSelector() .setNumTopFeatures(1) .setFeaturesCol("features") .setLabelCol("clicked") .setOutputCol("selectedFeatures") val result = selector.fit(df).transform(df) println(s"ChiSqSelector output with top ${selector.getNumTopFeatures} features selected") result.show() // $example off$ spark.stop() } } // scalastyle:on println
Example 188
Source File: DCTExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.DCT import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object DCTExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("DCTExample") .getOrCreate() // $example on$ val data = Seq( Vectors.dense(0.0, 1.0, -2.0, 3.0), Vectors.dense(-1.0, 2.0, 4.0, -7.0), Vectors.dense(14.0, -2.0, -5.0, 1.0)) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val dct = new DCT() .setInputCol("features") .setOutputCol("featuresDCT") .setInverse(false) val dctDf = dct.transform(df) dctDf.select("featuresDCT").show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 189
Source File: VectorAssemblerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object VectorAssemblerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("VectorAssemblerExample") .getOrCreate() // $example on$ val dataset = spark.createDataFrame( Seq((0, 18, 1.0, Vectors.dense(0.0, 10.0, 0.5), 1.0)) ).toDF("id", "hour", "mobile", "userFeatures", "clicked") val assembler = new VectorAssembler() .setInputCols(Array("hour", "mobile", "userFeatures")) .setOutputCol("features") val output = assembler.transform(dataset) println("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'") output.select("features", "clicked").show(false) // $example off$ spark.stop() } } // scalastyle:on println
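The assembled "features" column for the single input row above is simply the concatenation of the selected columns in order. A hand-worked sketch of that concatenation (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val hour = 18.0
val mobile = 1.0
val userFeatures = Vectors.dense(0.0, 10.0, 0.5)
val assembled = Vectors.dense(Array(hour, mobile) ++ userFeatures.toArray)
// assembled == [18.0, 1.0, 0.0, 10.0, 0.5], the value the example prints for "features"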
Example 190
Source File: PCAExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.PCA import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object PCAExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("PCAExample") .getOrCreate() // $example on$ val data = Array( Vectors.sparse(5, Seq((1, 1.0), (3, 7.0))), Vectors.dense(2.0, 0.0, 3.0, 4.0, 5.0), Vectors.dense(4.0, 0.0, 0.0, 6.0, 7.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) .fit(df) val result = pca.transform(df).select("pcaFeatures") result.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 191
Source File: ElementwiseProductExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.ElementwiseProduct import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object ElementwiseProductExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("ElementwiseProductExample") .getOrCreate() // $example on$ // Create some vector data; also works for sparse vectors val dataFrame = spark.createDataFrame(Seq( ("a", Vectors.dense(1.0, 2.0, 3.0)), ("b", Vectors.dense(4.0, 5.0, 6.0)))).toDF("id", "vector") val transformingVector = Vectors.dense(0.0, 1.0, 2.0) val transformer = new ElementwiseProduct() .setScalingVec(transformingVector) .setInputCol("vector") .setOutputCol("transformedVector") // Batch transform the vectors to create new column: transformer.transform(dataFrame).show() // $example off$ spark.stop() } } // scalastyle:on println
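ElementwiseProduct multiplies each input vector component-wise (Hadamard product) by the scaling vector. For row "a" above that is (1.0, 2.0, 3.0) times (0.0, 1.0, 2.0) element by element, giving (0.0, 2.0, 6.0); a hand-worked sketch (illustrative only):

import org.apache.spark.ml.linalg.Vectors

val row = Vectors.dense(1.0, 2.0, 3.0)
val scalingVec = Vectors.dense(0.0, 1.0, 2.0)
val transformed = Vectors.dense(row.toArray.zip(scalingVec.toArray).map { case (a, b) => a * b })
// transformed == [0.0, 2.0, 6.0]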
Example 192
Source File: MinMaxScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object MinMaxScalerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("MinMaxScalerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.1, -1.0)), (1, Vectors.dense(2.0, 1.1, 1.0)), (2, Vectors.dense(3.0, 10.1, 3.0)) )).toDF("id", "features") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaledFeatures") // Compute summary statistics and generate MinMaxScalerModel val scalerModel = scaler.fit(dataFrame) // rescale each feature to range [min, max]. val scaledData = scalerModel.transform(dataFrame) println(s"Features scaled to range: [${scaler.getMin}, ${scaler.getMax}]") scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() } } // scalastyle:on println
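MinMaxScaler rescales each feature to x' = (x - featureMin) / (featureMax - featureMin) * (max - min) + min, using the default output range [0, 1] here. For the first feature above (values 1.0, 2.0, 3.0) that gives 0.0, 0.5 and 1.0; the same arithmetic by hand (illustrative only):

val values = Seq(1.0, 2.0, 3.0)
val (featureMin, featureMax) = (values.min, values.max)
val rescaled = values.map(x => (x - featureMin) / (featureMax - featureMin)) // Seq(0.0, 0.5, 1.0)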
Example 193
Source File: PolynomialExpansionExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object PolynomialExpansionExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("PolynomialExpansionExample") .getOrCreate() // $example on$ val data = Array( Vectors.dense(2.0, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(3.0, -1.0) ) val df = spark.createDataFrame(data.map(Tuple1.apply)).toDF("features") val polyExpansion = new PolynomialExpansion() .setInputCol("features") .setOutputCol("polyFeatures") .setDegree(3) val polyDF = polyExpansion.transform(df) polyDF.show(false) // $example off$ spark.stop() } } // scalastyle:on println
Example 194
Source File: MaxAbsScalerExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.MaxAbsScaler import org.apache.spark.ml.linalg.Vectors // $example off$ import org.apache.spark.sql.SparkSession object MaxAbsScalerExample { def main(args: Array[String]): Unit = { val spark = SparkSession .builder .appName("MaxAbsScalerExample") .getOrCreate() // $example on$ val dataFrame = spark.createDataFrame(Seq( (0, Vectors.dense(1.0, 0.1, -8.0)), (1, Vectors.dense(2.0, 1.0, -4.0)), (2, Vectors.dense(4.0, 10.0, 8.0)) )).toDF("id", "features") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaledFeatures") // Compute summary statistics and generate MaxAbsScalerModel val scalerModel = scaler.fit(dataFrame) // rescale each feature to range [-1, 1] val scaledData = scalerModel.transform(dataFrame) scaledData.select("features", "scaledFeatures").show() // $example off$ spark.stop() } }
Example 195
Source File: DCT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 196
Source File: MultilayerPerceptronClassifierWrapper.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.r import org.apache.hadoop.fs.Path import org.json4s._ import org.json4s.JsonDSL._ import org.json4s.jackson.JsonMethods._ import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier} import org.apache.spark.ml.feature.{IndexToString, RFormula} import org.apache.spark.ml.linalg.Vectors import org.apache.spark.ml.r.RWrapperUtils._ import org.apache.spark.ml.util.{MLReadable, MLReader, MLWritable, MLWriter} import org.apache.spark.sql.{DataFrame, Dataset} private[r] class MultilayerPerceptronClassifierWrapper private ( val pipeline: PipelineModel ) extends MLWritable { import MultilayerPerceptronClassifierWrapper._ val mlpModel: MultilayerPerceptronClassificationModel = pipeline.stages(1).asInstanceOf[MultilayerPerceptronClassificationModel] val weights: Array[Double] = mlpModel.weights.toArray val layers: Array[Int] = mlpModel.layers def transform(dataset: Dataset[_]): DataFrame = { pipeline.transform(dataset) .drop(mlpModel.getFeaturesCol) .drop(mlpModel.getLabelCol) .drop(PREDICTED_LABEL_INDEX_COL) } override def read: MLReader[MultilayerPerceptronClassifierWrapper] = new MultilayerPerceptronClassifierWrapperReader override def load(path: String): MultilayerPerceptronClassifierWrapper = super.load(path) class MultilayerPerceptronClassifierWrapperReader extends MLReader[MultilayerPerceptronClassifierWrapper]{ override def load(path: String): MultilayerPerceptronClassifierWrapper = { implicit val format = DefaultFormats val pipelinePath = new Path(path, "pipeline").toString val pipeline = PipelineModel.load(pipelinePath) new MultilayerPerceptronClassifierWrapper(pipeline) } } class MultilayerPerceptronClassifierWrapperWriter(instance: MultilayerPerceptronClassifierWrapper) extends MLWriter { override protected def saveImpl(path: String): Unit = { val rMetadataPath = new Path(path, "rMetadata").toString val pipelinePath = new Path(path, "pipeline").toString val rMetadata = "class" -> instance.getClass.getName val rMetadataJson: String = compact(render(rMetadata)) sc.parallelize(Seq(rMetadataJson), 1).saveAsTextFile(rMetadataPath) instance.pipeline.save(pipelinePath) } } }
Example 197
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 198
Source File: MaxAbsScalerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 199
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.param.ParamsSuite
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Dataset, Row}

class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext
  with DefaultReadWriteTest {

  @transient var dataset: Dataset[_] = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    // Toy dataset, including the top feature for a chi-squared test.
    // These data are chosen such that each feature's test has a distinct p-value.
  }

  val allParamSettings: Map[String, Any] = Map(
    "selectorType" -> "percentile",
    "numTopFeatures" -> 1,
    "percentile" -> 0.12,
    "outputCol" -> "myOutput"
  )
}
Example 200
Source File: DCTSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }