org.apache.spark.ml.linalg.Vector Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.Vector.
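As a quick orientation before the project examples: org.apache.spark.ml.linalg.Vector is the DataFrame-based ML vector type, distinct from the older org.apache.spark.mllib.linalg.Vector used by the RDD API. The minimal sketch below is a standalone snippet written for this page (the object name VectorBasics is illustrative, not taken from any of the projects listed); it shows the two common ways to construct a Vector and how to read its values.

import org.apache.spark.ml.linalg.{Vector, Vectors}

object VectorBasics {
  def main(args: Array[String]): Unit = {
    // Dense vector: every entry is stored explicitly.
    val dense: Vector = Vectors.dense(1.0, 0.0, 3.0)

    // Sparse vector: size 3, with non-zero entries at indices 0 and 2.
    val sparse: Vector = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

    // Both share the same Vector interface.
    println(dense(2))                      // 3.0
    println(sparse.toArray.mkString(", ")) // 1.0, 0.0, 3.0
    println(Vectors.sqdist(dense, sparse)) // 0.0 -- same values, different storage
  }
}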
Example 1
Source File: DataFrameExample.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 2
Source File: DCT.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 3
Source File: SparkPFASuiteBase.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.pfa import com.holdenkarau.spark.testing.DataFrameSuiteBase import org.apache.spark.SparkConf import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.functions.udf import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.scalactic.Equality import org.scalatest.FunSuite abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils { val sparkTransformer: Transformer val input: Array[String] val expectedOutput: Array[String] val sparkConf = new SparkConf(). setMaster("local[*]"). setAppName("test"). set("spark.ui.enabled", "false"). set("spark.app.id", appID). set("spark.driver.host", "localhost") override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate() override val reuseContextIfPossible = true // Converts column containing a vector to an array def withColumnAsArray(df: DataFrame, colName: String) = { val vecToArray = udf { v: Vector => v.toArray } df.withColumn(colName, vecToArray(df(colName))) } def withColumnAsArray(df: DataFrame, first: String, others: String*) = { val vecToArray = udf { v: Vector => v.toArray } var result = df.withColumn(first, vecToArray(df(first))) others.foreach(c => result = result.withColumn(c, vecToArray(df(c)))) result } // Converts column containing a vector to a sparse vector represented as a map def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = { val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap } df.withColumn(colName, vecToMap(df(colName))) } } abstract class Result object ApproxEquality extends ApproxEquality trait ApproxEquality { import org.scalactic.Tolerance._ import org.scalactic.TripleEquals._ implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] { override def areEqual(a: Seq[Double], b: Any): Boolean = { b match { case d: Seq[Double] => a.zip(d).forall { case (l, r) => l === r +- 0.001 } case _ => false } } } implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] { override def areEqual(a: Vector, b: Any): Boolean = { b match { case v: Vector => a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 } case _ => false } } } }
Example 4
Source File: Normalizer.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.{DoubleParam, ParamValidators}
import org.apache.spark.ml.util._
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.sql.types.DataType

  @Since("1.4.0")
  def setP(value: Double): this.type = set(p, value)

  override protected def createTransformFunc: Vector => Vector = {
    val normalizer = new feature.Normalizer($(p))
    vector => normalizer.transform(OldVectors.fromML(vector)).asML
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("1.6.0")
object Normalizer extends DefaultParamsReadable[Normalizer] {

  @Since("1.6.0")
  override def load(path: String): Normalizer = super.load(path)
}
Example 5
Source File: MinMaxScalerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Row class MinMaxScalerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("MinMaxScaler fit basic case") { val data = Array( Vectors.dense(1, 0, Long.MinValue), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(3, Long.MaxValue)), Vectors.sparse(3, Array(0), Array(1.5))) val expected: Array[Vector] = Array( Vectors.dense(-5, 0, -5), Vectors.dense(0, 0, 0), Vectors.sparse(3, Array(0, 2), Array(5, 5)), Vectors.sparse(3, Array(0), Array(-2.5))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaled") .setMin(-5) .setMax(5) val model = scaler.fit(df) testTransformer[(Vector, Vector)](df, model, "expected", "scaled") { case Row(vector1: Vector, vector2: Vector) => assert(vector1 === vector2, "Transformed vector is different with expected.") } MLTestingUtils.checkCopyAndUids(scaler, model) } test("MinMaxScaler arguments max must be larger than min") { withClue("arguments max must be larger than min") { val dummyDF = Seq((1, Vectors.dense(1.0, 2.0))).toDF("id", "features") intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(10).setMax(0).setInputCol("features") scaler.transformSchema(dummyDF.schema) } intercept[IllegalArgumentException] { val scaler = new MinMaxScaler().setMin(0).setMax(0).setInputCol("features") scaler.transformSchema(dummyDF.schema) } } } test("MinMaxScaler read/write") { val t = new MinMaxScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMax(1.0) .setMin(-1.0) testDefaultReadWrite(t) } test("MinMaxScalerModel read/write") { val instance = new MinMaxScalerModel( "myMinMaxScalerModel", Vectors.dense(-1.0, 0.0), Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setMin(-1.0) .setMax(1.0) val newInstance = testDefaultReadWrite(instance) assert(newInstance.originalMin === instance.originalMin) assert(newInstance.originalMax === instance.originalMax) } test("MinMaxScaler should remain NaN value") { val data = Array( Vectors.dense(1, Double.NaN, 2.0, 2.0), Vectors.dense(2, 2.0, 0.0, 3.0), Vectors.dense(3, Double.NaN, 0.0, 1.0), Vectors.dense(6, 2.0, 2.0, Double.NaN)) val expected: Array[Vector] = Array( Vectors.dense(-5.0, Double.NaN, 5.0, 0.0), Vectors.dense(-3.0, 0.0, -5.0, 5.0), Vectors.dense(-1.0, Double.NaN, -5.0, -5.0), Vectors.dense(5.0, 0.0, 5.0, Double.NaN)) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MinMaxScaler() .setInputCol("features") .setOutputCol("scaled") .setMin(-5) .setMax(5) val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1 === vector2, "Transformed vector is different with expected.") } } }
Example 6
Source File: BinarizerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") testTransformer[(Double, Double)](dataFrame, binarizer, "binarized_feature", "expected") { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 7
Source File: ElementwiseProductSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.sql.Row

class ElementwiseProductSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  test("streaming transform") {
    val scalingVec = Vectors.dense(0.1, 10.0)
    val data = Seq(
      (Vectors.dense(0.1, 1.0), Vectors.dense(0.01, 10.0)),
      (Vectors.dense(0.0, -1.1), Vectors.dense(0.0, -11.0))
    )
    val df = spark.createDataFrame(data).toDF("features", "expected")
    val ep = new ElementwiseProduct()
      .setInputCol("features")
      .setOutputCol("actual")
      .setScalingVec(scalingVec)
    testTransformer[(Vector, Vector)](df, ep, "actual", "expected") {
      case Row(actual: Vector, expected: Vector) =>
        assert(actual ~== expected relTol 1e-14)
    }
  }

  test("read/write") {
    val ep = new ElementwiseProduct()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setScalingVec(Vectors.dense(0.1, 0.2))
    testDefaultReadWrite(ep)
  }
}
Example 8
Source File: DCTSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) testTransformer[(Vector, Vector)](dataset, transformer, "resultVec", "wantedVec") { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 9
Source File: MaxAbsScalerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils} import org.apache.spark.sql.Row class MaxAbsScalerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) testTransformer[(Vector, Vector)](df, model, "expected", "scaled") { case Row(expectedVec: Vector, actualVec: Vector) => assert(expectedVec === actualVec, s"MaxAbsScaler error: Expected $expectedVec but computed $actualVec") } MLTestingUtils.checkCopyAndUids(scaler, model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 10
Source File: RDDLossFunction.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.optim.loss

import scala.reflect.ClassTag

import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.DiffFunction

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregator
import org.apache.spark.rdd.RDD

private[ml] class RDDLossFunction[
    T: ClassTag,
    Agg <: DifferentiableLossAggregator[T, Agg]: ClassTag](
    instances: RDD[T],
    getAggregator: (Broadcast[Vector] => Agg),
    regularization: Option[DifferentiableRegularization[Vector]],
    aggregationDepth: Int = 2)
  extends DiffFunction[BDV[Double]] {

  override def calculate(coefficients: BDV[Double]): (Double, BDV[Double]) = {
    val bcCoefficients = instances.context.broadcast(Vectors.fromBreeze(coefficients))
    val thisAgg = getAggregator(bcCoefficients)
    val seqOp = (agg: Agg, x: T) => agg.add(x)
    val combOp = (agg1: Agg, agg2: Agg) => agg1.merge(agg2)
    val newAgg = instances.treeAggregate(thisAgg)(seqOp, combOp, aggregationDepth)
    val gradient = newAgg.gradient
    val regLoss = regularization.map { regFun =>
      val (regLoss, regGradient) = regFun.calculate(Vectors.fromBreeze(coefficients))
      BLAS.axpy(1.0, regGradient, gradient)
      regLoss
    }.getOrElse(0.0)
    bcCoefficients.destroy(blocking = false)
    (newAgg.loss + regLoss, gradient.asBreeze.toDenseVector)
  }
}
Example 11
Source File: ChiSquareTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.stat

import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.util.SchemaUtils
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.stat.{Statistics => OldStatistics}
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.col

  @Since("2.2.0")
  def test(dataset: DataFrame, featuresCol: String, labelCol: String): DataFrame = {
    val spark = dataset.sparkSession
    import spark.implicits._

    SchemaUtils.checkColumnType(dataset.schema, featuresCol, new VectorUDT)
    SchemaUtils.checkNumericType(dataset.schema, labelCol)
    val rdd = dataset.select(col(labelCol).cast("double"), col(featuresCol)).as[(Double, Vector)]
      .rdd.map { case (label, features) => OldLabeledPoint(label, OldVectors.fromML(features)) }
    val testResults = OldStatistics.chiSqTest(rdd)
    val pValues: Vector = Vectors.dense(testResults.map(_.pValue))
    val degreesOfFreedom: Array[Int] = testResults.map(_.degreesOfFreedom)
    val statistics: Vector = Vectors.dense(testResults.map(_.statistic))
    spark.createDataFrame(Seq(ChiSquareResult(pValues, degreesOfFreedom, statistics)))
  }
}
Example 12
Source File: BinaryClassificationEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
Example 13
Source File: DCT.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import edu.emory.mathcs.jtransforms.dct._

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT}
import org.apache.spark.ml.param.BooleanParam
import org.apache.spark.ml.util._
import org.apache.spark.sql.types.DataType

  @Since("1.5.0")
  def getInverse: Boolean = $(inverse)

  setDefault(inverse -> false)

  override protected def createTransformFunc: Vector => Vector = { vec =>
    val result = vec.toArray
    val jTransformer = new DoubleDCT_1D(result.length)
    if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true)
    Vectors.dense(result)
  }

  override protected def validateInputType(inputType: DataType): Unit = {
    require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.")
  }

  override protected def outputDataType: DataType = new VectorUDT
}

@Since("1.6.0")
object DCT extends DefaultParamsReadable[DCT] {

  @Since("1.6.0")
  override def load(path: String): DCT = super.load(path)
}
Example 14
Source File: NormalizerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} class NormalizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Vector] = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) } def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = { assert((lhs, rhs) match { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Vector, rhs: Vector): Unit = { assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized") val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected") testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: Vector, normalized: Vector, expected: Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("Normalization with setter") { val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected") val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1) testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: Vector, normalized: Vector, expected: Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("read/write") { val t = new Normalizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setP(3.0) testDefaultReadWrite(t) } }
Example 15
Source File: ElementwiseProduct.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.annotation.Since
import org.apache.spark.ml.UnaryTransformer
import org.apache.spark.ml.linalg.{Vector, VectorUDT}
import org.apache.spark.ml.param.Param
import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable}
import org.apache.spark.mllib.feature
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.sql.types.DataType

  @Since("2.0.0")
  def getScalingVec: Vector = getOrDefault(scalingVec)

  override protected def createTransformFunc: Vector => Vector = {
    require(params.contains(scalingVec), s"transformation requires a weight vector")
    val elemScaler = new feature.ElementwiseProduct($(scalingVec))
    v => elemScaler.transform(v)
  }

  override protected def outputDataType: DataType = new VectorUDT()
}

@Since("2.0.0")
object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] {

  @Since("2.0.0")
  override def load(path: String): ElementwiseProduct = super.load(path)
}
Example 16
Source File: Word2VecExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
// $example off$
import org.apache.spark.sql.SparkSession

object Word2VecExample {
  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("Word2Vec example")
      .getOrCreate()

    // $example on$
    // Input data: Each row is a bag of words from a sentence or document.
    val documentDF = spark.createDataFrame(Seq(
      "Hi I heard about Spark".split(" "),
      "I wish Java could use case classes".split(" "),
      "Logistic regression models are neat".split(" ")
    ).map(Tuple1.apply)).toDF("text")

    // Learn a mapping from words to Vectors.
    val word2Vec = new Word2Vec()
      .setInputCol("text")
      .setOutputCol("result")
      .setVectorSize(3)
      .setMinCount(0)
    val model = word2Vec.fit(documentDF)

    val result = model.transform(documentDF)
    result.collect().foreach { case Row(text: Seq[_], features: Vector) =>
      println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n")
    }
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 17
Source File: DataFrameExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text("input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println("Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 18
Source File: ChiSquareTestExample.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.ml

// $example on$
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.stat.ChiSquareTest
// $example off$
import org.apache.spark.sql.SparkSession

object ChiSquareTestExample {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("ChiSquareTestExample")
      .getOrCreate()
    import spark.implicits._

    // $example on$
    val data = Seq(
      (0.0, Vectors.dense(0.5, 10.0)),
      (0.0, Vectors.dense(1.5, 20.0)),
      (1.0, Vectors.dense(1.5, 30.0)),
      (0.0, Vectors.dense(3.5, 30.0)),
      (0.0, Vectors.dense(3.5, 40.0)),
      (1.0, Vectors.dense(3.5, 40.0))
    )

    val df = data.toDF("label", "features")
    val chi = ChiSquareTest.test(df, "features", "label").head
    println(s"pValues = ${chi.getAs[Vector](0)}")
    println(s"degreesOfFreedom ${chi.getSeq[Int](1).mkString("[", ",", "]")}")
    println(s"statistics ${chi.getAs[Vector](2)}")
    // $example off$

    spark.stop()
  }
}
// scalastyle:on println
Example 19
Source File: VectorExplodeSpec.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl import odkl.analysis.spark.TestEnv import odkl.analysis.spark.util.SQLOperations import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.{functions, Row} import org.apache.spark.sql.types.{StructType, StructField, DoubleType} import org.scalatest.FlatSpec class VectorExplodeSpec extends FlatSpec with TestEnv with org.scalatest.Matchers with SQLOperations with WithModels with HasMetricsBlock { case class Point(id: Int, vector: Vector, mean: Vector) lazy val data = sqlc.createDataFrame(Seq( Point(1, Vectors.dense(1.0, 3.0), Vectors.dense(10.0, 30.0)), Point(2, Vectors.dense(2.0, 4.0), Vectors.sparse(2, Array(1), Array(20.0))) )) lazy val withMetadata = data.withColumn( "vector", data("vector").as("vector", new AttributeGroup("vector", Array[Attribute]( NumericAttribute.defaultAttr.withName("fixed"), NumericAttribute.defaultAttr.withName("var") )).toMetadata())) .withColumn( "mean", data("mean").as("mean", new AttributeGroup("vector", Array[Attribute]( NumericAttribute.defaultAttr.withName("fixed"), NumericAttribute.defaultAttr.withName("var") )).toMetadata())) lazy val explode = new VectorExplode().transform(withMetadata) "Explode " should " add data" in { val result = explode.orderBy("id", "value").collect() result(0).getInt(0) should be(1) result(0).getString(1) should be("fixed") result(0).getDouble(2) should be(1.0) result(0).getDouble(3) should be(10.0) result(1).getInt(0) should be(1) result(1).getString(1) should be("var") result(1).getDouble(2) should be(3.0) result(1).getDouble(3) should be(30.0) result(2).getInt(0) should be(2) result(2).getString(1) should be("fixed") result(2).getDouble(2) should be(2.0) result(2).isNullAt(3) should be(true) result(3).getInt(0) should be(2) result(3).getString(1) should be("var") result(3).getDouble(2) should be(4.0) result(3).getDouble(3) should be(20.0) } "Explode " should " create schema" in { val fields = explode.schema.fields fields(0).name should be("id") fields(1).name should be("value") fields(2).name should be("vector") fields(3).name should be("mean") } }
Example 20
Source File: VectorExplode.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl import odkl.analysis.spark.util.collection.OpenHashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.{Param, ParamMap} import org.apache.spark.ml.util.{DefaultParamsWritable, Identifiable} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.odkl.SparkSqlUtils import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row, functions} class VectorExplode(override val uid: String) extends Transformer with DefaultParamsWritable { val valueCol = new Param[String](this, "valueCol", "Name of the column to store value name.") def setValueCol(value: String) : this.type = set(valueCol, value) setDefault(valueCol -> "value") def this() = this(Identifiable.randomUID("vectorExplode")) override def transform(dataset: Dataset[_]): DataFrame = { val vectors: Array[StructField] = dataset.schema.fields.filter(_.dataType.isInstanceOf[VectorUDT]) val resultSchema = StructType(Seq( StructField($(valueCol), StringType, nullable = false)) ++ vectors.map(f => StructField(f.name, DoubleType, nullable = true)) ) val arraySize = resultSchema.size - 1 val names: Array[Map[Int, String]] = vectors.map( f => { AttributeGroup.fromStructField(f).attributes .map(attributes => attributes.filter(_.name.isDefined).map(a => a.index.get -> a.name.get).toMap) .getOrElse(Map()) }) val maxCapacity = names.map(_.size).max val explodeVectors : (Row => Array[Row]) = (r: Row ) => { val accumulator = new OpenHashMap[String,Array[Double]](maxCapacity) for(i <- 0 until r.length) { val vector = r.getAs[Vector](i) vector.foreachActive((index, value) => { val name = names(i).getOrElse(index, s"${vectors(i).name}_$index") accumulator.changeValue( name, Array.tabulate(arraySize) {ind => if(i == ind) value else Double.NaN}, v => {v(i) = value; v}) }) } accumulator.map(x => new GenericRowWithSchema( (Seq(x._1) ++ x._2.toSeq.map(v => if (v.isNaN) null else v)).toArray, resultSchema)).toArray } val vectorsStruct = functions.struct(vectors.map(f => dataset(f.name)): _*) val explodeUDF = SparkSqlUtils.customUDF(explodeVectors, ArrayType(resultSchema), Some(Seq(vectorsStruct.expr.dataType))) val expression = functions.explode(explodeUDF(vectorsStruct)) dataset .withColumn(uid, expression) .select( dataset.schema.fields.filterNot(_.dataType.isInstanceOf[VectorUDT]).map(f => dataset(f.name)) ++ resultSchema.fields.map(f => functions.expr(s"$uid.${f.name}").as(f.name)) :_*) } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = StructType(schema.fields.map(x => x.dataType match { case vector: VectorUDT => StructField(x.name, typeFromVector(x)) case _ => x } )) def typeFromVector(field: StructField): StructType = { val attributes = AttributeGroup.fromStructField(field) StructType(attributes.attributes .map(_.map(a => a.name.getOrElse(s"_${a.index.get}"))) .getOrElse(Array.tabulate(attributes.size) { i => s"_$i" }) .map(name => StructField(name, DoubleType, nullable = false))) } }
Example 21
Source File: HasNetlibBlas.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]): Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y: Array[Double]): Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy(x: Array[Double], y: Array[Double]): Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {

  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
}
Example 22
Source File: RandomProjectionsHasher.scala From pravda-ml with Apache License 2.0
package org.apache.spark.ml.odkl.texts import java.util.Random import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol, HasSeed} import org.apache.spark.ml.param._ import org.apache.spark.ml.util.{Identifiable, SchemaUtils} import org.apache.spark.ml.linalg.{Matrices, SparseMatrix, Vector} import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.udf import org.apache.spark.sql.types.{LongType, StructType} def setDim(value: Long): this.type = set(dim, value) def this() = this(Identifiable.randomUID("randomProjectionsHasher")) override def transform(dataset: Dataset[_]): DataFrame = { val dimensity = { if (!isSet(dim)) {//If dimensions is not set - will search AttributeGroup in metadata as it comes from OdklCountVectorizer val vectorsIndex = dataset.schema.fieldIndex($(inputCol)) AttributeGroup.fromStructField(dataset.schema.fields(vectorsIndex)).size } else { $(dim).toInt } } val projectionMatrix = dataset.sqlContext.sparkContext.broadcast( Matrices.sprandn($(basisSize).toInt, dimensity, $(sparsity), new Random($(seed))).asInstanceOf[SparseMatrix]) //the matrix of random vectors to costruct hash val binHashSparseVectorColumn = udf((vector: Vector) => { projectionMatrix.value.multiply(vector).values .map(f => if (f>0) 1L else 0L) .view.zipWithIndex .foldLeft(0L) {case (acc,(v, i)) => acc | (v << i) } }) dataset.withColumn($(outputCol), binHashSparseVectorColumn(dataset.col($(inputCol)))) } override def copy(extra: ParamMap): Transformer = { defaultCopy(extra) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { SchemaUtils.appendColumn(schema, $(outputCol), LongType) } }
Example 23
Source File: PCAModelSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class PCAModelSuite extends SparkFeaturePFASuiteBase[PCAModelResult] {

  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)
  val pca = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(3)

  override val sparkTransformer = pca.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, pca.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, pca.getOutputCol).toJSON.collect()
}

case class PCAModelResult(pcaFeatures: Seq[Double]) extends Result
Example 24
Source File: MyNormalize.scala From Apache-Spark-2x-Machine-Learning-Cookbook with MIT License
package spark.ml.cookbook.chapter4

import org.apache.spark.sql.SparkSession
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.feature.MinMaxScaler
import org.apache.log4j.{Level, Logger}

object MyNormalize {

  def parseWine(str: String): (Int, Vector) = {
    val columns = str.split(",")
    // don't use the entire row of data
    (columns(0).toInt, Vectors.dense(columns(1).toFloat, columns(2).toFloat, columns(3).toFloat))
  }

  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)
    Logger.getLogger("akka").setLevel(Level.ERROR)

    // setup SparkSession to use for interactions with Spark
    val spark = SparkSession
      .builder
      .master("local[*]")
      .appName("My Normalize")
      .config("spark.sql.warehouse.dir", ".")
      .getOrCreate()

    import spark.implicits._

    // http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data
    val data = spark.read.text("../data/sparkml2/chapter4/wine.data").as[String].map(parseWine)
    val df = data.toDF("id", "feature")
    df.printSchema()
    df.show(false)

    val scale = new MinMaxScaler()
      .setInputCol("feature")
      .setOutputCol("scaled")
      .setMax(1)
      .setMin(-1)

    scale.fit(df).transform(df).select("scaled").show(false)

    spark.stop()
  }
}
Example 25
Source File: DataLoader.scala From sona with Apache License 2.0
package com.tencent.angel.sona.tree.util

import org.apache.spark.SparkContext
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.rdd.RDD

object DataLoader {

  def parseLibsvm(line: String, dim: Int): (Double, Vector) = {
    val splits = line.split("\\s+|,").map(_.trim)
    val y = splits(0).toDouble

    val indices = new Array[Int](splits.length - 1)
    val values = new Array[Double](splits.length - 1)
    for (i <- 0 until splits.length - 1) {
      val kv = splits(i + 1).split(":")
      indices(i) = kv(0).toInt
      values(i) = kv(1).toDouble
    }

    (y, Vectors.sparse(dim, indices, values))
  }

  def loadLibsvm(input: String, dim: Int)
                (implicit sc: SparkContext): RDD[(Double, Vector)] = {
    sc.textFile(input)
      .map(_.trim)
      .filter(_.nonEmpty)
      .filter(!_.startsWith("#"))
      .map(line => parseLibsvm(line, dim))
  }
}
Example 26
Source File: GBDTPredictor.scala From sona with Apache License 2.0
package com.tencent.angel.sona.tree.gbdt.predict import com.tencent.angel.sona.tree.gbdt.GBDTConf._ import com.tencent.angel.sona.tree.gbdt.GBDTModel import com.tencent.angel.sona.tree.util.DataLoader import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.ml.linalg.Vector import org.apache.spark.rdd.RDD import org.apache.spark.util.SparkUtil object GBDTPredictor { def main(args: Array[String]): Unit = { @transient val conf = new SparkConf() conf.set("spark.rpc.message.maxSize", "2000") conf.set("spark.driver.maxResultSize", "2G") @transient implicit val sc = SparkContext.getOrCreate(conf) val params = SparkUtil.parse(args) val modelPath = params(ML_MODEL_PATH) val inputPath = params(ML_PREDICT_INPUT_PATH) val outputPath = params(ML_PREDICT_OUTPUT_PATH) val model = loadModel(modelPath) predict(model, inputPath, outputPath) } def loadModel(modelFolder: String)(implicit sc: SparkContext): GBDTModel = { val loadStart = System.currentTimeMillis() val modelPath = modelFolder + "/model" println(s"Loading model from $modelPath...") val model = sc.objectFile[GBDTModel](modelPath).first() println(s"Loading model with ${model.numTree} tree(s) done, " + s"cost ${System.currentTimeMillis() - loadStart} ms") model } def predict(model: GBDTModel, input: String, output: String) (implicit sc: SparkContext): Unit = { val predStart = System.currentTimeMillis() println("Start to do prediction...") println(s"Prediction input: $input") println(s"Prediction output: $output") val predictor = new GBDTPredictor(model) val preds = if (predictor.isRegression) { predictor.predictRegression(input) .map(x => s"${x._1} ${x._2}") } else { predictor.predictClassification(input) .map(x => s"${x._1} ${x._2} ${x._3.mkString(",")}") } preds.saveAsTextFile(output) println(s"Prediction done, cost ${System.currentTimeMillis() - predStart} ms") } private def predictRaw(model: GBDTModel, ins: Vector): Array[Float] = { model.predict(ins) } private def predToClass(predRaw: Array[Float]): Int = { predRaw.length match { case 1 => if (predRaw.head > 0.0f) 1 else 0 case _ => predRaw.zipWithIndex.maxBy(_._1)._2 } } } class GBDTPredictor(model: GBDTModel) { import GBDTPredictor._ def predictRegression(input: String) (implicit sc: SparkContext): RDD[(Long, Float)] = { require(model.param.isRegression, "Input model is obtained " + "from a classification task, cannot be used in regression") val maxDim = model.param.regTParam.numFeature val bcModel = sc.broadcast(model) DataLoader.loadLibsvm(input, maxDim) .map { case (id, ins) => val predRaw = predictRaw(bcModel.value, ins) (id.toLong, predRaw.head) } } def predictClassification(input: String) (implicit sc: SparkContext): RDD[(Long, Int, Array[Float])] = { require(!model.param.isRegression, "Input model is obtained " + "from a regression task, cannot be used in classification") val maxDim = model.param.regTParam.numFeature val bcModel = sc.broadcast(model) DataLoader.loadLibsvm(input, maxDim) .map { case (id, ins) => val predRaw = predictRaw(bcModel.value, ins) val predClass = predToClass(predRaw) (id.toLong, predClass, predRaw) } } def isRegression: Boolean = model.param.isRegression }
Example 27
Source File: GBDTModel.scala From sona with Apache License 2.0
package com.tencent.angel.sona.tree.gbdt import java.io.{FileInputStream, FileOutputStream, ObjectInputStream, ObjectOutputStream} import com.tencent.angel.sona.tree.gbdt.tree.{GBDTParam, GBTNode} import com.tencent.angel.sona.tree.regression.RegTree import org.apache.spark.ml.linalg.Vector import scala.collection.mutable.ArrayBuffer object GBDTModel { type GBTTree = RegTree[GBTNode] def save(model: GBDTModel, path: String): Unit = { val oos = new ObjectOutputStream(new FileOutputStream(path)) oos.writeObject(model) oos.close() } def load(path: String): GBDTModel = { val ois = new ObjectInputStream(new FileInputStream(path)) ois.readObject().asInstanceOf[GBDTModel] } } import GBDTModel._ class GBDTModel(val param: GBDTParam) extends Serializable { private var forest: ArrayBuffer[GBTTree] = ArrayBuffer[GBTTree]() private var weights: ArrayBuffer[Float] = ArrayBuffer[Float]() def predict(instance: Vector): Array[Float] = { if (param.isRegression || param.numClass == 2) { var pred = 0.0f for (i <- forest.indices) pred += weights(i) * forest(i).predictBinary(instance) Array(pred) } else if (param.multiTree) { val preds = Array.ofDim[Float](param.numClass) for (i <- forest.indices) preds(i % param.numClass) += weights(i) * forest(i).predictBinary(instance) preds } else { val preds = Array.ofDim[Float](param.numClass) for (i <- forest.indices) { val p = forest(i).predictMulti(instance) val w = weights(i) for (k <- 0 until param.numClass) preds(k) += w * p(k) } preds } } def predict(instances: Array[Vector]): Array[Array[Float]] = { instances.map(predict) } def get(treeId: Int): GBTTree = forest(treeId) def add(tree: GBTTree, weight: Float): Unit = { forest += tree weights += weight } def keepFirstTrees(num: Int): Unit = { forest = forest.slice(0, num) weights = weights.slice(0, num) } def numTree: Int = forest.size }
Example 28
Source File: LinearSVCSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.ClassifierResult
import org.apache.spark.ml.classification.LinearSVC
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class LinearSVCSuite extends SparkClassifierPFASuiteBase[ClassifierResult] {

  val inputPath = "data/sample_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)
  val clf = new LinearSVC()
  override val sparkTransformer = clf.fit(dataset)

  import spark.implicits._
  implicit val mapEncoder = ExpressionEncoder[Map[String, Double]]()

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
  override val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map {
    case Row(p: Double, raw: Vector) => (p, raw.toArray)
  }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect()

  // Additional tests
  test("LinearSVC w/o fitIntercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map {
      case Row(p: Double, raw: Vector) => (p, raw.toArray)
    }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }
}
Example 29
Source File: NaiveBayesSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification import com.ibm.aardpfark.pfa.ProbClassifierResult import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.Row import org.apache.spark.sql.functions._ class NaiveBayesSuite extends SparkClassifierPFASuiteBase[ProbClassifierResult] { import spark.implicits._ val inputPath = "data/sample_multiclass_classification_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val multinomialData = dataset .as[(Double, Vector)] .map { case (label, vector) => val nonZeroVector = Vectors.dense(vector.toArray.map(math.max(0.0, _))) (label, nonZeroVector) }.toDF("label", "features") val multinomialDataBinary = multinomialData.select( when(col("label") >= 1, 1.0).otherwise(0.0).alias("label"), col("features") ) val bernoulliData = dataset .as[(Double, Vector)] .map { case (label, vector) => val binaryData = vector.toArray.map { case e if e > 0.0 => 1.0 case e if e <= 0.0 => 0.0 } (label, Vectors.dense(binaryData)) }.toDF("label", "features") val bernoulliDataBinary = bernoulliData.select( when(col("label") >= 1, 1.0).otherwise(0.0).alias("label"), col("features") ) val clf = new NaiveBayes() override val sparkTransformer = clf.fit(multinomialData) val result = sparkTransformer.transform(multinomialData) override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() override val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() // Additional tests test("Multinomial model binary classification") { val sparkTransformer = clf.fit(multinomialDataBinary) val result = sparkTransformer.transform(multinomialDataBinary) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Bernoulli model") { val sparkTransformer = clf.setModelType("bernoulli").fit(bernoulliData) val result = sparkTransformer.transform(bernoulliData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Bernoulli model binary classification") { val sparkTransformer = clf.setModelType("bernoulli").fit(bernoulliDataBinary) val result = sparkTransformer.transform(bernoulliDataBinary) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 30
Source File: LogisticRegressionSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification import com.ibm.aardpfark.pfa.ProbClassifierResult import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.{DataFrame, Row} class LogisticRegressionSuite extends SparkClassifierPFASuiteBase[ProbClassifierResult] { import spark.implicits._ def getOutput(df: DataFrame) = { df.select(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).map { case Row(p: Double, raw: Vector, pr: Vector) => (p, raw.toArray, pr.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol, clf.getProbabilityCol).toJSON.collect() } val binaryData = spark.read.format("libsvm").load("data/sample_libsvm_data.txt") val multiData = spark.read.format("libsvm").load("data/sample_multiclass_classification_data.txt") val clf = new LogisticRegression() override val sparkTransformer = clf.fit(binaryData) val result = sparkTransformer.transform(binaryData) override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() override val expectedOutput = getOutput(result) // Additional tests test("LogisticRegression w/o fitIntercept") { val sparkTransformer = clf.setFitIntercept(false).fit(binaryData) val result = sparkTransformer.transform(binaryData) val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("LogisticRegression w/ non-default threshold") { val sparkTransformer = clf.setThreshold(0.0).fit(binaryData) val result = sparkTransformer.transform(binaryData) val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) val sparkTransformer2 = clf.setThreshold(1.0).fit(binaryData) val result2 = sparkTransformer2.transform(binaryData) val expectedOutput2 = getOutput(result2) parityTest(sparkTransformer2, input, expectedOutput2) } test("MLOR w/ intercept") { val sparkTransformer = clf.fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/o intercept") { val sparkTransformer = clf.setFitIntercept(false).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/ thresholds") { val sparkTransformer = clf.setThresholds(Array(0.1, 0.6, 0.3)).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } test("MLOR w/ thresholds - one zero") { val sparkTransformer = clf.setThresholds(Array(0.0, 0.6, 0.3)).fit(multiData) val result = sparkTransformer.transform(multiData) val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() val expectedOutput = getOutput(result) parityTest(sparkTransformer, input, expectedOutput) } }
Example 31
Source File: NormalizerSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{ScalerResult, Result, SparkFeaturePFASuiteBase} import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class NormalizerSuite extends SparkFeaturePFASuiteBase[ScalerResult] { implicit val enc = ExpressionEncoder[Vector]() val inputPath = "data/sample_lda_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val scaler = new Normalizer() .setInputCol("features") .setOutputCol("scaled") override val sparkTransformer = scaler val result = scaler.transform(dataset) override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect() override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() test("Normalizer with P = 1") { val sparkTransformer = scaler.setP(1.0) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Normalizer with P = positive infinity"){ val sparkTransformer = scaler.setP(Double.PositiveInfinity) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Normalizer with P = 3") { val sparkTransformer = scaler.setP(3.0) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 32
Source File: HashBasedDeduplicator.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl.texts import odkl.analysis.spark.util.Logging import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.param._ import org.apache.spark.ml.util.Identifiable import org.apache.spark.ml.linalg.Vectors.norm import org.apache.spark.ml.linalg.{BLAS, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuffer def setSimilarityTreshold(value: Double): this.type = set(similarityThreshold, value) setDefault(new ParamPair[String](inputColHash,"hash"), new ParamPair[Double](similarityThreshold,0.9)) def this() = this(Identifiable.randomUID("hashBasedDeduplication")) override def transform(dataset: Dataset[_]): DataFrame = { dataset.sqlContext.createDataFrame( dataset.toDF .repartition(dataset.col($(inputColHash))) .sortWithinPartitions($(inputColHash)) .rdd .mapPartitions((f: Iterator[Row]) => { if (f.hasNext) { var curHash: Long = -1L val vectorsBuffer = new ArrayBuffer[Vector](0) // unique vectors buffer for this bucket for (it <- f) yield { val newHash = it.getAs[Long]($(inputColHash)) if (newHash == curHash) { val currentVector = it.getAs[Vector]($(inputColVector)) val isUnique = vectorsBuffer.forall(storedVector => { //are this vector is "different" with other in buffer? (BLAS.dot(storedVector, currentVector) / (norm(storedVector, 2) * norm(currentVector, 2))) < $(similarityThreshold) //is unsimilar? }) if (isUnique) { vectorsBuffer.append(currentVector) it } else { Row.empty //dummy Row } } else { vectorsBuffer.clear() vectorsBuffer.append(it.getAs[Vector]($(inputColVector))) curHash = newHash it } } } else { new Array[Row](0).toIterator //empty partition? } }).filter(!_.equals(Row.empty)), //filter dummy transformSchema(dataset.schema)) } @DeveloperApi override def transformSchema(schema: StructType): StructType = { schema } override def copy(extra: ParamMap): Transformer = defaultCopy(extra) }
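The per-bucket uniqueness check above reduces to a cosine-similarity threshold. A minimal standalone sketch of that criterion in plain Scala (Spark's ml BLAS is internal, so the dot product is written out by hand here; the 0.9 mirrors the transformer's similarityThreshold default):

import org.apache.spark.ml.linalg.{Vector, Vectors}

def cosine(a: Vector, b: Vector): Double = {
  val dot = a.toArray.zip(b.toArray).map { case (x, y) => x * y }.sum
  dot / (Vectors.norm(a, 2) * Vectors.norm(b, 2))
}

val kept = Vectors.dense(1.0, 0.0, 1.0)       // a vector already stored in the bucket buffer
val candidate = Vectors.dense(1.0, 0.1, 0.9)  // an incoming row with the same hash value
val isDuplicate = cosine(kept, candidate) >= 0.9  // duplicates are dropped, unique vectors are kept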
Example 33
Source File: StandardScalerSuite.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase} import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] { implicit val enc = ExpressionEncoder[Vector]() val inputPath = "data/sample_lda_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(true) .setWithStd(true) override val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect() override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() test("StandardScaler w/o Mean and Std") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(false) .setWithStd(false) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StandardScaler w/o Mean") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(false) .setWithStd(true) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StandardScaler w/o Std") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(true) .setWithStd(false) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 34
Source File: SimpleVectorAssembler.scala From albedo with MIT License | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkException import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.{DataFrame, Dataset, Row} import scala.collection.mutable.ArrayBuilder def setOutputCol(value: String): this.type = set(outputCol, value) override def transform(dataset: Dataset[_]): DataFrame = { transformSchema(dataset.schema, logging = true) val schema = dataset.schema val assembleFunc = udf { r: Row => SimpleVectorAssembler.assemble(r.toSeq: _*) } val args = $(inputCols).map { c => schema(c).dataType match { case DoubleType => dataset(c) case _: VectorUDT => dataset(c) case _: NumericType | BooleanType => dataset(c).cast(DoubleType).as(s"${c}_double_$uid") } } dataset.select(col("*"), assembleFunc(struct(args: _*)).as($(outputCol))) } override def transformSchema(schema: StructType): StructType = { val inputColNames = $(inputCols) val outputColName = $(outputCol) val inputDataTypes = inputColNames.map(name => schema(name).dataType) inputDataTypes.foreach { case _: NumericType | BooleanType => case t if t.isInstanceOf[VectorUDT] => case other => throw new IllegalArgumentException(s"Data type $other is not supported.") } if (schema.fieldNames.contains(outputColName)) { throw new IllegalArgumentException(s"Output column $outputColName already exists.") } StructType(schema.fields :+ new StructField(outputColName, new VectorUDT, true)) } override def copy(extra: ParamMap): SimpleVectorAssembler = defaultCopy(extra) } object SimpleVectorAssembler extends DefaultParamsReadable[SimpleVectorAssembler] { override def load(path: String): SimpleVectorAssembler = super.load(path) def assemble(vv: Any*): Vector = { val indices = ArrayBuilder.make[Int] val values = ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case null => // TODO: output Double.NaN? throw new SparkException("Values to assemble cannot be null.") case o => throw new SparkException(s"$o of type ${o.getClass.getName} is not supported.") } Vectors.sparse(cur, indices.result(), values.result()).compressed } }
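A small usage sketch of the assemble helper in the companion object above; the DataFrame transform wires the same logic through a UDF over the selected input columns:

import org.apache.spark.ml.linalg.Vectors

// Doubles and Vectors are concatenated in order; indices shift by the width of what came before.
val assembled = SimpleVectorAssembler.assemble(
  1.0,
  Vectors.dense(0.0, 2.0),
  Vectors.sparse(3, Array(2), Array(3.0)))
// A 6-element vector with 1.0, 2.0 and 3.0 at positions 0, 2 and 5,
// returned in whichever encoding .compressed deems smaller.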
Example 35
Source File: UDFs.scala From albedo with MIT License | 5 votes |
package ws.vinta.albedo.closures import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.expressions.UserDefinedFunction import org.apache.spark.sql.functions._ import ws.vinta.albedo.closures.StringFunctions._ import scala.util.control.Breaks.{break, breakable} object UDFs extends Serializable { def containsAnyOfUDF(substrings: Array[String], shouldLower: Boolean = false): UserDefinedFunction = udf[Double, String]((text: String) => { var result = 0.0 breakable { for (substring <- substrings) { if (text.contains(substring)) { result = 1.0 break } } } result }) def toArrayUDF: UserDefinedFunction = udf[Array[Double], Vector]((vector: Vector) => { vector.toArray }) def numNonzerosOfVectorUDF: UserDefinedFunction = udf[Int, Vector]((vector: Vector) => { vector.numNonzeros }) def cleanCompanyUDF: UserDefinedFunction = udf[String, String]((company: String) => { val temp1 = company .toLowerCase() .replaceAll("""\b(.com|.net|.org|.io|.co.uk|.co|.eu|.fr|.de|.ru)\b""", "") .replaceAll("""\b(formerly|previously|ex\-)\b""", "") .replaceAll("""\W+""", " ") .replaceAll("""\s+""", " ") .replaceAll("""\b(http|https|www|co ltd|pvt ltd|ltd|inc|llc)\b""", "") .trim() val temp2 = extractWordsIncludeCJK(temp1).mkString(" ") if (temp2.isEmpty) "__empty" else temp2 }) def cleanEmailUDF: UserDefinedFunction = udf[String, String]((email: String) => { val temp1 = email.toLowerCase().trim() val temp2 = extractEmailDomain(temp1) if (temp2.isEmpty) "__empty" else temp2 }) def cleanLocationUDF: UserDefinedFunction = udf[String, String]((location: String) => { val temp1 = try { val pattern = s"([$wordPatternIncludeCJK]+),\\s*([$wordPatternIncludeCJK]+)".r val pattern(city, _) = location city } catch { case _: MatchError => { location } } val temp2 = temp1 .toLowerCase() .replaceAll("""[~!@#$^%&*\\(\\)_+={}\\[\\]|;:\"'<,>.?`/\\\\-]+""", " ") .replaceAll("""\s+""", " ") .replaceAll("""\b(city)\b""", "") .trim() val temp3 = extractWordsIncludeCJK(temp2).mkString(" ") if (temp3.isEmpty) "__empty" else temp3 }) def repoLanguageIndexInUserRecentRepoLanguagesUDF = udf((repo_language: String, user_recent_repo_languages: Seq[String]) => { val index = user_recent_repo_languages.indexOf(repo_language.toLowerCase()) if (index < 0) user_recent_repo_languages.size + 50 else index }) def repoLanguageCountInUserRecentRepoLanguagesUDF = udf((repo_language: String, user_recent_repo_languages: Seq[String]) => { user_recent_repo_languages.count(_ == repo_language.toLowerCase()) }) }
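A short sketch of applying the vector-related UDFs above; df is a hypothetical DataFrame with a Vector column named "features":

import org.apache.spark.sql.functions.col
import ws.vinta.albedo.closures.UDFs._

val enriched = df
  .withColumn("features_array", toArrayUDF(col("features")))            // Vector -> Array[Double]
  .withColumn("features_nnz", numNonzerosOfVectorUDF(col("features")))  // count of non-zero entries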
Example 36
Source File: NaiveBayes.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter12.NaiveBayes import org.apache.spark.ml.classification.NaiveBayes import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator import org.apache.spark.sql.SparkSession import org.apache.spark.ml.Pipeline; import org.apache.spark.ml.PipelineStage; import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder} object NaiveBayesExample { def main(args: Array[String]): Unit = { // Create the Spark session val spark = SparkSession .builder .master("local[*]") .config("spark.sql.warehouse.dir", "E:/Exp/") .appName(s"OneVsRestExample") .getOrCreate() // Load the data stored in LIBSVM format as a DataFrame. val data = spark.read.format("libsvm").load("C:/Users/rezkar/Downloads/spark-2.1.0-bin-hadoop2.7/data/sample.data") // Split the data into training and test sets (25% held out for testing) val Array(trainingData, validationData) = data.randomSplit(Array(0.75, 0.25), seed = 12345L) // Train a NaiveBayes model. val nb = new NaiveBayes().setSmoothing(0.00001) val model = nb.fit(trainingData) // Select example rows to display. val predictions = model.transform(validationData) predictions.show() // Select (prediction, true label), obtain the evaluators and compute classification performance metrics such as accuracy, precision, recall and F1 measure. val evaluator = new BinaryClassificationEvaluator().setLabelCol("label").setMetricName("areaUnderROC") val evaluator1 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy") val evaluator2 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedPrecision") val evaluator3 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("weightedRecall") val evaluator4 = new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("f1") // Compute the classification accuracy, precision, recall, F1 measure and error on the test data. val areaUnderROC = evaluator.evaluate(predictions) val accuracy = evaluator1.evaluate(predictions) val precision = evaluator2.evaluate(predictions) val recall = evaluator3.evaluate(predictions) val f1 = evaluator4.evaluate(predictions) // Print the performance metrics println("areaUnderROC = " + areaUnderROC) println("Accuracy = " + accuracy) println("Precision = " + precision) println("Recall = " + recall) println("F1 = " + f1) println(s"Test Error = ${1 - accuracy}") data.show(20) spark.stop() } }
Example 37
Source File: LibSVMRequestRowSerializerUtils.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import scala.collection.mutable.StringBuilder import org.apache.spark.ml.linalg.Vector private[serializers] object LibSVMRequestRowSerializerUtils { def serializeLabeledFeatureVector(label: Double, features: Vector): Array[Byte] = { val sb = new StringBuilder(label.toString) features.foreachActive { case (index, value) => sb ++= s" ${index + 1}:$value" } sb ++= "\n" sb.toString().getBytes } }
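A quick sketch of what the serializer emits; since the object is package-private, this is callable only from within the same serializers package. Feature indices are shifted to LIBSVM's 1-based convention:

import org.apache.spark.ml.linalg.Vectors

val bytes = LibSVMRequestRowSerializerUtils.serializeLabeledFeatureVector(
  1.0, Vectors.sparse(4, Array(0, 2), Array(0.5, 2.0)))
// new String(bytes) == "1.0 1:0.5 3:2.0\n"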
Example 38
Source File: RDDLossFunctionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.optim.loss import org.apache.spark.SparkFunSuite import org.apache.spark.broadcast.Broadcast import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors} import org.apache.spark.ml.optim.aggregator.DifferentiableLossAggregatorSuite.TestAggregator import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.rdd.RDD class RDDLossFunctionSuite extends SparkFunSuite with MLlibTestSparkContext { @transient var instances: RDD[Instance] = _ override def beforeAll(): Unit = { super.beforeAll() instances = sc.parallelize(Seq( Instance(0.0, 0.1, Vectors.dense(1.0, 2.0)), Instance(1.0, 0.5, Vectors.dense(1.5, 1.0)), Instance(2.0, 0.3, Vectors.dense(4.0, 0.5)) )) } test("regularization") { val coefficients = Vectors.dense(0.5, -0.1) val regLossFun = new L2Regularization(0.1, (_: Int) => true, None) val getAgg = (bvec: Broadcast[Vector]) => new TestAggregator(2)(bvec.value) val lossNoReg = new RDDLossFunction(instances, getAgg, None) val lossWithReg = new RDDLossFunction(instances, getAgg, Some(regLossFun)) val (loss1, grad1) = lossNoReg.calculate(coefficients.asBreeze.toDenseVector) val (regLoss, regGrad) = regLossFun.calculate(coefficients) val (loss2, grad2) = lossWithReg.calculate(coefficients.asBreeze.toDenseVector) BLAS.axpy(1.0, Vectors.fromBreeze(grad1), regGrad) assert(regGrad ~== Vectors.fromBreeze(grad2) relTol 1e-5) assert(loss1 + regLoss === loss2) } test("empty RDD") { val rdd = sc.parallelize(Seq.empty[Instance]) val coefficients = Vectors.dense(0.5, -0.1) val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value) val lossFun = new RDDLossFunction(rdd, getAgg, None) withClue("cannot calculate cost for empty dataset") { intercept[IllegalArgumentException]{ lossFun.calculate(coefficients.asBreeze.toDenseVector) } } } test("versus aggregating on an iterable") { val coefficients = Vectors.dense(0.5, -0.1) val getAgg = (bv: Broadcast[Vector]) => new TestAggregator(2)(bv.value) val lossFun = new RDDLossFunction(instances, getAgg, None) val (loss, grad) = lossFun.calculate(coefficients.asBreeze.toDenseVector) // just map the aggregator over the instances array val agg = new TestAggregator(2)(coefficients) instances.collect().foreach(agg.add) assert(loss === agg.loss) assert(Vectors.fromBreeze(grad) === agg.gradient) } }
Example 39
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat import java.util.Random import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.stat.test.ChiSqTest import org.apache.spark.mllib.util.MLlibTestSparkContext class ChiSquareTestSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("test DataFrame of labeled points") { // labels: 1.0 (2 / 6), 0.0 (4 / 6) // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6) // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6) val data = Seq( LabeledPoint(0.0, Vectors.dense(0.5, 10.0)), LabeledPoint(0.0, Vectors.dense(1.5, 20.0)), LabeledPoint(1.0, Vectors.dense(1.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 30.0)), LabeledPoint(0.0, Vectors.dense(3.5, 40.0)), LabeledPoint(1.0, Vectors.dense(3.5, 40.0))) for (numParts <- List(2, 4, 6, 8)) { val df = spark.createDataFrame(sc.parallelize(data, numParts)) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4) assert(degreesOfFreedom === Array(2, 3)) assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4) } } test("large number of features (SPARK-3087)") { // Test that the right number of results is returned val numCols = 1001 val sparseData = Array( LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))), LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0))))) val df = spark.createDataFrame(sparseData) val chi = ChiSquareTest.test(df, "features", "label") val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) = chi.select("pValues", "degreesOfFreedom", "statistics") .as[(Vector, Array[Int], Vector)].head() assert(pValues.size === numCols) assert(degreesOfFreedom.length === numCols) assert(statistics.size === numCols) assert(pValues(1000) !== null) // SPARK-3087 } test("fail on continuous features or labels") { val tooManyCategories: Int = 100000 assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " + "tooManyCategories be large enough to cause ChiSqTest to throw an exception.") val random = new Random(11L) val continuousLabel = Seq.fill(tooManyCategories)( LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2)))) withClue("ChiSquare should throw an exception when given a continuous-valued label") { intercept[SparkException] { val df = spark.createDataFrame(continuousLabel) ChiSquareTest.test(df, "features", "label") } } val continuousFeature = Seq.fill(tooManyCategories)( LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble()))) withClue("ChiSquare should throw an exception when given continuous-valued features") { intercept[SparkException] { val df = spark.createDataFrame(continuousFeature) ChiSquareTest.test(df, "features", "label") } } } }
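Outside a test harness the same API can be called directly. A minimal sketch, assuming df has a "features" Vector column and a categorical "label" column as in the suite above:

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.stat.ChiSquareTest

val chiRow = ChiSquareTest.test(df, "features", "label").head()
val pValues = chiRow.getAs[Vector]("pValues")
val degreesOfFreedom = chiRow.getAs[Seq[Int]]("degreesOfFreedom")
val statistics = chiRow.getAs[Vector]("statistics")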
Example 40
Source File: HashingTFSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.sql.Row import org.apache.spark.util.Utils class HashingTFSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ import HashingTFSuite.murmur3FeatureIdx test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val numFeatures = 100 // Assume perfect hash when computing expected features. def idx: Any => Int = murmur3FeatureIdx(numFeatures) val data = Seq( ("a a b b c d".split(" ").toSeq, Vectors.sparse(numFeatures, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0)))) ) val df = data.toDF("words", "expected") val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(numFeatures) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(numFeatures)) testTransformer[(Seq[String], Vector)](df, hashingTF, "features", "expected") { case Row(features: Vector, expected: Vector) => assert(features ~== expected absTol 1e-14) } } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } } object HashingTFSuite { private[feature] def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 41
Source File: ElementwiseProduct.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
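A usage sketch of the transformer via its standard Spark ML setters; df is a hypothetical DataFrame with a Vector column named "vector":

import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors

val transformer = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))
  .setInputCol("vector")
  .setOutputCol("transformedVector")

// Hadamard (element-wise) product: each feature is multiplied by the matching scaling entry.
transformer.transform(df).show()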
Example 42
Source File: HashingTFSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 43
Source File: BinarizerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
Example 44
Source File: DCTSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 45
Source File: ChiSqSelectorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @transient var dataset: Dataset[_] = _ override def beforeAll(): Unit = { super.beforeAll() // Toy dataset, including the top feature for a chi-squared test. // These data are chosen such that each feature's test has a distinct p-value. val allParamSettings: Map[String, Any] = Map( "selectorType" -> "percentile", "numTopFeatures" -> 1, "percentile" -> 0.12, "outputCol" -> "myOutput" ) }
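The suite body shown here only sets up its parameter map, so as a complement, a minimal sketch of the selector it exercises, assuming a DataFrame df with "features" and "label" columns:

import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setSelectorType("numTopFeatures")   // "percentile" and "fpr" are also supported
  .setNumTopFeatures(1)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")

val selected = selector.fit(df).transform(df)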
Example 46
Source File: MaxAbsScalerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
Example 47
Source File: VectorSlicerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 48
Source File: BinaryClassificationEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
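A usage sketch of the evaluator; predictions is a hypothetical scored DataFrame containing "label" and "rawPrediction" columns:

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderPR")  // or the default "areaUnderROC"

val areaUnderPR = evaluator.evaluate(predictions)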
Example 49
Source File: DCT.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
Example 50
Source File: Normalizer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
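A usage sketch of the p-norm normalizer; df is a hypothetical DataFrame with a Vector column named "features":

import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)  // L1 normalization; the default is p = 2

val l1NormData = normalizer.transform(df)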
Example 51
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 52
Source File: Word2VecExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession object Word2VecExample { def main(args: Array[String]) { val spark = SparkSession .builder .appName("Word2Vec example") .getOrCreate() // $example on$ // Input data: Each row is a bag of words from a sentence or document. val documentDF = spark.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text") // Learn a mapping from words to Vectors. val word2Vec = new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) result.collect().foreach { case Row(text: Seq[_], features: Vector) => println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() } } // scalastyle:on println
Example 53
Source File: DataFrameExample.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 54
Source File: VerifyVowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.util.MLReadable class VerifyVowpalWabbitInteractions extends TestBase with TransformerFuzzing[VowpalWabbitInteractions] { case class Data(val v1: Vector, val v2: Vector, val v3: Vector) lazy val df = session.createDataFrame(Seq(Data( Vectors.dense(Array(1.0, 2.0, 3.0)), Vectors.sparse(8, Array(5), Array(4.0)), Vectors.sparse(11, Array(8, 9), Array(7.0, 8.0)) ))) private def featurizeUsing(interactions: VowpalWabbitInteractions) = interactions.transform(df).head().getAs[SparseVector]("features") private def verifyValues(actual: SparseVector, expected: Array[Double]): Unit = { assert(actual.numNonzeros == expected.length) assert((actual.values.sorted zip expected.sorted).forall { case (x, y) => x == y }) } test("Verify VowpalWabbit Interactions 3-dense x 1-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v1", "v2")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array(4.0, 8, 12.0)) } test("Verify VowpalWabbit Interactions 1-sparse x 2-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v2", "v3")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array(28.0, 32.0)) } test("Verify VowpalWabbit Interactions 3-dense x 1-sparse x 2-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v1", "v2", "v3")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array( 1.0 * 4 * 7, 1 * 4 * 8.0, 2.0 * 4 * 7, 2 * 4 * 8.0, 3.0 * 4 * 7, 3 * 4 * 8.0 )) } def testObjects(): Seq[TestObject[VowpalWabbitInteractions]] = List(new TestObject( new VowpalWabbitInteractions().setInputCols(Array("v1")).setOutputCol("out"), df)) override def reader: MLReadable[_] = VowpalWabbitInteractions }
Example 55
Source File: VerifyIsolationForest.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.isolationforest import com.microsoft.ml.spark.build.BuildInfo import com.microsoft.ml.spark.core.env.FileUtilities import com.microsoft.ml.spark.core.metrics.MetricConstants import com.microsoft.ml.spark.core.test.benchmarks.Benchmarks import org.apache.spark.ml.util.MLReadable import org.apache.spark.sql.{DataFrame, Dataset, Encoders, Row} import com.microsoft.ml.spark.core.test.fuzzing.{EstimatorFuzzing, TestObject} import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.scalactic.Tolerance._ import com.microsoft.ml.spark.train.ComputeModelStatistics case class MammographyRecord(feature0: Double, feature1: Double, feature2: Double, feature3: Double, feature4: Double, feature5: Double, label: Double) case class ScoringResult(features: Vector, label: Double, predictedLabel: Double, outlierScore: Double) class VerifyIsolationForest extends Benchmarks with EstimatorFuzzing[IsolationForest] { test ("Verify isolationForestMammographyDataTest") { import session.implicits._ val data = loadMammographyData // Train a new isolation forest model val contamination = 0.02 val isolationForest = new IsolationForest() .setNumEstimators(100) .setBootstrap(false) .setMaxSamples(256) .setMaxFeatures(1.0) .setFeaturesCol("features") .setPredictionCol("predictedLabel") .setScoreCol("outlierScore") .setContamination(0.02) .setContaminationError(contamination * 0.01) .setRandomSeed(1) // Score all training data instances using the new model val isolationForestModel = isolationForest.fit(data) // Calculate area under ROC curve and assert val scores = isolationForestModel.transform(data).as[ScoringResult] val metrics = new ComputeModelStatistics() .setEvaluationMetric(MetricConstants.AucSparkMetric) .setLabelCol("label") .setScoredLabelsCol("predictedLabel") .setScoresCol("outlierScore") .transform(scores) // Expectation from results in the 2008 "Isolation Forest" paper by F. T. Liu, et al. val aurocExpectation = 0.86 val uncert = 0.02 val auroc = metrics.first().getDouble(1) assert(auroc === aurocExpectation +- uncert, "expected area under ROC =" + s" $aurocExpectation +/- $uncert, but observed $auroc") } def loadMammographyData(): DataFrame = { import session.implicits._ val mammographyRecordSchema = Encoders.product[MammographyRecord].schema val fileLocation = FileUtilities.join(BuildInfo.datasetDir,"IsolationForest", "mammography.csv").toString // Open source dataset from http://odds.cs.stonybrook.edu/mammography-dataset/ val rawData = session.read .format("csv") .option("comment", "#") .option("header", "false") .schema(mammographyRecordSchema) .load(fileLocation) val assembler = new VectorAssembler() .setInputCols(Array("feature0", "feature1", "feature2", "feature3", "feature4", "feature5")) .setOutputCol("features") val data = assembler .transform(rawData) .select("features", "label") data } override def reader: MLReadable[_] = IsolationForest override def modelReader: MLReadable[_] = IsolationForestModel override def testObjects(): Seq[TestObject[IsolationForest]] = { val dataset = loadMammographyData.toDF Seq(new TestObject( new IsolationForest(), dataset)) } }
Example 56
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable} import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer} import org.apache.spark.ml.param.ParamMap import org.apache.spark.sql.{DataFrame, Dataset, Row} import org.apache.spark.sql.functions.{col, struct, udf} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.types.{StructField, StructType} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions] class VowpalWabbitInteractions(override val uid: String) extends Transformer with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions with Wrappable with ComplexParamsWritable { def this() = this(Identifiable.randomUID("VowpalWabbitInteractions")) override def transform(dataset: Dataset[_]): DataFrame = { val fieldSubset = dataset.schema.fields .filter(f => getInputCols.contains(f.name)) val mask = getMask val mode = udf((r: Row) => { // compute the final number of features val numElems = (0 until r.length) .map(r.getAs[Vector](_).numNonzeros).product val newIndices = new Array[Int](numElems) val newValues = new Array[Double](numElems) // build interaction features using FNV-1 val fnvPrime = 16777619 var i = 0 def interact(idx: Int, value: Double, ns: Int): Unit = { if (ns == r.size) { newIndices(i) += mask & idx newValues(i) += value i += 1 } else { val idx1 = idx * fnvPrime r.getAs[Vector](ns).foreachActive { case (idx2, value2) => interact(idx1 ^ idx2, value * value2, ns + 1) } } } // start the recursion interact(0, 1, 0) val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions) Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted) }) dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*))) } override def transformSchema(schema: StructType): StructType = { val fieldNames = schema.fields.map(_.name) for (f <- getInputCols) if (!fieldNames.contains(f)) throw new IllegalArgumentException("missing input column " + f) else { val fieldType = schema.fields(schema.fieldIndex(f)).dataType if (fieldType != VectorType) throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName) } schema.add(StructField(getOutputCol, VectorType, true)) } override def copy(extra: ParamMap): VowpalWabbitFeaturizer = defaultCopy(extra) }
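A standalone sketch in plain Scala (not mmlspark API) of the index/value crossing the recursive interact function above performs for two namespaces: every active entry of one vector is paired with every active entry of the next, the indices are mixed FNV-1 style and masked into the configured bit range. The numBits value is an assumed setting standing in for getNumBits:

val fnvPrime = 16777619
val numBits = 30
val mask = (1 << numBits) - 1

val a = Seq((0, 1.0), (2, 3.0))  // active (index, value) pairs of the first vector
val b = Seq((5, 4.0))            // active (index, value) pairs of the second vector

val crossed = for ((i1, v1) <- a; (i2, v2) <- b)
  yield (mask & ((i1 * fnvPrime) ^ i2), v1 * v2)
// one interaction feature per pair, e.g. value 3.0 * 4.0 = 12.0 for the (2, 5) combination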
Example 57
Source File: VectorFeaturizer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw.featurizer import org.apache.spark.sql.Row import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import scala.collection.mutable override def featurize(row: Row, indices: mutable.ArrayBuilder[Int], values: mutable.ArrayBuilder[Double]): Unit = { row.getAs[Vector](fieldIdx) match { case v: DenseVector => // check if we need to hash if (v.size < mask + 1) indices ++= 0 until v.size else indices ++= (0 until v.size).map { mask & _ } values ++= v.values case v: SparseVector => // check if we need to hash if (v.size < mask + 1) indices ++= v.indices else indices ++= v.indices.map { mask & _ } values ++= v.values } () } }
Example 58
Source File: VowpalWabbitClassifier.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.env.InternalWrapper
import com.microsoft.ml.spark.core.schema.DatasetExtensions
import org.apache.spark.ml.ComplexParamsReadable
import org.apache.spark.ml.param._
import org.apache.spark.ml.util._
import org.apache.spark.ml.classification.{ProbabilisticClassificationModel, ProbabilisticClassifier}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql._
import org.apache.spark.sql.functions.{col, udf}
import org.vowpalwabbit.spark.VowpalWabbitExample
import com.microsoft.ml.spark.core.schema.DatasetExtensions._

import scala.math.exp

object VowpalWabbitClassifier extends DefaultParamsReadable[VowpalWabbitClassifier]

@InternalWrapper
class VowpalWabbitClassifier(override val uid: String)
  extends ProbabilisticClassifier[Row, VowpalWabbitClassifier, VowpalWabbitClassificationModel]
  with VowpalWabbitBase {

  def this() = this(Identifiable.randomUID("VowpalWabbitClassifier"))

  // to support Grid search we need to replicate the parameters here...
  val labelConversion = new BooleanParam(this, "labelConversion",
    "Convert 0/1 Spark ML style labels to -1/1 VW style labels. Defaults to true.")
  setDefault(labelConversion -> true)
  def getLabelConversion: Boolean = $(labelConversion)
  def setLabelConversion(value: Boolean): this.type = set(labelConversion, value)

  override protected def train(dataset: Dataset[_]): VowpalWabbitClassificationModel = {
    val model = new VowpalWabbitClassificationModel(uid)
      .setFeaturesCol(getFeaturesCol)
      .setAdditionalFeatures(getAdditionalFeatures)
      .setPredictionCol(getPredictionCol)
      .setProbabilityCol(getProbabilityCol)
      .setRawPredictionCol(getRawPredictionCol)

    val finalDataset = if (!getLabelConversion) dataset
    else {
      val inputLabelCol = dataset.withDerivativeCol("label")
      dataset
        .withColumnRenamed(getLabelCol, inputLabelCol)
        .withColumn(getLabelCol, col(inputLabelCol) * 2 - 1)
    }

    trainInternal(finalDataset, model)
  }

  override def copy(extra: ParamMap): VowpalWabbitClassifier = defaultCopy(extra)
}

// Preparation for multi-class learning, though it is no fun as numClasses is spread across multiple reductions
@InternalWrapper
class VowpalWabbitClassificationModel(override val uid: String)
  extends ProbabilisticClassificationModel[Row, VowpalWabbitClassificationModel]
  with VowpalWabbitBaseModel {

  def numClasses: Int = 2

  override def transform(dataset: Dataset[_]): DataFrame = {
    val df = transformImplInternal(dataset)

    // which mode one wants to use depends a bit on how this should be deployed:
    // 1. if you stay in Spark, w/o link=logistic is probably more convenient as it also returns the raw prediction
    // 2. if you want to export the model *and* get probabilities at scoring time, w/ link=logistic is preferable

    // convert raw prediction to probability (if needed)
    val probabilityUdf = if (vwArgs.getArgs.contains("--link logistic"))
      udf { (pred: Double) => Vectors.dense(Array(1 - pred, pred)) }
    else
      udf { (pred: Double) => {
        val prob = 1.0 / (1.0 + exp(-pred))
        Vectors.dense(Array(1 - prob, prob))
      } }

    val df2 = df.withColumn($(probabilityCol), probabilityUdf(col($(rawPredictionCol))))

    // convert probability to prediction
    val probability2predictionUdf = udf(probability2prediction _)
    df2.withColumn($(predictionCol), probability2predictionUdf(col($(probabilityCol))))
  }

  override def copy(extra: ParamMap): this.type = defaultCopy(extra)

  protected override def predictRaw(features: Row): Vector = {
    throw new NotImplementedError("Not implemented")
  }

  protected override def raw2probabilityInPlace(rawPrediction: Vector): Vector = {
    throw new NotImplementedError("Not implemented")
  }
}

object VowpalWabbitClassificationModel extends ComplexParamsReadable[VowpalWabbitClassificationModel]
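The probability UDF in the model above reduces to a sigmoid on the raw VW margin whenever --link logistic is not used. A small standalone sketch of that conversion (helper names are mine):

import org.apache.spark.ml.linalg.{Vector, Vectors}
import scala.math.exp

object RawToProbabilitySketch {
  // When VW already applied the logistic link, the raw prediction is the probability;
  // otherwise squash the margin with a sigmoid, then expand to [P(class 0), P(class 1)].
  def rawToProbability(rawPrediction: Double, linkLogistic: Boolean): Vector = {
    val p = if (linkLogistic) rawPrediction else 1.0 / (1.0 + exp(-rawPrediction))
    Vectors.dense(1 - p, p)
  }

  def main(args: Array[String]): Unit = {
    println(rawToProbability(0.8, linkLogistic = true))   // [0.2, 0.8]
    println(rawToProbability(0.0, linkLogistic = false))  // [0.5, 0.5]
  }
}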
Example 59
Source File: PredictGolfing.scala From spark_training with Apache License 2.0 | 5 votes |
package com.malaska.spark.training.machinelearning.golf

import com.malaska.spark.training.machinelearning.common.ClassifiersImpl
import com.malaska.spark.training.machinelearning.titanic.TrainPassenger
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.SparkSession

object PredictGolfing {
  def main(args: Array[String]): Unit = {
    val testFile = args(0)
    val trainFile = args(1)
    val testPercentage = args(2).toDouble

    val isLocal = true

    val sparkSession = if (isLocal) {
      SparkSession.builder
        .master("local")
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .config("spark.driver.host", "127.0.0.1")
        .config("spark.sql.parquet.compression.codec", "gzip")
        .enableHiveSupport()
        .getOrCreate()
    } else {
      SparkSession.builder
        .appName("my-spark-app")
        .config("spark.some.config.option", "config-value")
        .enableHiveSupport()
        .getOrCreate()
    }

    import sparkSession.implicits._

    // Load data
    val trainDs = sparkSession.read.option("header", "true")
      .option("charset", "UTF8")
      .option("delimiter", "\t")
      .csv(trainFile)
      .as[GolfDay]

    val labeledPointRdd = trainDs.rdd.map(golfDay => {
      // one-hot encode the categorical fields of GolfDay and keep the raw temperature
      val outlookSunny = if (golfDay.outlook.equals("sunny")) 1d else 0d
      val outlookRainy = if (golfDay.outlook.equals("rainy")) 1d else 0d
      val outlookOvercast = if (golfDay.outlook.equals("overcast")) 1d else 0d
      val temp = golfDay.temp.toDouble
      val tempHot = if (golfDay.tempEnum.equals("hot")) 1d else 0d
      val tempMild = if (golfDay.tempEnum.equals("mild")) 1d else 0d
      val tempCool = if (golfDay.tempEnum.equals("cool")) 1d else 0d
      val humidityHigh = if (golfDay.humidityEnum.equals("high")) 1d else 0d
      val humidityNormal = if (golfDay.humidityEnum.equals("normal")) 1d else 0d
      val windy = if (golfDay.windyFlag) 1d else 0d

      val play = if (golfDay.playFlag) 1d else 0d

      val vector: Vector = Vectors.dense(Array(outlookSunny, outlookRainy, outlookOvercast,
        temp, tempHot, tempMild, tempCool, humidityHigh, humidityNormal, windy))

      (play, vector)
    })

    val labeledPointDf = labeledPointRdd.toDF("passenderId", "features")

    ClassifiersImpl.naiveBayerTest(labeledPointDf, testPercentage)
    ClassifiersImpl.decisionTree(labeledPointDf, "gini", 7, 32, testPercentage)
    ClassifiersImpl.decisionTree(labeledPointDf, "entropy", 7, 32, testPercentage)
    ClassifiersImpl.randomForestRegressor(labeledPointDf, "variance", 5, 32, testPercentage)
    ClassifiersImpl.gbtClassifer(labeledPointDf, testPercentage)
    ClassifiersImpl.logisticRegression(labeledPointDf, testPercentage)
  }
}

case class GolfDay(outlook: String, temp: Int, tempEnum: String, humidity: Int,
  humidityEnum: String, windyFlag: Boolean, playFlag: Boolean)
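The hand-rolled feature vector above is just one-hot encoding of the categorical fields plus the raw temperature. A tiny sketch of the same idea (names are illustrative, not from the project):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object OneHotSketch {
  // One 0/1 slot per known category, as built by hand in the example above.
  def oneHot(value: String, categories: Seq[String]): Array[Double] =
    categories.map(c => if (c == value) 1.0 else 0.0).toArray

  def main(args: Array[String]): Unit = {
    val outlook = oneHot("sunny", Seq("sunny", "rainy", "overcast"))
    val features: Vector = Vectors.dense(outlook ++ Array(75.0))
    println(features)  // [1.0, 0.0, 0.0, 75.0]
  }
}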
Example 60
Source File: SparkModelConverter.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.base.binary.OpTransformer2 import com.salesforce.op.stages.impl.classification._ import com.salesforce.op.stages.impl.regression._ import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassificationModel, XGBoostRegressionModel} import org.apache.spark.ml.classification._ import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.regression._ import org.apache.spark.ml.{Model, PredictionModel} // TODO remove when loco and model selector are updated def toOPUnchecked( model: Model[_], uid: String ): OpTransformer2[RealNN, OPVector, Prediction] = { model match { case m: LogisticRegressionModel => new OpLogisticRegressionModel(m, uid = uid) case m: RandomForestClassificationModel => new OpRandomForestClassificationModel(m, uid = uid) case m: NaiveBayesModel => new OpNaiveBayesModel(m, uid) case m: DecisionTreeClassificationModel => new OpDecisionTreeClassificationModel(m, uid = uid) case m: GBTClassificationModel => new OpGBTClassificationModel(m, uid = uid) case m: LinearSVCModel => new OpLinearSVCModel(m, uid = uid) case m: MultilayerPerceptronClassificationModel => new OpMultilayerPerceptronClassificationModel(m, uid = uid) case m: LinearRegressionModel => new OpLinearRegressionModel(m, uid = uid) case m: RandomForestRegressionModel => new OpRandomForestRegressionModel(m, uid = uid) case m: GBTRegressionModel => new OpGBTRegressionModel(m, uid = uid) case m: DecisionTreeRegressionModel => new OpDecisionTreeRegressionModel(m, uid = uid) case m: GeneralizedLinearRegressionModel => new OpGeneralizedLinearRegressionModel(m, uid = uid) case m: XGBoostClassificationModel => new OpXGBoostClassificationModel(m, uid = uid) case m: XGBoostRegressionModel => new OpXGBoostRegressionModel(m, uid = uid) case m => throw new RuntimeException(s"model conversion not implemented for model $m") } } }
Example 61
Source File: XGBoostUtils.scala From pravda-ml with Apache License 2.0 | 5 votes |
package ml.dmlc.xgboost4j.scala.spark

import ml.dmlc.xgboost4j.scala.Booster
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{BooleanParam, Params}
import org.apache.spark.ml.param.shared.{HasFeaturesCol, HasPredictionCol}
import org.apache.spark.sql.{Dataset, functions}

object XGBoostUtils {
  def getBooster(x: XGBoostClassificationModel): Booster = x._booster

  def getBooster(x: XGBoostRegressionModel): Booster = x._booster
}

trait OkXGBoostParams extends HasFeaturesCol with HasPredictionCol {
  this: Params =>

  val densifyInput = new BooleanParam(this, "densifyInput",
    "Whether to densify input vectors, working around the difference between Spark and XGBoost sparsity treatment")
  val predictAsDouble = new BooleanParam(this, "predictAsDouble",
    "Whether to cast the XGBoost prediction to double, matching the common behavior of other predictors.")
  val addRawTrees = new BooleanParam(this, "addRawTrees",
    "Whether to add a raw trees block to the model summary.")
  val addSignificance = new BooleanParam(this, "addSignificance",
    "Whether to add a feature significance block to the model summary.")

  def setAddSignificance(value: Boolean): this.type = set(addSignificance, value)

  def setAddRawTrees(value: Boolean): this.type = set(addRawTrees, value)

  def setDensifyInput(value: Boolean): this.type = set(densifyInput, value)

  def setPredictAsDouble(value: Boolean): this.type = set(predictAsDouble, value)

  protected def densifyIfNeeded(dataset: Dataset[_]): Dataset[_] = {
    if ($(densifyInput)) {
      val densify = functions.udf((x: Vector) => x.toDense)
      val col = getFeaturesCol
      val metadata = dataset.schema(col).metadata

      dataset.withColumn(
        col,
        densify(dataset(col)).as(col, metadata))
    } else {
      dataset
    }
  }
}

trait OkXGBoostClassifierParams extends XGBoostClassifierParams with OkXGBoostParams

trait OkXGBoostRegressorParams extends XGBoostRegressorParams with OkXGBoostParams
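densifyIfNeeded boils down to a UDF that calls toDense on the features column while preserving its metadata. A self-contained sketch of the same transformation on a plain DataFrame (object name and toy data are mine):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf
import org.apache.spark.ml.linalg.{Vector, Vectors}

object DensifySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("densify").getOrCreate()
    import spark.implicits._

    val df = Seq(
      Tuple1(Vectors.sparse(3, Array(1), Array(2.0))),
      Tuple1(Vectors.dense(1.0, 0.0, 3.0))
    ).toDF("features")

    // Turn every row vector into its dense representation, as densifyIfNeeded does.
    val densify = udf((v: Vector) => v.toDense)
    df.withColumn("features", densify($"features")).show(truncate = false)

    spark.stop()
  }
}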
Example 62
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.feature.IDF import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.{Estimator, Transformer} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class IDFTest extends FlatSpec with TestSparkContext { val data = Seq( Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(4, Array(1), Array(1.0)) ) lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector)) Spec[IDF] should "compute inverted document frequency" in { val idf = f1.idf() val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((data.length + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } it should "compute inverted document frequency when minDocFreq is 1" in { val idf = f1.idf(minDocFreq = 1) val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } }
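The expected values in this test come from Spark's IDF formula, log((numDocs + 1) / (docFreq + 1)), with terms below minDocFreq weighted 0. A plain-Scala sketch of that formula (function name is mine):

object IdfFormulaSketch {
  // Mirrors the expectation built in the test above: smoothed inverse document
  // frequency, zeroed out for terms that appear in fewer than minDocFreq documents.
  def idf(numDocs: Long, docFreq: Long, minDocFreq: Int = 0): Double =
    if (docFreq >= minDocFreq) math.log((numDocs + 1.0) / (docFreq + 1.0)) else 0.0

  def main(args: Array[String]): Unit = {
    println(idf(numDocs = 3, docFreq = 3))  // term in every document -> 0.0
    println(idf(numDocs = 3, docFreq = 1))  // rare term -> log(2) ~ 0.693
  }
}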
Example 63
Source File: DropIndicesByTransformerTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.TransientFeature import com.salesforce.op.features.types._ import com.salesforce.op.test.{OpTransformerSpec, TestFeatureBuilder} import com.salesforce.op.testkit.RandomText import com.salesforce.op.utils.spark.{OpVectorColumnMetadata, OpVectorMetadata} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.apache.spark.sql.functions._ @RunWith(classOf[JUnitRunner]) class DropIndicesByTransformerTest extends OpTransformerSpec[OPVector, DropIndicesByTransformer] with AttributeAsserts { val (inputData, transformer) = { val vecData = Seq( Vectors.dense(1.0, 1.0, 0.0), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.0) ).map(_.toOPVector) val (data, v) = TestFeatureBuilder(vecData) val meta = OpVectorMetadata(v.name, Array(TransientFeature(v).toColumnMetaData()), Map.empty).toMetadata val inputData = data.withColumn(v.name, col(v.name).as(v.name, meta)) val stage = new DropIndicesByTransformer(new DropIndicesByTransformerTest.MatchFn) .setInput(v).setInputSchema(inputData.schema) inputData -> stage } val expectedResult = Seq( Vectors.dense(1.0), Vectors.dense(0.0), Vectors.dense(0.0) ).map(_.toOPVector) val picklistData = RandomText.pickLists(domain = List("Red", "Blue", "Green")).withProbabilityOfEmpty(0.0).limit(100) val (df, picklistFeature) = TestFeatureBuilder("color", picklistData) it should "filter vector using a predicate" in { val vectorizedPicklist = picklistFeature.vectorize(topK = 10, minSupport = 3, cleanText = false) val prunedVector = new DropIndicesByTransformer(_.indicatorValue.contains("Red")) .setInput(vectorizedPicklist) .getOutput() val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) val field = materializedFeatures.schema(prunedVector.name) val collectedFeatures = materializedFeatures.collect(prunedVector) assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true), collectedFeatures) collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach { r => if (r.getString(0) == "Red") r.getAs[Vector](2).toArray.forall(_ == 0) shouldBe true else r.getAs[Vector](2).toArray.max shouldBe 1 } val rawMeta = OpVectorMetadata(vectorizedPicklist.name, vectorizedPicklist.originStage.getMetadata()) val trimmedMeta = OpVectorMetadata(materializedFeatures.schema(prunedVector.name)) rawMeta.columns.length - 1 shouldBe trimmedMeta.columns.length trimmedMeta.columns.foreach(_.indicatorValue.contains("Red") shouldBe false) } it should "work with its shortcut" in { val vectorizedPicklist = picklistFeature.vectorize(topK = 10, minSupport = 3, cleanText = false) val prunedVector = vectorizedPicklist.dropIndicesBy(_.isNullIndicator) val materializedFeatures = new OpWorkflow().setResultFeatures(vectorizedPicklist, prunedVector).transform(df) val field = materializedFeatures.schema(prunedVector.name) val collectedFeatures = materializedFeatures.collect(prunedVector) assertNominal(field, Array.fill(collectedFeatures.head.value.size)(true), collectedFeatures) collectedFeatures.foreach(_.value.size shouldBe 4) materializedFeatures.collect().foreach(_.getAs[Vector](2).toArray.max shouldBe 1) val rawMeta = OpVectorMetadata(vectorizedPicklist.name, vectorizedPicklist.originStage.getMetadata()) val trimmedMeta = OpVectorMetadata(materializedFeatures.schema(prunedVector.name)) 
rawMeta.columns.length - 1 shouldBe trimmedMeta.columns.length trimmedMeta.columns.foreach(_.isNullIndicator shouldBe false) } it should "validate that the match function is serializable" in { class NonSerializable(val in: Int) val nonSer = new NonSerializable(5) val vectorizedPicklist = picklistFeature.vectorize(topK = 10, minSupport = 3, cleanText = false) intercept[IllegalArgumentException]( vectorizedPicklist.dropIndicesBy(_.indicatorValue.get == nonSer.in.toString) ).getMessage shouldBe "Provided function is not serializable" } } object DropIndicesByTransformerTest { class MatchFn extends Function1[OpVectorColumnMetadata, Boolean] with Serializable { def apply(m: OpVectorColumnMetadata): Boolean = m.isNullIndicator } }
Example 64
Source File: OpLDATest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.clustering.LDA import org.apache.spark.ml.linalg.{Vector, Vectors} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class OpLDATest extends FlatSpec with TestSparkContext { val inputData = Seq( (0.0, Vectors.sparse(11, Array(0, 1, 2, 4, 5, 6, 7, 10), Array(1.0, 2.0, 6.0, 2.0, 3.0, 1.0, 1.0, 3.0))), (1.0, Vectors.sparse(11, Array(0, 1, 3, 4, 7, 10), Array(1.0, 3.0, 1.0, 3.0, 2.0, 1.0))), (2.0, Vectors.sparse(11, Array(0, 1, 2, 5, 6, 8, 9), Array(1.0, 4.0, 1.0, 4.0, 9.0, 1.0, 2.0))), (3.0, Vectors.sparse(11, Array(0, 1, 3, 6, 8, 9, 10), Array(2.0, 1.0, 3.0, 5.0, 2.0, 3.0, 9.0))), (4.0, Vectors.sparse(11, Array(0, 1, 2, 3, 4, 6, 9, 10), Array(3.0, 1.0, 1.0, 9.0, 3.0, 2.0, 1.0, 3.0))), (5.0, Vectors.sparse(11, Array(0, 1, 3, 4, 5, 6, 7, 8, 9), Array(4.0, 2.0, 3.0, 4.0, 5.0, 1.0, 1.0, 1.0, 4.0))), (6.0, Vectors.sparse(11, Array(0, 1, 3, 6, 8, 9, 10), Array(2.0, 1.0, 3.0, 5.0, 2.0, 2.0, 9.0))), (7.0, Vectors.sparse(11, Array(0, 1, 2, 3, 4, 5, 6, 9, 10), Array(1.0, 1.0, 1.0, 9.0, 2.0, 1.0, 2.0, 1.0, 3.0))), (8.0, Vectors.sparse(11, Array(0, 1, 3, 4, 5, 6, 7), Array(4.0, 4.0, 3.0, 4.0, 2.0, 1.0, 3.0))), (9.0, Vectors.sparse(11, Array(0, 1, 2, 4, 6, 8, 9, 10), Array(2.0, 8.0, 2.0, 3.0, 2.0, 2.0, 7.0, 2.0))), (10.0, Vectors.sparse(11, Array(0, 1, 2, 3, 5, 6, 9, 10), Array(1.0, 1.0, 1.0, 9.0, 2.0, 2.0, 3.0, 3.0))), (11.0, Vectors.sparse(11, Array(0, 1, 4, 5, 6, 7, 9), Array(4.0, 1.0, 4.0, 5.0, 1.0, 3.0, 1.0))) ).map(v => v._1.toReal -> v._2.toOPVector) lazy val (ds, f1, f2) = TestFeatureBuilder(inputData) lazy val inputDS = ds.persist() val seed = 1234567890L val k = 3 val maxIter = 100 lazy val expected = new LDA() .setFeaturesCol(f2.name) .setK(k) .setSeed(seed) .fit(inputDS) .transform(inputDS) .select("topicDistribution") .collect() .toSeq .map(_.getAs[Vector](0)) Spec[OpLDA] should "convert document term vectors into topic vectors" in { val f2Vec = new OpLDA().setInput(f2).setK(k).setSeed(seed).setMaxIter(maxIter) val testTransformedData = f2Vec.fit(inputDS).transform(inputDS) val output = f2Vec.getOutput() val estimate = testTransformedData.collect(output) val mse = computeMeanSqError(estimate, expected) val expectedMse = 0.5 withClue(s"Computed mse $mse (expected $expectedMse)") { mse should be < expectedMse } } it should "convert document term vectors into topic vectors (shortcut version)" in { val output = f2.lda(k = k, seed = seed, maxIter = maxIter) val f2Vec = output.originStage.asInstanceOf[OpLDA] val testTransformedData = f2Vec.fit(inputDS).transform(inputDS) val estimate = testTransformedData.collect(output) val mse = computeMeanSqError(estimate, expected) val expectedMse = 0.5 withClue(s"Computed mse $mse (expected $expectedMse)") { mse should be < expectedMse } } private def computeMeanSqError(estimate: Seq[OPVector], expected: Seq[Vector]): Double = { val n = estimate.length.toDouble estimate.zip(expected).map { case (est, exp) => Vectors.sqdist(est.value, exp) }.sum / n } }
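OpLDA wraps org.apache.spark.ml.clustering.LDA, which the test uses directly to build its expectations. A minimal plain-Spark sketch of that baseline usage (toy corpus and object name are mine):

import org.apache.spark.ml.clustering.LDA
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.sql.SparkSession

object LdaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("lda").getOrCreate()
    import spark.implicits._

    // Tiny corpus of term-count vectors over a vocabulary of size 5.
    val docs = Seq(
      Vectors.sparse(5, Array(0, 1), Array(3.0, 1.0)),
      Vectors.sparse(5, Array(2, 3), Array(2.0, 4.0)),
      Vectors.dense(1.0, 0.0, 0.0, 2.0, 5.0)
    ).map(Tuple1.apply).toDF("features")

    val model = new LDA().setK(2).setMaxIter(20).setSeed(1234567890L).fit(docs)
    model.transform(docs)
      .select("topicDistribution")
      .collect()
      .foreach(row => println(row.getAs[Vector](0)))

    spark.stop()
  }
}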
Example 65
Source File: OpTransformerWrapperTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.feature.{Normalizer, StopWordsRemover} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.junit.runner.RunWith import org.scalatest.FlatSpec import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class OpTransformerWrapperTest extends FlatSpec with TestSparkContext { val (testData, featureVector) = TestFeatureBuilder( Seq[MultiPickList]( Set("I", "saw", "the", "red", "balloon").toMultiPickList, Set("Mary", "had", "a", "little", "lamb").toMultiPickList ) ) val (testDataNorm, _, _) = TestFeatureBuilder("label", "features", Seq[(Real, OPVector)]( 0.0.toReal -> Vectors.dense(1.0, 0.5, -1.0).toOPVector, 1.0.toReal -> Vectors.dense(2.0, 1.0, 1.0).toOPVector, 2.0.toReal -> Vectors.dense(4.0, 10.0, 2.0).toOPVector ) ) val (targetDataNorm, targetLabelNorm, featureVectorNorm) = TestFeatureBuilder("label", "features", Seq[(Real, OPVector)]( 0.0.toReal -> Vectors.dense(0.4, 0.2, -0.4).toOPVector, 1.0.toReal -> Vectors.dense(0.5, 0.25, 0.25).toOPVector, 2.0.toReal -> Vectors.dense(0.25, 0.625, 0.125).toOPVector ) ) Spec[OpTransformerWrapper[_, _, _]] should "remove stop words with caseSensitivity=true" in { val remover = new StopWordsRemover().setCaseSensitive(true) val swFilter = new OpTransformerWrapper[MultiPickList, MultiPickList, StopWordsRemover](remover).setInput(featureVector) val output = swFilter.transform(testData) output.collect(swFilter.getOutput()) shouldBe Array( Seq("I", "saw", "red", "balloon").toMultiPickList, Seq("Mary", "little", "lamb").toMultiPickList ) } it should "should properly normalize each feature vector instance with non-default norm of 1" in { val baseNormalizer = new Normalizer().setP(1.0) val normalizer = new OpTransformerWrapper[OPVector, OPVector, Normalizer](baseNormalizer).setInput(featureVectorNorm) val output = normalizer.transform(testDataNorm) val sumSqDist = validateDataframeDoubleColumn(output, normalizer.getOutput().name, targetDataNorm, "features") assert(sumSqDist <= 1E-6, "==> the sum of squared distances between actual and expected should be below tolerance.") } def validateDataframeDoubleColumn( normalizedFeatureDF: DataFrame, normalizedFeatureName: String, targetFeatureDF: DataFrame, targetColumnName: String ): Double = { val sqDistUdf = udf { (leftColVec: Vector, rightColVec: Vector) => Vectors.sqdist(leftColVec, rightColVec) } val targetColRename = "targetFeatures" val renamedTargedDF = targetFeatureDF.withColumnRenamed(targetColumnName, targetColRename) val joinedDF = normalizedFeatureDF.join(renamedTargedDF, Seq("label")) // compute sum of squared distances between expected and actual val finalDF = joinedDF.withColumn("sqDist", sqDistUdf(joinedDF(normalizedFeatureName), joinedDF(targetColRename))) val sumSqDist: Double = finalDF.agg(sum(finalDF("sqDist"))).first().getDouble(0) sumSqDist } }
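The Normalizer with p = 1 used in this test divides each row vector by its L1 norm, which is how the expected values (e.g. [0.4, 0.2, -0.4]) were derived. A standalone sketch of that computation (names are mine):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object L1NormalizeSketch {
  // What Normalizer.setP(1.0) does per row: divide every entry by the L1 norm.
  def normalizeL1(v: Vector): Vector = {
    val norm = Vectors.norm(v, 1.0)
    if (norm == 0.0) v else Vectors.dense(v.toArray.map(_ / norm))
  }

  def main(args: Array[String]): Unit = {
    println(normalizeL1(Vectors.dense(1.0, 0.5, -1.0)))  // [0.4, 0.2, -0.4]
  }
}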
Example 66
Source File: OpMultilayerPerceptronClassifier.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.classification import com.salesforce.op.UID import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.impl.CheckIsResponseValues import com.salesforce.op.stages.sparkwrappers.specific.{OpPredictorWrapper, OpProbabilisticClassifierModel} import com.salesforce.op.utils.reflection.ReflectionUtils.reflectMethod import org.apache.spark.ml.classification.{MultilayerPerceptronClassificationModel, MultilayerPerceptronClassifier, OpMultilayerPerceptronClassifierParams} import org.apache.spark.ml.linalg.Vector import scala.reflect.runtime.universe.TypeTag class OpMultilayerPerceptronClassificationModel ( sparkModel: MultilayerPerceptronClassificationModel, uid: String = UID[OpMultilayerPerceptronClassificationModel], operationName: String = classOf[MultilayerPerceptronClassifier].getSimpleName )( implicit tti1: TypeTag[RealNN], tti2: TypeTag[OPVector], tto: TypeTag[Prediction], ttov: TypeTag[Prediction#Value] ) extends OpProbabilisticClassifierModel[MultilayerPerceptronClassificationModel]( sparkModel = sparkModel, uid = uid, operationName = operationName ) { @transient lazy val predictRawMirror = reflectMethod(getSparkMlStage().get, "predictRaw") @transient lazy val raw2probabilityMirror = reflectMethod(getSparkMlStage().get, "raw2probability") @transient lazy val probability2predictionMirror = reflectMethod(getSparkMlStage().get, "probability2prediction") }
Example 67
Source File: VectorsCombiner.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature import com.salesforce.op.UID import com.salesforce.op.features.TransientFeature import com.salesforce.op.features.types._ import com.salesforce.op.stages.base.sequence.{SequenceEstimator, SequenceModel} import com.salesforce.op.utils.spark.OpVectorMetadata import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.sql.Dataset import scala.collection.mutable.ArrayBuffer import scala.util.Try private def updateMetadata(data: Dataset[Seq[OPVector#Value]]): Unit = { val schema = getInputSchema() lazy val firstRow = data.first() def vectorSize(f: TransientFeature, index: Int): Int = Try { AttributeGroup.fromStructField(schema(f.name)).numAttributes.get // see it there is an attribute group size } getOrElse firstRow(index).size // get the size from the data val attributes = inN.zipWithIndex.map { case (f, i) => Try(OpVectorMetadata(schema(f.name))).getOrElse(f.toVectorMetaData(vectorSize(f, i))) } val outMeta = OpVectorMetadata.flatten(getOutputFeatureName, attributes) setMetadata(outMeta.toMetadata) } } final class VectorsCombinerModel private[op] (operationName: String, uid: String) extends SequenceModel[OPVector, OPVector](operationName = operationName, uid = uid) { def transformFn: Seq[OPVector] => OPVector = s => s.toList match { case v1 :: v2 :: tail => v1.combine(v2, tail: _*) case v :: Nil => v case Nil => OPVector.empty } }
Example 68
Source File: OPLogLoss.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.evaluator import com.salesforce.op.evaluators.{Evaluators, OpBinaryClassificationEvaluatorBase, OpMultiClassificationEvaluatorBase, SingleMetric} import com.twitter.algebird.AveragedValue import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Dataset import com.salesforce.op.utils.spark.RichDataset.RichDataset object LogLoss { private def logLossFun(ds: Dataset[(Double, Vector, Vector, Double)]): Double = { import ds.sparkSession.implicits._ require(!ds.isEmpty, "Dataset is empty, log loss cannot be calculated") val avg = ds.map { case (lbl, _, prob, _) => new AveragedValue(count = 1L, value = -math.log(prob.toArray(lbl.toInt))) }.reduce(_ + _) avg.value } def binaryLogLoss: OpBinaryClassificationEvaluatorBase[SingleMetric] = Evaluators.BinaryClassification.custom( metricName = "BinarylogLoss", largerBetter = false, evaluateFn = logLossFun ) def multiLogLoss: OpMultiClassificationEvaluatorBase[SingleMetric] = Evaluators.MultiClassification.custom( metricName = "MultiClasslogLoss", largerBetter = false, evaluateFn = logLossFun ) }
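The evaluator averages -log of the probability assigned to the true label. A plain-Scala sketch of the same metric without Spark or Algebird (names are mine):

object LogLossSketch {
  // Mean negative log-likelihood of the probability of the true label,
  // matching the per-row term -log(prob(label)) averaged in the evaluator above.
  def logLoss(labels: Seq[Int], probabilities: Seq[Array[Double]]): Double = {
    require(labels.nonEmpty, "log loss is undefined on an empty dataset")
    val losses = labels.zip(probabilities).map { case (lbl, prob) => -math.log(prob(lbl)) }
    losses.sum / losses.size
  }

  def main(args: Array[String]): Unit = {
    val labels = Seq(1, 0, 1)
    val probs = Seq(Array(0.2, 0.8), Array(0.7, 0.3), Array(0.4, 0.6))
    println(logLoss(labels, probs))
  }
}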
Example 69
Source File: OpPredictorWrapper.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.sparkwrappers.specific import com.salesforce.op.UID import com.salesforce.op.features.types.{OPVector, Prediction, RealNN} import com.salesforce.op.stages.OpPipelineStage2 import com.salesforce.op.stages.base.binary.OpTransformer2 import com.salesforce.op.stages.sparkwrappers.generic.SparkWrapperParams import org.apache.spark.ml._ import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Dataset import scala.reflect.runtime.universe.TypeTag override def fit(dataset: Dataset[_]): OpPredictorWrapperModel[M] = { setInputSchema(dataset.schema).transformSchema(dataset.schema) copyValues(predictor) // when params are shared with wrapping class this will pass them into the model val p1 = predictor.getParam(inputParam1Name) val p2 = predictor.getParam(inputParam2Name) val po = predictor.getParam(outputParamName) val model: M = predictor .set(p1, in1.name) .set(p2, in2.name) .set(po, getOutputFeatureName) .fit(dataset) SparkModelConverter.toOP(model, uid) .setParent(this) .setInput(in1.asFeatureLike[RealNN], in2.asFeatureLike[OPVector]) .setMetadata(getMetadata()) .setOutputFeatureName(getOutputFeatureName) } } abstract class OpPredictorWrapperModel[M <: PredictionModel[Vector, M]] ( val operationName: String, val uid: String, val sparkModel: M )( implicit val tti1: TypeTag[RealNN], val tti2: TypeTag[OPVector], val tto: TypeTag[Prediction], val ttov: TypeTag[Prediction#Value] ) extends Model[OpPredictorWrapperModel[M]] with SparkWrapperParams[M] with OpTransformer2[RealNN, OPVector, Prediction] { setDefault(sparkMlStage, Option(sparkModel)) }
Example 70
Source File: LDA.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.clustering import scala.collection.mutable.{HashMap => MHashMap} import org.apache.commons.math3.random.Well19937c import org.apache.spark.ml.{Estimator, PipelineStage} import org.apache.spark.ml import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.ml.linalg.{Vector, Vectors} import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} import com.databricks.spark.sql.perf.mllib.OptionImplicits._ object LDA extends BenchmarkAlgorithm with TestFromTraining { // The LDA model is package private, no need to expose it. override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ val rdd = ctx.sqlContext.sparkContext.parallelize( 0L until numExamples, numPartitions ) val seed: Int = randomSeed val docLen = docLength.get val numVocab = vocabSize.get val data: RDD[(Long, Vector)] = rdd.mapPartitionsWithIndex { (idx, partition) => val rng = new Well19937c(seed ^ idx) partition.map { docIndex => var currentSize = 0 val entries = MHashMap[Int, Int]() while (currentSize < docLen) { val index = rng.nextInt(numVocab) entries(index) = entries.getOrElse(index, 0) + 1 currentSize += 1 } val iter = entries.toSeq.map(v => (v._1, v._2.toDouble)) (docIndex, Vectors.sparse(numVocab, iter)) } } ctx.sqlContext.createDataFrame(data).toDF("docIndex", "features") } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.clustering.LDA() .setK(k) .setSeed(randomSeed.toLong) .setMaxIter(maxIter) .setOptimizer(optimizer) } // TODO(?) add a scoring method here. }
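Each synthetic document above is just docLength uniform draws from the vocabulary accumulated into a sparse count vector. A standalone sketch of that generator using scala.util.Random instead of Well19937c (names are mine):

import scala.collection.mutable
import scala.util.Random
import org.apache.spark.ml.linalg.{Vector, Vectors}

object SyntheticDocSketch {
  // Sample docLen tokens uniformly from the vocabulary and accumulate their counts
  // into a sparse vector, as the trainingDataSet generator above does per document.
  def randomDoc(docLen: Int, vocabSize: Int, rng: Random): Vector = {
    val counts = mutable.Map.empty[Int, Double].withDefaultValue(0.0)
    (0 until docLen).foreach(_ => counts(rng.nextInt(vocabSize)) += 1.0)
    Vectors.sparse(vocabSize, counts.toSeq)
  }

  def main(args: Array[String]): Unit =
    println(randomDoc(docLen = 20, vocabSize = 10, new Random(42)))
}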
Example 71
Source File: RichVector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.spark import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.collection.mutable.ArrayBuffer def combine(vectors: Seq[Vector]): Vector = { val indices = ArrayBuffer.empty[Int] val values = ArrayBuffer.empty[Double] val size = vectors.foldLeft(0)((size, vector) => { vector.foreachActive { case (i, v) => if (v != 0.0) { indices += size + i values += v } } size + vector.size }) Vectors.sparse(size, indices.toArray, values.toArray).compressed } implicit class RichSparseVector(val v: SparseVector) extends AnyVal { def updated(index: Int, indexVal: Int, value: Double): SparseVector = { require(v.indices(index) == indexVal, s"Invalid index: indices($index)==${v.indices(index)}, expected: $indexVal") v.values(index) = value v } } }
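combine concatenates vectors by offsetting indices with the running size and dropping zeros. A self-contained sketch of the same concatenation with a small usage example (names are mine, not TransmogrifAI's):

import org.apache.spark.ml.linalg.{Vector, Vectors}
import scala.collection.mutable.ArrayBuffer

object ConcatVectorsSketch {
  // Concatenate vectors: shift each vector's indices by the running size and keep
  // only non-zero entries, then let compressed pick the cheaper representation.
  def concat(vectors: Seq[Vector]): Vector = {
    val indices = ArrayBuffer.empty[Int]
    val values = ArrayBuffer.empty[Double]
    val totalSize = vectors.foldLeft(0) { (offset, v) =>
      v.foreachActive { (i, value) =>
        if (value != 0.0) { indices += offset + i; values += value }
      }
      offset + v.size
    }
    Vectors.sparse(totalSize, indices.toArray, values.toArray).compressed
  }

  def main(args: Array[String]): Unit = {
    val combined = concat(Seq(Vectors.dense(1.0, 0.0), Vectors.sparse(3, Array(2), Array(5.0))))
    println(combined)  // (5,[0,4],[1.0,5.0])
  }
}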
Example 72
Source File: MLeapModelConverter.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.local import ml.combust.mleap.core.feature._ import org.apache.spark.ml.linalg.Vector def modelToFunction(model: Any): Array[Any] => Any = model match { case m: BinarizerModel => x => m.apply(x(0).asInstanceOf[Number].doubleValue()) case m: BucketedRandomProjectionLSHModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: BucketizerModel => x => m.apply(x(0).asInstanceOf[Number].doubleValue()) case m: ChiSqSelectorModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: CoalesceModel => x => m.apply(x: _*) case m: CountVectorizerModel => x => m.apply(x(0).asInstanceOf[Seq[String]]) case m: DCTModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: ElementwiseProductModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: FeatureHasherModel => x => m.apply(x(0).asInstanceOf[Seq[Any]]) case m: HashingTermFrequencyModel => x => m.apply(x(0).asInstanceOf[Seq[Any]]) case m: IDFModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: ImputerModel => x => m.apply(x(0).asInstanceOf[Number].doubleValue()) case m: InteractionModel => x => m.apply(x(0).asInstanceOf[Seq[Any]]) case m: MathBinaryModel => x => m.apply( x.headOption.map(_.asInstanceOf[Number].doubleValue()), x.lastOption.map(_.asInstanceOf[Number].doubleValue()) ) case m: MathUnaryModel => x => m.apply(x(0).asInstanceOf[Number].doubleValue()) case m: MaxAbsScalerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: MinHashLSHModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: MinMaxScalerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: NGramModel => x => m.apply(x(0).asInstanceOf[Seq[String]]) case m: NormalizerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: OneHotEncoderModel => x => m.apply(x(0).asInstanceOf[Vector].toArray) case m: PcaModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: PolynomialExpansionModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: RegexIndexerModel => x => m.apply(x(0).toString) case m: RegexTokenizerModel => x => m.apply(x(0).toString) case m: ReverseStringIndexerModel => x => m.apply(x(0).asInstanceOf[Number].intValue()) case m: StandardScalerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: StopWordsRemoverModel => x => m.apply(x(0).asInstanceOf[Seq[String]]) case m: StringIndexerModel => x => m.apply(x(0)) case m: StringMapModel => x => m.apply(x(0).toString) case m: TokenizerModel => x => m.apply(x(0).toString) case m: VectorAssemblerModel => x => m.apply(x(0).asInstanceOf[Seq[Any]]) case m: VectorIndexerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: VectorSlicerModel => x => m.apply(x(0).asInstanceOf[Vector]) case m: WordLengthFilterModel => x => m.apply(x(0).asInstanceOf[Seq[String]]) case m: WordToVectorModel => x => m.apply(x(0).asInstanceOf[Seq[String]]) case m => throw new RuntimeException(s"Unsupported MLeap model: ${m.getClass.getName}") } }
Example 73
Source File: ClassificationIntegrationTests.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package classification import frameless.ml.feature.{TypedIndexToString, TypedStringIndexer, TypedVectorAssembler} import org.apache.spark.ml.linalg.Vector import org.scalatest.matchers.must.Matchers class ClassificationIntegrationTests extends FramelessMlSuite with Matchers { test("predict field3 from field1 and field2 using a RandomForestClassifier") { case class Data(field1: Double, field2: Int, field3: String) // Training val trainingDataDs = TypedDataset.create(Seq.fill(10)(Data(0D, 10, "foo"))) case class Features(field1: Double, field2: Int) val vectorAssembler = TypedVectorAssembler[Features] case class DataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector) val dataWithFeatures = vectorAssembler.transform(trainingDataDs).as[DataWithFeatures] case class StringIndexerInput(field3: String) val indexer = TypedStringIndexer[StringIndexerInput] val indexerModel = indexer.fit(dataWithFeatures).run() case class IndexedDataWithFeatures(field1: Double, field2: Int, field3: String, features: Vector, indexedField3: Double) val indexedData = indexerModel.transform(dataWithFeatures).as[IndexedDataWithFeatures] case class RFInputs(indexedField3: Double, features: Vector) val rf = TypedRandomForestClassifier[RFInputs] val model = rf.fit(indexedData).run() // Prediction val testData = TypedDataset.create(Seq( Data(0D, 10, "foo") )) val testDataWithFeatures = vectorAssembler.transform(testData).as[DataWithFeatures] val indexedTestData = indexerModel.transform(testDataWithFeatures).as[IndexedDataWithFeatures] case class PredictionInputs(features: Vector, indexedField3: Double) val testInput = indexedTestData.project[PredictionInputs] case class PredictionResultIndexed( features: Vector, indexedField3: Double, rawPrediction: Vector, probability: Vector, predictedField3Indexed: Double ) val predictionDs = model.transform(testInput).as[PredictionResultIndexed] case class IndexToStringInput(predictedField3Indexed: Double) val indexToString = TypedIndexToString[IndexToStringInput](indexerModel.transformer.labels) case class PredictionResult( features: Vector, indexedField3: Double, rawPrediction: Vector, probability: Vector, predictedField3Indexed: Double, predictedField3: String ) val stringPredictionDs = indexToString.transform(predictionDs).as[PredictionResult] val prediction = stringPredictionDs.select(stringPredictionDs.col('predictedField3)).collect.run().toList prediction mustEqual List("foo") } }
Example 74
Source File: Generators.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml import frameless.ml.params.linears.{LossStrategy, Solver} import frameless.ml.params.trees.FeatureSubsetStrategy import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors} import org.scalacheck.{Arbitrary, Gen} object Generators { implicit val arbVector: Arbitrary[Vector] = Arbitrary { val genDenseVector = Gen.listOf(arbDouble.arbitrary).map(doubles => Vectors.dense(doubles.toArray)) val genSparseVector = genDenseVector.map(_.toSparse) Gen.oneOf(genDenseVector, genSparseVector) } implicit val arbMatrix: Arbitrary[Matrix] = Arbitrary { Gen.sized { size => for { nbRows <- Gen.choose(0, size) nbCols <- Gen.choose(1, size) matrix <- { Gen.listOfN(nbRows * nbCols, arbDouble.arbitrary) .map(values => Matrices.dense(nbRows, nbCols, values.toArray)) } } yield matrix } } implicit val arbTreesFeaturesSubsetStrategy: Arbitrary[FeatureSubsetStrategy] = Arbitrary { val genRatio = Gen.choose(0D, 1D).suchThat(_ > 0D).map(FeatureSubsetStrategy.Ratio) val genNumberOfFeatures = Gen.choose(1, Int.MaxValue).map(FeatureSubsetStrategy.NumberOfFeatures) Gen.oneOf(Gen.const(FeatureSubsetStrategy.All), Gen.const(FeatureSubsetStrategy.All), Gen.const(FeatureSubsetStrategy.Log2), Gen.const(FeatureSubsetStrategy.OneThird), Gen.const(FeatureSubsetStrategy.Sqrt), genRatio, genNumberOfFeatures ) } implicit val arbLossStrategy: Arbitrary[LossStrategy] = Arbitrary { Gen.const(LossStrategy.SquaredError) } implicit val arbSolver: Arbitrary[Solver] = Arbitrary { Gen.oneOf( Gen.const(Solver.LBFGS), Gen.const(Solver.Auto), Gen.const(Solver.Normal) ) } }
Example 75
Source File: TypedRandomForestClassifier.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package classification import frameless.ml.internals.TreesInputsChecker import frameless.ml.params.trees.FeatureSubsetStrategy import org.apache.spark.ml.classification.{RandomForestClassificationModel, RandomForestClassifier} import org.apache.spark.ml.linalg.Vector final class TypedRandomForestClassifier[Inputs] private[ml]( rf: RandomForestClassifier, labelCol: String, featuresCol: String ) extends TypedEstimator[Inputs, TypedRandomForestClassifier.Outputs, RandomForestClassificationModel] { val estimator: RandomForestClassifier = rf .setLabelCol(labelCol) .setFeaturesCol(featuresCol) .setPredictionCol(AppendTransformer.tempColumnName) .setRawPredictionCol(AppendTransformer.tempColumnName2) .setProbabilityCol(AppendTransformer.tempColumnName3) def setNumTrees(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setNumTrees(value)) def setMaxDepth(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxDepth(value)) def setMinInfoGain(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInfoGain(value)) def setMinInstancesPerNode(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMinInstancesPerNode(value)) def setMaxMemoryInMB(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxMemoryInMB(value)) def setSubsamplingRate(value: Double): TypedRandomForestClassifier[Inputs] = copy(rf.setSubsamplingRate(value)) def setFeatureSubsetStrategy(value: FeatureSubsetStrategy): TypedRandomForestClassifier[Inputs] = copy(rf.setFeatureSubsetStrategy(value.sparkValue)) def setMaxBins(value: Int): TypedRandomForestClassifier[Inputs] = copy(rf.setMaxBins(value)) private def copy(newRf: RandomForestClassifier): TypedRandomForestClassifier[Inputs] = new TypedRandomForestClassifier[Inputs](newRf, labelCol, featuresCol) } object TypedRandomForestClassifier { case class Outputs(rawPrediction: Vector, probability: Vector, prediction: Double) def apply[Inputs](implicit inputsChecker: TreesInputsChecker[Inputs]): TypedRandomForestClassifier[Inputs] = { new TypedRandomForestClassifier(new RandomForestClassifier(), inputsChecker.labelCol, inputsChecker.featuresCol) } }
Example 76
Source File: TypedVectorAssembler.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package feature import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.ml.linalg.Vector import shapeless.{HList, HNil, LabelledGeneric} import shapeless.ops.hlist.ToTraversable import shapeless.ops.record.{Keys, Values} import shapeless._ import scala.annotation.implicitNotFound final class TypedVectorAssembler[Inputs] private[ml](vectorAssembler: VectorAssembler, inputCols: Array[String]) extends AppendTransformer[Inputs, TypedVectorAssembler.Output, VectorAssembler] { val transformer: VectorAssembler = vectorAssembler .setInputCols(inputCols) .setOutputCol(AppendTransformer.tempColumnName) } object TypedVectorAssembler { case class Output(vector: Vector) def apply[Inputs](implicit inputsChecker: TypedVectorAssemblerInputsChecker[Inputs]): TypedVectorAssembler[Inputs] = { new TypedVectorAssembler(new VectorAssembler(), inputsChecker.inputCols.toArray) } } @implicitNotFound( msg = "Cannot prove that ${Inputs} is a valid input type. Input type must only contain fields of numeric or boolean types." ) private[ml] trait TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] } private[ml] object TypedVectorAssemblerInputsChecker { implicit def checkInputs[Inputs, InputsRec <: HList, InputsKeys <: HList, InputsVals <: HList]( implicit inputsGen: LabelledGeneric.Aux[Inputs, InputsRec], inputsKeys: Keys.Aux[InputsRec, InputsKeys], inputsKeysTraverse: ToTraversable.Aux[InputsKeys, Seq, Symbol], inputsValues: Values.Aux[InputsRec, InputsVals], inputsTypeCheck: TypedVectorAssemblerInputsValueChecker[InputsVals] ): TypedVectorAssemblerInputsChecker[Inputs] = new TypedVectorAssemblerInputsChecker[Inputs] { val inputCols: Seq[String] = inputsKeys.apply.to[Seq].map(_.name) } } private[ml] trait TypedVectorAssemblerInputsValueChecker[InputsVals] private[ml] object TypedVectorAssemblerInputsValueChecker { implicit def hnilCheckInputsValue: TypedVectorAssemblerInputsValueChecker[HNil] = new TypedVectorAssemblerInputsValueChecker[HNil] {} implicit def hlistCheckInputsValueNumeric[H, T <: HList]( implicit ch: CatalystNumeric[H], tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[H :: T] = new TypedVectorAssemblerInputsValueChecker[H :: T] {} implicit def hlistCheckInputsValueBoolean[T <: HList]( implicit tt: TypedVectorAssemblerInputsValueChecker[T] ): TypedVectorAssemblerInputsValueChecker[Boolean :: T] = new TypedVectorAssemblerInputsValueChecker[Boolean :: T] {} }
Example 77
Source File: VectorInputsChecker.scala From frameless with Apache License 2.0 | 5 votes |
package frameless package ml package internals import shapeless.ops.hlist.Length import shapeless.{HList, LabelledGeneric, Nat, Witness} import scala.annotation.implicitNotFound import org.apache.spark.ml.linalg.Vector @implicitNotFound( msg = "Cannot prove that ${Inputs} is a valid input type. " + "Input type must only contain a field of type org.apache.spark.ml.linalg.Vector (the features)." ) trait VectorInputsChecker[Inputs] { val featuresCol: String } object VectorInputsChecker { implicit def checkVectorInput[Inputs, InputsRec <: HList, FeaturesK <: Symbol]( implicit i0: LabelledGeneric.Aux[Inputs, InputsRec], i1: Length.Aux[InputsRec, Nat._1], i2: SelectorByValue.Aux[InputsRec, Vector, FeaturesK], i3: Witness.Aux[FeaturesK] ): VectorInputsChecker[Inputs] = { new VectorInputsChecker[Inputs] { val featuresCol: String = i3.value.name } } }
Example 78
Source File: ProbabilisticClassifierSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(values: Double*): Double = { predict(Vectors.dense(values.toArray)) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.5, 0.2)) assert(testModel.friendlyPredict(1.0, 1.0) === 1.0) assert(testModel.friendlyPredict(1.0, 0.2) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(1.0, 2.0) === 1.0) } test("test tiebreak") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.4, 0.4)) assert(testModel.friendlyPredict(0.6, 0.6) === 0.0) } test("test one zero threshold") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.0, 0.1)) assert(testModel.friendlyPredict(1.0, 10.0) === 0.0) assert(testModel.friendlyPredict(0.0, 10.0) === 1.0) } test("bad thresholds") { intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0)) } intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
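The thresholding behavior these tests pin down can be summarized as: scale each class probability by 1/threshold and take the argmax, with a zero threshold winning whenever its class has any probability mass, and ties going to the lower index. A sketch of that rule derived from the test expectations above (names are mine):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object ThresholdingSketch {
  // Pick the class with the largest probability/threshold ratio; a zero threshold
  // makes its class win as soon as it has non-zero probability.
  def predict(probability: Vector, thresholds: Array[Double]): Double = {
    val scaled = probability.toArray.zip(thresholds).map {
      case (p, 0.0) => if (p > 0.0) Double.PositiveInfinity else 0.0
      case (p, t) => p / t
    }
    scaled.indices.maxBy(scaled(_)).toDouble  // maxBy keeps the first index on ties
  }

  def main(args: Array[String]): Unit = {
    println(predict(Vectors.dense(1.0, 0.2), Array(0.5, 0.2)))  // 0.0, as in the tests above
    println(predict(Vectors.dense(0.0, 10.0), Array(0.0, 0.1))) // 1.0
  }
}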
Example 79
Source File: MultivariateGaussian.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.stat.distribution import breeze.linalg.{diag, eigSym, max, DenseMatrix => BDM, DenseVector => BDV, Vector => BV} import org.apache.spark.annotation.{DeveloperApi, Since} import org.apache.spark.ml.impl.Utils import org.apache.spark.ml.linalg.{Matrices, Matrix, Vector, Vectors} private def calculateCovarianceConstants: (BDM[Double], Double) = { val eigSym.EigSym(d, u) = eigSym(cov.asBreeze.toDenseMatrix) // sigma = u * diag(d) * u.t // For numerical stability, values are considered to be non-zero only if they exceed tol. // This prevents any inverted value from exceeding (eps * n * max(d))^-1 val tol = Utils.EPSILON * max(d) * d.length try { // log(pseudo-determinant) is sum of the logs of all non-zero singular values val logPseudoDetSigma = d.activeValuesIterator.filter(_ > tol).map(math.log).sum // calculate the root-pseudo-inverse of the diagonal matrix of singular values // by inverting the square root of all non-zero values val pinvS = diag(new BDV(d.map(v => if (v > tol) math.sqrt(1.0 / v) else 0.0).toArray)) (pinvS * u.t, -0.5 * (mean.size * math.log(2.0 * math.Pi) + logPseudoDetSigma)) } catch { case uex: UnsupportedOperationException => throw new IllegalArgumentException("Covariance matrix has no non-zero singular values") } } }
Example 80
Source File: Word2VecExample.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession object Word2VecExample { def main(args: Array[String]) { val spark = SparkSession .builder .appName("Word2Vec example") .getOrCreate() // $example on$ // Input data: Each row is a bag of words from a sentence or document. val documentDF = spark.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text") // Learn a mapping from words to Vectors. val word2Vec = new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) result.collect().foreach { case Row(text: Seq[_], features: Vector) => println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() } } // scalastyle:on println
Example 81
Source File: ElementwiseProduct.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
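ElementwiseProduct applies a Hadamard product between each row vector and the fixed scaling vector. A standalone sketch of that operation (names are mine):

import org.apache.spark.ml.linalg.{Vector, Vectors}

object ElementwiseProductSketch {
  // Multiply each entry of the row vector by the matching entry of the scaling vector,
  // which is what the transformer above delegates to mllib's ElementwiseProduct for.
  def scale(v: Vector, scalingVec: Vector): Vector =
    Vectors.dense(v.toArray.zip(scalingVec.toArray).map { case (a, b) => a * b })

  def main(args: Array[String]): Unit = {
    println(scale(Vectors.dense(1.0, 2.0, 3.0), Vectors.dense(0.0, 1.0, 2.0)))  // [0.0, 2.0, 6.0]
  }
}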
Example 82
Source File: Normalizer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
Example 83
Source File: IDF.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.hadoop.fs.Path import org.apache.spark.annotation.Since import org.apache.spark.ml._ import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.util.MLUtils import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.StructType @Since("2.0.0") def idf: Vector = idfModel.idf.asML @Since("1.6.0") override def write: MLWriter = new IDFModelWriter(this) } @Since("1.6.0") object IDFModel extends MLReadable[IDFModel] { private[IDFModel] class IDFModelWriter(instance: IDFModel) extends MLWriter { private case class Data(idf: Vector) override protected def saveImpl(path: String): Unit = { DefaultParamsWriter.saveMetadata(instance, path, sc) val data = Data(instance.idf) val dataPath = new Path(path, "data").toString sparkSession.createDataFrame(Seq(data)).repartition(1).write.parquet(dataPath) } } private class IDFModelReader extends MLReader[IDFModel] { private val className = classOf[IDFModel].getName override def load(path: String): IDFModel = { val metadata = DefaultParamsReader.loadMetadata(path, sc, className) val dataPath = new Path(path, "data").toString val data = sparkSession.read.parquet(dataPath) val Row(idf: Vector) = MLUtils.convertVectorColumnsToML(data, "idf") .select("idf") .head() val model = new IDFModel(metadata.uid, new feature.IDFModel(OldVectors.fromML(idf))) DefaultParamsReader.getAndSetParams(model, metadata) model } } @Since("1.6.0") override def read: MLReader[IDFModel] = new IDFModelReader @Since("1.6.0") override def load(path: String): IDFModel = super.load(path) }
Example 84
Source File: BinaryClassificationEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
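A minimal usage sketch of BinaryClassificationEvaluator on a toy scored DataFrame, relying on the vector/double handling shown in evaluate() above (object name and data are mine):

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object EvaluatorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.master("local[*]").appName("auc").getOrCreate()
    import spark.implicits._

    // rawPrediction may be either a 2-element vector of class scores or a plain double,
    // as handled by the pattern match in evaluate() above.
    val scored = Seq(
      (Vectors.dense(0.1, 0.9), 1.0),
      (Vectors.dense(0.8, 0.2), 0.0),
      (Vectors.dense(0.4, 0.6), 1.0),
      (Vectors.dense(0.7, 0.3), 0.0)
    ).toDF("rawPrediction", "label")

    val auc = new BinaryClassificationEvaluator()
      .setRawPredictionCol("rawPrediction")
      .setLabelCol("label")
      .setMetricName("areaUnderROC")
      .evaluate(scored)
    println(s"areaUnderROC = $auc")  // 1.0 for this perfectly separated toy set

    spark.stop()
  }
}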
Example 85
Source File: VectorSlicerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
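For reference, a minimal sketch of the VectorSlicer transformer the suite above exercises; column names are illustrative, and slicing by name (as opposed to by index) relies on attribute metadata like the AttributeGroup built in the test:

import org.apache.spark.ml.feature.VectorSlicer

val slicer = new VectorSlicer()
  .setInputCol("features")
  .setOutputCol("result")
  .setIndices(Array(1, 4))                // and/or .setNames(Array("f1", "f4")) when metadata is present

val sliced = slicer.transform(inputDF)    // inputDF is an assumed DataFrame with a Vector "features" column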
Example 86
Source File: MaxAbsScalerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
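A minimal usage sketch of MaxAbsScaler itself, assuming df is a DataFrame with a Vector column "features":

import org.apache.spark.ml.feature.MaxAbsScaler

val scaler = new MaxAbsScaler()
  .setInputCol("features")
  .setOutputCol("scaled")

val model = scaler.fit(df)        // learns the per-dimension maximum absolute values
println(model.maxAbs)             // the Vector the read/write test above round-trips
val scaled = model.transform(df)  // each feature divided by its max abs value, so results lie in [-1, 1]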
Example 87
Source File: ChiSqSelectorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("Test Chi-Square selector") { import testImplicits._ val data = Seq( LabeledPoint(0.0, Vectors.sparse(3, Array((0, 8.0), (1, 7.0)))), LabeledPoint(1.0, Vectors.sparse(3, Array((1, 9.0), (2, 6.0)))), LabeledPoint(1.0, Vectors.dense(Array(0.0, 9.0, 8.0))), LabeledPoint(2.0, Vectors.dense(Array(8.0, 9.0, 5.0))) ) val preFilteredData = Seq( Vectors.dense(8.0), Vectors.dense(0.0), Vectors.dense(0.0), Vectors.dense(8.0) ) val df = sc.parallelize(data.zip(preFilteredData)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") val selector = new ChiSqSelector() .setSelectorType("kbest") .setNumTopFeatures(1) .setFeaturesCol("data") .setLabelCol("label") .setOutputCol("filtered") selector.fit(df).transform(df).select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } selector.setSelectorType("percentile").setPercentile(0.34).fit(df).transform(df) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } val preFilteredData2 = Seq( Vectors.dense(8.0, 7.0), Vectors.dense(0.0, 9.0), Vectors.dense(0.0, 9.0), Vectors.dense(8.0, 9.0) ) val df2 = sc.parallelize(data.zip(preFilteredData2)) .map(x => (x._1.label, x._1.features, x._2)) .toDF("label", "data", "preFilteredData") selector.setSelectorType("fpr").setAlpha(0.2).fit(df2).transform(df2) .select("filtered", "preFilteredData").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 ~== vec2 absTol 1e-1) } } test("ChiSqSelector read/write") { val t = new ChiSqSelector() .setFeaturesCol("myFeaturesCol") .setLabelCol("myLabelCol") .setOutputCol("myOutputCol") .setNumTopFeatures(2) testDefaultReadWrite(t) } test("ChiSqSelectorModel read/write") { val oldModel = new feature.ChiSqSelectorModel(Array(1, 3)) val instance = new ChiSqSelectorModel("myChiSqSelectorModel", oldModel) val newInstance = testDefaultReadWrite(instance) assert(newInstance.selectedFeatures === instance.selectedFeatures) } test("should support all NumericType labels and not support other types") { val css = new ChiSqSelector() MLTestingUtils.checkNumericTypes[ChiSqSelectorModel, ChiSqSelector]( css, spark) { (expected, actual) => assert(expected.selectedFeatures === actual.selectedFeatures) } } }
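A minimal sketch of ChiSqSelector on a labeled DataFrame (column names are illustrative). Note that the selector-type values used in the suite ("kbest", "percentile", "fpr" with setAlpha) belong to the Spark version vendored in drizzle-spark; later Spark releases renamed some of them, so treat the exact names as version-dependent:

import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(1)
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selected")

val model = selector.fit(labeledDF)            // labeledDF is an assumed DataFrame("label", "features")
println(model.selectedFeatures.mkString(","))  // indices of the features kept by the chi-squared test
val reduced = model.transform(labeledDF)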
Example 88
Source File: DCTSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
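The transformer under test can also be used directly; a minimal sketch, assuming a SparkSession spark with spark.implicits._ in scope:

import org.apache.spark.ml.feature.DCT
import org.apache.spark.ml.linalg.Vectors

val df = Seq(
  Tuple1(Vectors.dense(0.0, 1.0, -2.0, 3.0)),
  Tuple1(Vectors.dense(-1.0, 2.0, 4.0, -7.0))
).toDF("features")

val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)                 // forward DCT, matching the jTransforms reference in testDCT above

dct.transform(df).select("featuresDCT").show(false)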
Example 89
Source File: BinarizerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
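A minimal sketch of the Binarizer exercised above, which works on both Double and Vector input columns; column names and the threshold are illustrative:

import org.apache.spark.ml.feature.Binarizer

val binarizer = new Binarizer()
  .setInputCol("feature")
  .setOutputCol("binarized_feature")
  .setThreshold(0.2)                 // values greater than 0.2 map to 1.0, the rest to 0.0

val binarized = binarizer.transform(continuousDF)   // continuousDF is an assumed input DataFrame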
Example 90
Source File: HashingTFSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
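A minimal sketch of HashingTF on a tokenized column, mirroring the suite above; names are illustrative:

import org.apache.spark.ml.feature.HashingTF

val hashingTF = new HashingTF()
  .setInputCol("words")
  .setOutputCol("features")
  .setNumFeatures(1 << 18)           // dimensionality of the hashed feature space
  .setBinary(false)                  // true yields 0/1 indicators instead of term counts

val hashed = hashingTF.transform(tokenizedDF)   // tokenizedDF is an assumed DataFrame with a Seq[String] "words" column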
Example 91
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
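A minimal sketch of the libsvm data source the suite exercises; the paths are illustrative and the options shown are the ones used in the tests above, with spark an assumed SparkSession:

val df = spark.read
  .format("libsvm")
  .option("numFeatures", "6")          // optional; inferred from the data when omitted
  .option("vectorType", "sparse")      // or "dense"
  .load("data/mllib/sample_libsvm_data.txt")

df.printSchema()                       // label: double, features: vector

// Writing back out; the suite coalesces to a single partition before saving.
df.coalesce(1).write.format("libsvm").mode("overwrite").save("/tmp/libsvm-out")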
Example 92
Source File: ProbabilisticClassifierSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(values: Double*): Double = { predict(Vectors.dense(values.toArray)) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.5, 0.2)) assert(testModel.friendlyPredict(1.0, 1.0) === 1.0) assert(testModel.friendlyPredict(1.0, 0.2) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(1.0, 2.0) === 1.0) } test("test tiebreak") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.4, 0.4)) assert(testModel.friendlyPredict(0.6, 0.6) === 0.0) } test("test one zero threshold") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.0, 0.1)) assert(testModel.friendlyPredict(1.0, 10.0) === 0.0) assert(testModel.friendlyPredict(0.0, 10.0) === 1.0) } test("bad thresholds") { intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0)) } intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
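The thresholding behaviour checked above applies to any ProbabilisticClassificationModel: the predicted class is the one maximising probability(i) / threshold(i), which is what the tie-break and zero-threshold tests verify. A hedged sketch on a concrete classifier (data and column names are assumptions):

import org.apache.spark.ml.classification.LogisticRegression

val lr = new LogisticRegression()
  .setThresholds(Array(0.5, 0.2))     // per-class thresholds; scores are scaled by 1/threshold before argmax

val model = lr.fit(trainDF)            // trainDF is an assumed DataFrame("label", "features")
val scored = model.transform(testDF)   // adds rawPrediction, probability and prediction columns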
Example 93
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.Vector class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel) extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {} object LocalLogisticRegressionModel extends SimpleModelLoader[LogisticRegressionModel] with TypedTransformerConverter[LogisticRegressionModel] { override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = { val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Double] ) constructor.setAccessible(true) val coefficientsParams = data.column("coefficients").get.data.head.asInstanceOf[Map[String, Any]] val coefficients = DataUtils.constructVector(coefficientsParams) constructor .newInstance( metadata.uid, coefficients, data.column("intercept").get.data.head.asInstanceOf[java.lang.Double] ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double]) } override implicit def toLocal( sparkTransformer: LogisticRegressionModel ): LocalLogisticRegressionModel = { new LocalLogisticRegressionModel(sparkTransformer) } }
Example 94
Source File: LocalLinearSVCModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.classification.LocalClassificationModel import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LinearSVCModel import org.apache.spark.ml.linalg.Vector class LocalLinearSVCModel(override val sparkTransformer: LinearSVCModel) extends LocalClassificationModel[LinearSVCModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => var result = localData sparkTransformer.get(sparkTransformer.rawPredictionCol).foreach { name => val res = LocalDataColumn( name, column.data.map(_.asInstanceOf[List[Double]]).map(predictRaw) ) result = result.withColumn(res) } sparkTransformer.get(sparkTransformer.predictionCol).foreach { name => val res = LocalDataColumn(name, column.data.map(_.asInstanceOf[List[Double]]).map(predict)) result = result.withColumn(res) } result case None => localData } } } object LocalLinearSVCModel extends SimpleModelLoader[LinearSVCModel] with TypedTransformerConverter[LinearSVCModel] { override def build(metadata: Metadata, data: LocalData): LinearSVCModel = { val coefficients = DataUtils.constructVector( data.column("coefficients").get.data.head.asInstanceOf[Map[String, Any]] ) val cls = classOf[LinearSVCModel].getConstructor( classOf[String], classOf[Vector], classOf[Double] ) val inst = cls.newInstance( metadata.uid, coefficients, data.column("intercept").get.data.head.asInstanceOf[java.lang.Double] ) inst .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) .set(inst.labelCol, metadata.paramMap("labelCol").toString) .set(inst.aggregationDepth, metadata.paramMap("aggregationDepth").toString.toInt) .set(inst.fitIntercept, metadata.paramMap("fitIntercept").toString.toBoolean) .set(inst.maxIter, metadata.paramMap("maxIter").toString.toInt) .set(inst.regParam, metadata.paramMap("regParam").toString.toDouble) .set(inst.standardization, metadata.paramMap("standardization").toString.toBoolean) .set(inst.threshold, metadata.paramMap("threshold").toString.toDouble) .set(inst.tol, metadata.paramMap("tol").toString.toDouble) } override implicit def toLocal(transformer: LinearSVCModel): LocalTransformer[LinearSVCModel] = new LocalLinearSVCModel(transformer) }
Example 95
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrix, Vector} class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel) extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {} object LocalLogisticRegressionModel extends SimpleModelLoader[LogisticRegressionModel] with TypedTransformerConverter[LogisticRegressionModel] { override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = { val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor( classOf[String], classOf[Matrix], classOf[Vector], classOf[Int], java.lang.Boolean.TYPE ) constructor.setAccessible(true) val coefficientMatrixParams = data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]] val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams) val interceptVectorParams = data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]] val interceptVector = DataUtils.constructVector(interceptVectorParams) constructor .newInstance( metadata.uid, coefficientMatrix, interceptVector, data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer], data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean] ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double]) } override implicit def toLocal( transformer: LogisticRegressionModel ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer) }
Example 96
Source File: LocalLogisticRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import java.lang.Boolean import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.LogisticRegressionModel import org.apache.spark.ml.linalg.{Matrix, SparseMatrix, Vector, Vectors} class LocalLogisticRegressionModel(override val sparkTransformer: LogisticRegressionModel) extends LocalProbabilisticClassificationModel[LogisticRegressionModel] {} object LocalLogisticRegressionModel extends SimpleModelLoader[LogisticRegressionModel] with TypedTransformerConverter[LogisticRegressionModel] { override def build(metadata: Metadata, data: LocalData): LogisticRegressionModel = { val constructor = classOf[LogisticRegressionModel].getDeclaredConstructor( classOf[String], classOf[Matrix], classOf[Vector], classOf[Int], java.lang.Boolean.TYPE ) constructor.setAccessible(true) val coefficientMatrixParams = data.column("coefficientMatrix").get.data.head.asInstanceOf[Map[String, Any]] val coefficientMatrix = DataUtils.constructMatrix(coefficientMatrixParams) val interceptVectorParams = data.column("interceptVector").get.data.head.asInstanceOf[Map[String, Any]] val interceptVector = DataUtils.constructVector(interceptVectorParams) constructor .newInstance( metadata.uid, coefficientMatrix, interceptVector, data.column("numFeatures").get.data.head.asInstanceOf[java.lang.Integer], data.column("isMultinomial").get.data.head.asInstanceOf[java.lang.Boolean] ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) .setThreshold(metadata.paramMap("threshold").asInstanceOf[Double]) } override implicit def toLocal( transformer: LogisticRegressionModel ): LocalTransformer[LogisticRegressionModel] = new LocalLogisticRegressionModel(transformer) }
Example 97
Source File: LocalGaussianMixtureModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.clustering import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import org.apache.spark.ml.clustering.GaussianMixtureModel import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.stat.distribution.MultivariateGaussian class LocalGaussianMixtureModel(override val sparkTransformer: GaussianMixtureModel) extends LocalTransformer[GaussianMixtureModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val predictMethod = classOf[GaussianMixtureModel].getMethod("predict", classOf[Vector]) predictMethod.setAccessible(true) val newColumn = LocalDataColumn(sparkTransformer.getPredictionCol, column.data.mapToMlVectors map { predictMethod.invoke(sparkTransformer, _).asInstanceOf[Int] }) localData.withColumn(newColumn) case None => localData } } } object LocalGaussianMixtureModel extends SimpleModelLoader[GaussianMixtureModel] with TypedTransformerConverter[GaussianMixtureModel] { override def build(metadata: Metadata, data: LocalData): GaussianMixtureModel = { val weights = data.column("weights").get.data.head.asInstanceOf[Seq[Double]].toArray val mus = data.column("mus").get.data.head.asInstanceOf[Seq[Map[String, Any]]] val sigmas = data.column("sigmas").get.data.head.asInstanceOf[Seq[Map[String, Any]]] val sigMatrices = sigmas.map(DataUtils.constructMatrix) val musVecs = mus.map(DataUtils.constructVector) val gaussians = musVecs zip sigMatrices map { case (mu, sigma) => new MultivariateGaussian(mu, sigma) } val constructor = classOf[GaussianMixtureModel].getDeclaredConstructor( classOf[String], classOf[Array[Double]], classOf[Array[MultivariateGaussian]] ) constructor.setAccessible(true) var inst = constructor.newInstance(metadata.uid, weights, gaussians.toArray) inst = inst.set(inst.probabilityCol, metadata.paramMap("probabilityCol").asInstanceOf[String]) inst = inst.set(inst.featuresCol, metadata.paramMap("featuresCol").asInstanceOf[String]) inst = inst.set(inst.predictionCol, metadata.paramMap("predictionCol").asInstanceOf[String]) inst } override implicit def toLocal( transformer: GaussianMixtureModel ) = new LocalGaussianMixtureModel(transformer) }
Example 98
Source File: LocalDCT.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.DCT import org.apache.spark.ml.linalg.Vector class LocalDCT(override val sparkTransformer: DCT) extends LocalTransformer[DCT] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val method = classOf[DCT].getMethod("createTransformFunc") val newData = column.data.mapToMlVectors.map { r => method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](r).toList } localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalDCT extends SimpleModelLoader[DCT] with TypedTransformerConverter[DCT] { override def build(metadata: Metadata, data: LocalData): DCT = { new DCT(metadata.uid) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setInverse(metadata.paramMap("inverse").asInstanceOf[Boolean]) } override implicit def toLocal(transformer: DCT) = new LocalDCT(transformer) }
Example 99
Source File: LocalNormalizer.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.linalg.Vector class LocalNormalizer(override val sparkTransformer: Normalizer) extends LocalTransformer[Normalizer] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val method = classOf[Normalizer].getMethod("createTransformFunc") val newData = column.data.mapToMlVectors.map { vector => method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](vector).toList } localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalNormalizer extends SimpleModelLoader[Normalizer] with TypedTransformerConverter[Normalizer] { override def build(metadata: Metadata, data: LocalData): Normalizer = { new Normalizer(metadata.uid) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setP(metadata.paramMap("p").toString.toDouble) } override implicit def toLocal(transformer: Normalizer) = new LocalNormalizer(transformer) }
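For reference, a minimal sketch of the Spark-side Normalizer whose createTransformFunc the local transformer above invokes reflectively; column names are illustrative:

import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(2.0)                           // the "p" param that build() reads back from metadata

val normalized = normalizer.transform(df)   // df is an assumed DataFrame with a Vector "features" column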
Example 100
Source File: LocalPolynomialExpansion.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import org.apache.spark.ml.feature.PolynomialExpansion import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalPolynomialExpansion(override val sparkTransformer: PolynomialExpansion) extends LocalTransformer[PolynomialExpansion] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val method = classOf[PolynomialExpansion].getMethod("createTransformFunc") val newData = column.data.map(r => { val row = r.asInstanceOf[List[Any]].map(_.toString.toDouble).toArray val vector: Vector = Vectors.dense(row) method.invoke(sparkTransformer).asInstanceOf[Vector => Vector](vector).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPolynomialExpansion extends SimpleModelLoader[PolynomialExpansion] with TypedTransformerConverter[PolynomialExpansion] { override def build(metadata: Metadata, data: LocalData): PolynomialExpansion = { new PolynomialExpansion(metadata.uid) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setDegree(metadata.paramMap("degree").asInstanceOf[Number].intValue()) } override implicit def toLocal( transformer: PolynomialExpansion ) = new LocalPolynomialExpansion(transformer) }
Example 101
Source File: LocalMaxAbsScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel) extends LocalTransformer[MaxAbsScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val maxAbsUnzero = Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x)) val newData = column.data.map(r => { val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d") } val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray) DataUtils.fromBreeze(brz).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMaxAbsScalerModel extends SimpleModelLoader[MaxAbsScalerModel] with TypedTransformerConverter[MaxAbsScalerModel] { override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = { val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]] val maxAbs = DataUtils.constructVector(maxAbsParams) val constructor = classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector]) constructor.setAccessible(true) constructor .newInstance(metadata.uid, maxAbs) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: MaxAbsScalerModel ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer) }
Example 102
Source File: LocalStandardScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.StandardScalerModel import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.feature.{StandardScalerModel => OldStandardScalerModel} import org.apache.spark.mllib.linalg.{Vectors => OldVectors} class LocalStandardScalerModel(override val sparkTransformer: StandardScalerModel) extends LocalTransformer[StandardScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val scaler = new OldStandardScalerModel( OldVectors.fromML(sparkTransformer.std), OldVectors.fromML(sparkTransformer.mean), sparkTransformer.getWithStd, sparkTransformer.getWithMean ) val newData = column.data.mapToMlLibVectors.map(scaler.transform(_).toList) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalStandardScalerModel extends SimpleModelLoader[StandardScalerModel] with TypedTransformerConverter[StandardScalerModel] { override def build(metadata: Metadata, data: LocalData): StandardScalerModel = { val constructor = classOf[StandardScalerModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Vector] ) constructor.setAccessible(true) val stdParams = data.column("std").get.data.head.asInstanceOf[Map[String, Any]] val std = DataUtils.constructVector(stdParams) val meanParams = data.column("mean").get.data.head.asInstanceOf[Map[String, Any]] val mean = DataUtils.constructVector(meanParams) constructor .newInstance(metadata.uid, std, mean) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: StandardScalerModel ) = new LocalStandardScalerModel(transformer) }
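For contrast, a minimal sketch of the Spark-side StandardScaler whose fitted model the local transformer above replays through the old mllib scaler; names are illustrative:

import org.apache.spark.ml.feature.StandardScaler

val scaler = new StandardScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
  .setWithMean(false)                  // centering densifies sparse vectors, so it defaults to off
  .setWithStd(true)

val model = scaler.fit(df)             // df is an assumed DataFrame with a Vector "features" column
println(s"${model.mean} / ${model.std}") // the two Vectors the loader above rebuilds from the "mean"/"std" columns
val scaled = model.transform(df)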
Example 103
Source File: LocalMinMaxScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MinMaxScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector} class LocalMinMaxScalerModel(override val sparkTransformer: MinMaxScalerModel) extends LocalTransformer[MinMaxScalerModel] { override def transform(localData: LocalData): LocalData = { val originalRange = (DataUtils.asBreeze(sparkTransformer.originalMax.toArray) - DataUtils.asBreeze( sparkTransformer.originalMin.toArray )).toArray val minArray = sparkTransformer.originalMin.toArray val min = sparkTransformer.getMin val max = sparkTransformer.getMax localData.column(sparkTransformer.getInputCol) match { case Some(column) => val newData = column.data.map(r => { val scale = max - min val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMinMaxScaler: $d") } val values = vec.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values.update(i, raw * scale + min) } i += 1 } values.toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMinMaxScalerModel extends SimpleModelLoader[MinMaxScalerModel] with TypedTransformerConverter[MinMaxScalerModel] { override def build(metadata: Metadata, data: LocalData): MinMaxScalerModel = { val originalMinList = data .column("originalMin") .get .data .head .asInstanceOf[Map[String, Any]] val originalMin = DataUtils.constructVector(originalMinList) val originalMaxList = data .column("originalMax") .get .data .head .asInstanceOf[Map[String, Any]] val originalMax = DataUtils.constructVector(originalMaxList) val constructor = classOf[MinMaxScalerModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance(metadata.uid, originalMin, originalMax) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setMin(metadata.paramMap("min").toString.toDouble) .setMax(metadata.paramMap("max").toString.toDouble) } override implicit def toLocal( transformer: MinMaxScalerModel ) = new LocalMinMaxScalerModel(transformer) }
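A minimal sketch of the Spark-side MinMaxScaler whose rescaling formula the transform above reimplements row by row; names are illustrative:

import org.apache.spark.ml.feature.MinMaxScaler

val scaler = new MinMaxScaler()
  .setInputCol("features")
  .setOutputCol("scaled")
  .setMin(0.0)
  .setMax(1.0)

val model = scaler.fit(df)       // records originalMin / originalMax per dimension, as read back in build()
val scaled = model.transform(df)
// per value: (x - originalMin) / (originalMax - originalMin) * (max - min) + min,
// with the 0.5 midpoint used when a dimension has zero range, matching the loop above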
Example 104
Source File: LocalMultilayerPerceptronClassificationModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel import org.apache.spark.ml.linalg.{Vector, Vectors} class LocalMultilayerPerceptronClassificationModel( override val sparkTransformer: MultilayerPerceptronClassificationModel ) extends LocalPredictionModel[MultilayerPerceptronClassificationModel] {} object LocalMultilayerPerceptronClassificationModel extends SimpleModelLoader[MultilayerPerceptronClassificationModel] with TypedTransformerConverter[MultilayerPerceptronClassificationModel] { override def build( metadata: Metadata, data: LocalData ): MultilayerPerceptronClassificationModel = { val layers = data.column("layers").get.data.head.asInstanceOf[Seq[Int]].toArray val weightsParam = data.column("weights").get.data.head.asInstanceOf[Map[String, Any]] val weights = DataUtils.constructVector(weightsParam) val constructor = classOf[MultilayerPerceptronClassificationModel].getDeclaredConstructor( classOf[String], classOf[Array[Int]], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance( metadata.uid, layers, weights ) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) } override implicit def toLocal( sparkTransformer: MultilayerPerceptronClassificationModel ): LocalMultilayerPerceptronClassificationModel = { new LocalMultilayerPerceptronClassificationModel(sparkTransformer) } }
Example 105
Source File: LocalNaiveBayes.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.classification import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.classification.LocalProbabilisticClassificationModel import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.classification.NaiveBayesModel import org.apache.spark.ml.linalg.{Matrix, Vector, Vectors} class LocalNaiveBayes(override val sparkTransformer: NaiveBayesModel) extends LocalProbabilisticClassificationModel[NaiveBayesModel] {} object LocalNaiveBayes extends SimpleModelLoader[NaiveBayesModel] with TypedTransformerConverter[NaiveBayesModel] { override def build(metadata: Metadata, data: LocalData): NaiveBayesModel = { val constructor = classOf[NaiveBayesModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Matrix] ) constructor.setAccessible(true) val matrixMetadata = data.column("theta").get.data.head.asInstanceOf[Map[String, Any]] val matrix = DataUtils.constructMatrix(matrixMetadata) val piParams = data.column("pi").get.data.head.asInstanceOf[Map[String, Any]] val piVec = DataUtils.constructVector(piParams) val nb = constructor .newInstance(metadata.uid, piVec, matrix) .setFeaturesCol(metadata.paramMap("featuresCol").asInstanceOf[String]) .setPredictionCol(metadata.paramMap("predictionCol").asInstanceOf[String]) .setProbabilityCol(metadata.paramMap("probabilityCol").asInstanceOf[String]) .setRawPredictionCol(metadata.paramMap("rawPredictionCol").asInstanceOf[String]) nb.set(nb.smoothing, metadata.paramMap("smoothing").asInstanceOf[Number].doubleValue()) nb.set(nb.modelType, metadata.paramMap("modelType").asInstanceOf[String]) nb.set(nb.labelCol, metadata.paramMap("labelCol").asInstanceOf[String]) nb } override implicit def toLocal(sparkTransformer: NaiveBayesModel): LocalNaiveBayes = { new LocalNaiveBayes(sparkTransformer) } }
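For context, a minimal sketch of training the Spark-side NaiveBayesModel that the loader above reconstructs through its private constructor; column names are illustrative:

import org.apache.spark.ml.classification.NaiveBayes

val nb = new NaiveBayes()
  .setModelType("multinomial")   // the "modelType" param restored in build()
  .setSmoothing(1.0)             // the "smoothing" param restored in build()

val model = nb.fit(trainDF)       // trainDF is an assumed DataFrame("label", "features")
println(model.pi)                 // log class priors, the "pi" column read above
println(model.theta)              // log conditional probabilities, the "theta" column read above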
Example 106
Source File: LocalPredictionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common import org.apache.spark.ml.PredictionModel import org.apache.spark.ml.linalg.Vector import scala.reflect.ClassTag abstract class LocalPredictionModel[T <: PredictionModel[Vector, T]] extends LocalTransformer[T] { def predict(v: List[Double]): Double = invoke[Double]('predict, v) override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => val predictionCol = LocalDataColumn( sparkTransformer.getPredictionCol, column.data.map(_.asInstanceOf[List[Double]]).map(predict) ) localData.withColumn(predictionCol) case None => localData } } }
Example 107
Source File: LocalProbabilisticClassificationModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.classification import io.hydrosphere.spark_ml_serving.common.{LocalData, LocalDataColumn} import org.apache.spark.ml.classification.ProbabilisticClassificationModel import org.apache.spark.ml.linalg.Vector import scala.reflect.ClassTag abstract class LocalProbabilisticClassificationModel[T <: ProbabilisticClassificationModel[ Vector, T ]] extends LocalClassificationModel[T] { def raw2probabilityInPlace(vector: List[Double]): List[Double] = invokeVec('raw2probabilityInPlace, vector) def raw2probability(vector: List[Double]): List[Double] = raw2probabilityInPlace(vector) def raw2prediction(vector: List[Double]): Double = invoke[Double]('raw2prediction, vector) def probability2prediction(vector: List[Double]): Double = invoke[Double]('probability2prediction, vector) override def transform(localData: LocalData) = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => sparkTransformer .get(sparkTransformer.thresholds) .foreach(t => require(t.length == sparkTransformer.numClasses)) var result = localData val rawCol = sparkTransformer.get(sparkTransformer.rawPredictionCol).map { name => val res = LocalDataColumn( name, column.data.map(_.asInstanceOf[List[Double]]).map(predictRaw) ) result = result.withColumn(res) res } val probCol = sparkTransformer.get(sparkTransformer.probabilityCol).map { name => val data = rawCol match { case Some(raw) => raw.data.map(_.asInstanceOf[List[Double]]).map(raw2probability) case None => column.data.map(_.asInstanceOf[List[Double]]).map(predictRaw) } val res = LocalDataColumn(name, data) result = result.withColumn(res) res } sparkTransformer.get(sparkTransformer.predictionCol).map { name => val data = rawCol match { case Some(raw) => raw.data.map(raw2prediction) case None => probCol match { case Some(prob) => prob.data.map(probability2prediction) case None => column.data.map(_.asInstanceOf[List[Double]]).map(predict) } } val res = LocalDataColumn(name, data) result = result.withColumn(res) res } result case None => localData } } }
Example 108
Source File: LocalClassificationModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.common.classification import io.hydrosphere.spark_ml_serving.common.{LocalData, LocalDataColumn, LocalPredictionModel} import org.apache.spark.ml.classification.ClassificationModel import org.apache.spark.ml.linalg.Vector abstract class LocalClassificationModel[T <: ClassificationModel[Vector, T]] extends LocalPredictionModel[T] { def predictRaw(v: List[Double]): List[Double] = invokeVec('predictRaw, v) override def transform(localData: LocalData) = { localData.column(sparkTransformer.getFeaturesCol) match { case Some(column) => var result = localData sparkTransformer.get(sparkTransformer.rawPredictionCol).foreach { name => val res = LocalDataColumn( name, column.data.map(_.asInstanceOf[List[Double]]).map(predictRaw) ) result = result.withColumn(res) } sparkTransformer.get(sparkTransformer.predictionCol).foreach { name => val res = LocalDataColumn(name, column.data.map(_.asInstanceOf[List[Double]]).map(predict)) result = result.withColumn(res) } result case None => localData } } }
Example 109
Source File: LocalLinearRegressionModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.regression import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.regression.LinearRegressionModel class LocalLinearRegressionModel(override val sparkTransformer: LinearRegressionModel) extends LocalPredictionModel[LinearRegressionModel] {} object LocalLinearRegressionModel extends SimpleModelLoader[LinearRegressionModel] with TypedTransformerConverter[LinearRegressionModel] { override def build(metadata: Metadata, data: LocalData): LinearRegressionModel = { val intercept = data.column("intercept").get.data.head.asInstanceOf[java.lang.Double] val coeffitientsMap = data.column("coefficients").get.data.head.asInstanceOf[Map[String, Any]] val coeffitients = DataUtils.constructVector(coeffitientsMap) val ctor = classOf[LinearRegressionModel].getConstructor( classOf[String], classOf[Vector], classOf[Double] ) val inst = ctor.newInstance(metadata.uid, coeffitients, intercept) inst .set(inst.featuresCol, metadata.paramMap("featuresCol").asInstanceOf[String]) .set(inst.predictionCol, metadata.paramMap("predictionCol").asInstanceOf[String]) .set(inst.labelCol, metadata.paramMap("labelCol").asInstanceOf[String]) .set(inst.elasticNetParam, metadata.paramMap("elasticNetParam").toString.toDouble) .set(inst.maxIter, metadata.paramMap("maxIter").asInstanceOf[Number].intValue()) .set(inst.regParam, metadata.paramMap("regParam").toString.toDouble) .set(inst.solver, metadata.paramMap("solver").asInstanceOf[String]) .set(inst.tol, metadata.paramMap("tol").toString.toDouble) .set(inst.standardization, metadata.paramMap("standardization").asInstanceOf[Boolean]) .set(inst.fitIntercept, metadata.paramMap("fitIntercept").asInstanceOf[Boolean]) } override implicit def toLocal( transformer: LinearRegressionModel ) = new LocalLinearRegressionModel(transformer) }
Example 110
Source File: GenericTestSpec.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving import io.hydrosphere.spark_ml_serving.common.LocalData import org.apache.spark.SparkConf import org.apache.spark.ml.linalg.{Matrix, Vector} import org.apache.spark.mllib.linalg.{Matrix => OldMatrix, Vector => OldVector} import org.apache.spark.ml.{Pipeline, PipelineStage} import org.apache.spark.sql.{DataFrame, SparkSession} import org.scalatest.{BeforeAndAfterAll, FunSpec} trait GenericTestSpec extends FunSpec with BeforeAndAfterAll { val conf = new SparkConf() .setMaster("local[2]") .setAppName("test") .set("spark.ui.enabled", "false") val session: SparkSession = SparkSession.builder().config(conf).getOrCreate() def modelPath(modelName: String): String = s"./target/test_models/${session.version}/$modelName" def test( name: String, data: => DataFrame, steps: => Seq[PipelineStage], columns: => Seq[String], accuracy: Double = 0.01 ) = { val path = modelPath(name.toLowerCase()) var validation = LocalData.empty var localPipelineModel = Option.empty[LocalPipelineModel] it("should train") { val pipeline = new Pipeline().setStages(steps.toArray) val pipelineModel = pipeline.fit(data) validation = LocalData.fromDataFrame(pipelineModel.transform(data)) pipelineModel.write.overwrite().save(path) } it("should load local version") { localPipelineModel = Some(LocalPipelineModel.load(path)) assert(localPipelineModel.isDefined) } it("should transform LocalData") { val localData = LocalData.fromDataFrame(data) val model = localPipelineModel.get val result = model.transform(localData) columns.foreach { col => val resCol = result .column(col) .getOrElse(throw new IllegalArgumentException("Result column is absent")) val valCol = validation .column(col) .getOrElse(throw new IllegalArgumentException("Validation column is absent")) resCol.data.zip(valCol.data).foreach { case (r: Seq[Number @unchecked], v: Seq[Number @unchecked]) if r.head.isInstanceOf[Number] && r.head.isInstanceOf[Number] => r.zip(v).foreach { case (ri, vi) => assert(ri.doubleValue() - vi.doubleValue() <= accuracy, s"$ri - $vi > $accuracy") } case (r: Number, v: Number) => assert(r.doubleValue() - v.doubleValue() <= accuracy, s"$r - $v > $accuracy") case (r, n) => assert(r === n) } result.column(col).foreach { resData => resData.data.foreach { resRow => if (resRow.isInstanceOf[Seq[_]]) { assert(resRow.isInstanceOf[List[_]], resRow) } else if (resRow.isInstanceOf[Vector] || resRow.isInstanceOf[OldVector] || resRow .isInstanceOf[Matrix] || resRow.isInstanceOf[OldMatrix]) { assert(false, s"SparkML type detected. Column: $col, value: $resRow") } } } } } } def modelTest( data: => DataFrame, steps: => Seq[PipelineStage], columns: => Seq[String], accuracy: Double = 0.01 ): Unit = { lazy val name = steps.map(_.getClass.getSimpleName).foldLeft("") { case ("", b) => b case (a, b) => a + "-" + b } describe(name) { test(name, data, steps, columns, accuracy) } } }
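A hedged sketch of how a concrete spec might reuse the trait above; the stage, file path and output column are illustrative assumptions and depend on the project's test resources:

import org.apache.spark.ml.feature.Normalizer

class NormalizerSpec extends GenericTestSpec {
  modelTest(
    data = session.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt"),
    steps = Seq(new Normalizer().setInputCol("features").setOutputCol("normed")),
    columns = Seq("normed")
  )
}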
Example 111
Source File: FeatureCrossOp.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.cross import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import scala.collection.mutable.ArrayBuffer object FeatureCrossOp { def flatCartesian(vector: Vector): Vector = { val curDim = vector.size vector match { case sv: SparseVector => val indices = new ArrayBuffer[Int]() val values = new ArrayBuffer[Double]() sv.indices.foreach { idx1 => sv.indices.foreach { idx2 => indices += curDim * idx1 + idx2 values += sv(idx1) * sv(idx2) } } val sorted = indices.zip(values).sortBy(_._1) val sortedIndices = sorted.map(_._1) val sortedValues = sorted.map(_._2) new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray) case dv: DenseVector => val values: Array[Double] = new Array(dv.size * dv.size) (0 until dv.size).foreach { idx1 => (0 until dv.size).foreach { idx2 => values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2) } } new DenseVector(values) } } def main(args: Array[String]): Unit = { val v = new DenseVector(Array(1, 2, 3)) val cv = flatCartesian(v) println(cv.toDense.values.mkString(",")) } }
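As a quick illustration of what flatCartesian produces (a minimal sketch, not part of the original file): crossing a sparse vector of size 3 with itself yields a vector of size 9 whose non-zero entries sit at curDim * idx1 + idx2 for every pair of non-zero indices, with the pairwise products as values.

import org.apache.spark.ml.linalg.Vectors

val sv = Vectors.sparse(3, Array(0, 2), Array(1.0, 2.0))
val crossed = FeatureCrossOp.flatCartesian(sv)
// Non-zeros at 0*3+0, 0*3+2, 2*3+0 and 2*3+2.
println(crossed) // (9,[0,2,6,8],[1.0,2.0,2.0,4.0])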
Example 112
Source File: Sampler.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.preprocess

import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.{Param, ParamMap}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.ml.{Pipeline, Transformer}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}

import scala.util.Random

class Sampler(fraction: Double,
              override val uid: String,
              seed: Int = Random.nextInt)
  extends Transformer {

  def this(fraction: Double) = this(fraction, Identifiable.randomUID("sampler"))

  // Column parameter referenced by getOutputCol below and by setInputCol in main.
  final val inputCol: Param[String] = new Param[String](this, "inputCol", "input column name")

  final def getInputCol: String = $(inputCol)

  def setInputCol(value: String): this.type = set(inputCol, value)

  // The sampler passes rows through unchanged, so the output column is the input column.
  final def getOutputCol: String = $(inputCol)

  override def transform(dataset: Dataset[_]): DataFrame = {
    dataset.sample(false, fraction, seed).toDF
  }

  override def transformSchema(schema: StructType): StructType = {
    schema
  }

  override def copy(extra: ParamMap): Sampler = defaultCopy(extra)
}

object Sampler {

  def main(args: Array[String]): Unit = {
    val ss = SparkSession
      .builder
      .master("local")
      .appName("preprocess")
      .getOrCreate()

    val training = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    println(training.count)

    val sampler = new Sampler(0.5)
      .setInputCol("features")

    val pipeline = new Pipeline()
      .setStages(Array(sampler))

    val model = pipeline.fit(training)

    val test = ss.read.format("libsvm")
      .load("/Users/jiangjiawei/dev-tools/spark-2.2.0/data/mllib/sample_libsvm_data.txt")

    model.transform(test).select("*")
      .collect()
      .foreach { case Row(label: Double, vector: Vector) =>
        println(s"($label, " +
          s"${vector.toSparse.indices.mkString("[", ",", "]")}, " +
          s"${vector.toSparse.values.mkString("[", ",", "]")})")
      }

    ss.stop()
  }
}
Example 113
Source File: FeatureUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.{Dataset, Row} import scala.language.postfixOps object FeatureUtils { def maxDim(dataset: Dataset[Row], col: String = "features"): Int = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val dim = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.last case dv: DenseVector => dv.size } }.max Iterator(dim) }.max + 1 } def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val mergeIndices = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.toList } }.reduce(_ union _ distinct) Iterator(mergeIndices) }.reduce((a, b) => (a union b).distinct).toArray } }
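A minimal usage sketch, assuming an active SparkSession named spark: maxDim returns the largest observed index plus one, and countNonZero collects the distinct indices that carry a non-zero value anywhere in the column. Both helpers reduce over partitions, so the sketch keeps the data in a single non-empty partition.

import org.apache.spark.ml.linalg.Vectors

import spark.implicits._

val df = Seq(
  Tuple1(Vectors.sparse(6, Array(1, 3), Array(1.0, 2.0))),
  Tuple1(Vectors.sparse(6, Array(3, 5), Array(4.0, 5.0)))
).toDF("features").coalesce(1) // avoid empty partitions in the per-partition reduce

FeatureUtils.maxDim(df)        // 6: last index seen is 5, plus one
FeatureUtils.countNonZero(df)  // the distinct non-zero indices, here 1, 3 and 5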
Example 114
Source File: DataUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.utils import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} object DataUtils { def parse(ss: SparkSession, schema: StructType, X: Array[Vector], Y: Array[Double]): DataFrame = { require(X.size == Y.size, "The size of configurations should be equal to the size of rewards.") ss.createDataFrame( Y.zip(X)).toDF("label", "features") } def parse(ss: SparkSession, schema: StructType, X: Vector): DataFrame = { parse(ss, schema, Array(X), Array(0)) } def toBreeze(values: Array[Double]): BDV[Double] = { new BDV[Double](values) } def toBreeze(vector: Vector): BDV[Double] = vector match { case sv: SparseVector => new BDV[Double](vector.toDense.values) case dv: DenseVector => new BDV[Double](dv.values) } def toBreeze(X: Array[Vector]): BDM[Double] = { val mat = BDM.zeros[Double](X.size, X(0).size) for (i <- 0 until X.size) { for (j <- 0 until X(0).size) { mat(i, j) = X(i)(j) } } mat } }
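For example (a small sketch), the toBreeze overloads convert a single vector or an array of configuration vectors into the Breeze structures used elsewhere in the tuner:

import org.apache.spark.ml.linalg.Vectors

val bv = DataUtils.toBreeze(Vectors.sparse(3, Array(1), Array(5.0)))                   // DenseVector(0.0, 5.0, 0.0)
val bm = DataUtils.toBreeze(Array(Vectors.dense(1.0, 2.0), Vectors.dense(3.0, 4.0)))   // 2x2 breeze DenseMatrix, one row per vector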
Example 115
Source File: Surrogate.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.tuner.surrogate import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace import org.apache.commons.logging.{Log, LogFactory} import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.types.{DataTypes, StructField, StructType} import scala.collection.mutable.ArrayBuffer def predict(X: Vector): (Double, Double) def stop(): Unit def curBest: (Vector, Double) = { if (minimize) curMin else curMax } def curMin: (Vector, Double) = { if (preY.isEmpty) (null, Double.MaxValue) else { val maxIdx: Int = preY.zipWithIndex.max._2 (preX(maxIdx), -preY(maxIdx)) } } def curMax: (Vector, Double) = { if (preY.isEmpty) (null, Double.MinValue) else { val maxIdx: Int = preY.zipWithIndex.max._2 (preX(maxIdx), preY(maxIdx)) } } }
Example 116
package com.tencent.angel.spark.automl.tuner.acquisition import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate import org.apache.commons.logging.{Log, LogFactory} import org.apache.spark.ml.linalg.{Vector, Vectors} class UCB( override val surrogate: Surrogate, val beta: Double = 100) extends Acquisition(surrogate) { val LOG: Log = LogFactory.getLog(classOf[Surrogate]) override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = { val pred = surrogate.predict(X) // (mean, variance) val m: Double = pred._1 val s: Double = Math.sqrt(pred._2) if (s == 0) { // if std is zero, we have observed x on all instances // using a RF, std should be never exactly 0.0 (0.0, Vectors.dense(new Array[Double](X.size))) } else { val ucb = m + beta * s (ucb, Vectors.dense(new Array[Double](X.size))) } } }
Example 117
package com.tencent.angel.spark.automl.tuner.acquisition import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate import org.apache.commons.logging.{Log, LogFactory} import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.ml.linalg.{Vector, Vectors} class EI( override val surrogate: Surrogate, val par: Double) extends Acquisition(surrogate) { val LOG: Log = LogFactory.getLog(classOf[Surrogate]) override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = { val pred = surrogate.predict(X) // (mean, variance) // Use the best seen observation as incumbent val eta: Double = surrogate.curBest._2 //println(s"best seen result: $eta") val m: Double = pred._1 val s: Double = Math.sqrt(pred._2) //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]") if (s == 0) { // if std is zero, we have observed x on all instances // using a RF, std should be never exactly 0.0 (0.0, Vectors.dense(new Array[Double](X.size))) } else { val z = (pred._1 - eta - par) / s val norm: NormalDistribution = new NormalDistribution val cdf: Double = norm.cumulativeProbability(z) val pdf: Double = norm.density(z) val ei = s * (z * cdf + pdf) //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf") (ei, Vectors.dense(new Array[Double](X.size))) } } }
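Stripped of the surrogate plumbing, the score this class computes is the standard expected-improvement formula. A standalone sketch with illustrative numbers (m and v stand for the surrogate's predicted mean and variance, eta for the best value seen so far):

import org.apache.commons.math3.distribution.NormalDistribution

def expectedImprovement(m: Double, v: Double, eta: Double, par: Double = 0.0): Double = {
  val s = math.sqrt(v)
  if (s == 0.0) 0.0
  else {
    val z = (m - eta - par) / s
    val norm = new NormalDistribution() // standard normal
    s * (z * norm.cumulativeProbability(z) + norm.density(z))
  }
}

expectedImprovement(m = 1.2, v = 0.04, eta = 1.0) // ≈ 0.217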
Example 118
Source File: SolverWithTrail.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.tuner.solver import com.tencent.angel.spark.automl.tuner.config.Configuration import com.tencent.angel.spark.automl.tuner.trail.Trail import org.apache.spark.ml.linalg.Vector class SolverWithTrail(val solver: Solver, val trail: Trail) { def run(numIter: Int, X: Array[Configuration] = null, Y: Array[Double] = null): (Vector, Double) = { if (X != null && Y != null && X.size == Y.size) solver.feed(X, Y) (0 until numIter).foreach { iter => println(s"------iteration $iter starts------") val configs: Array[Configuration] = solver.suggest() val results: Array[Double] = trail.evaluate(configs) solver.feed(configs, results) } solver.surrogate.curBest } }
Example 119
Source File: TunerTest.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl import com.tencent.angel.spark.automl.tuner.config.Configuration import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace import com.tencent.angel.spark.automl.tuner.solver.Solver import com.tencent.angel.spark.automl.tuner.trail.{TestTrail, Trail} import org.apache.spark.ml.linalg.Vector import org.scalatest.FunSuite class TunerTest extends FunSuite { test("test_random") { val param1 = ParamSpace.fromConfigString("param1", "{2.0,3.0,4.0,5.0,6.0}") val param2 = ParamSpace.fromConfigString("param2", "{3:10:1}") val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Random") val trail: Trail = new TestTrail() (0 until 10).foreach { iter => println(s"------iteration $iter starts------") val configs: Array[Configuration] = solver.suggest() val results: Array[Double] = trail.evaluate(configs) solver.feed(configs, results) } val result: (Vector, Double) = solver.optimal solver.stop println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") } test("test_grid") { val param1 = ParamSpace.fromConfigString("param1", "[1,10]") val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") val solver: Solver = Solver(Array(param1, param2), true, surrogate = "Grid") val trail: Trail = new TestTrail() (0 until 10).foreach { iter => println(s"------iteration $iter starts------") val configs: Array[Configuration] = solver.suggest() val results: Array[Double] = trail.evaluate(configs) solver.feed(configs, results) } val result: (Vector, Double) = solver.optimal solver.stop println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") } test("test_gp") { val param1 = ParamSpace.fromConfigString("param1", "[1,10]") val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") val param3 = ParamSpace.fromConfigString("param3", "{0.0,1.0,3.0,5.0}") val param4 = ParamSpace.fromConfigString("param4", "{-5:5:1}") val solver: Solver = Solver(Array(param1, param2, param3, param4), true, surrogate = "GaussianProcess") val trail: Trail = new TestTrail() (0 until 10).foreach { iter => println(s"------iteration $iter starts------") val configs: Array[Configuration] = solver.suggest val results: Array[Double] = trail.evaluate(configs) solver.feed(configs, results) } val result: (Vector, Double) = solver.optimal solver.stop println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") } test("test_rf") { val param1 = ParamSpace.fromConfigString("param1", "[1,10]") val param2 = ParamSpace.fromConfigString("param2", "[-5:5:10]") val param3 = ParamSpace.fromConfigString("param3", "{0.0,1.0,3.0,5.0}") val param4 = ParamSpace.fromConfigString("param4", "{-5:5:1}") val solver: Solver = Solver(Array(param1, param2, param3, param4), true, "RandomForest") val trail: Trail = new TestTrail() (0 until 10).foreach { iter => println(s"------iteration $iter starts------") val configs: Array[Configuration] = solver.suggest val results: Array[Double] = trail.evaluate(configs) solver.feed(configs, results) } val result: (Vector, Double) = solver.optimal solver.stop println(s"Best configuration ${result._1.toArray.mkString(",")}, best performance: ${result._2}") } }
Example 120
Source File: VSoftmaxRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.feature.Instance import org.apache.spark.ml.linalg.{SparseMatrix, Vector, Vectors} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.functions._ import org.apache.spark.sql.{DataFrame, Dataset} import scala.language.existentials class VSoftmaxRegressionSuite extends SparkFunSuite with MLlibTestSparkContext { import testImplicits._ private val seed = 42 @transient var multinomialDataset: Dataset[_] = _ private val eps: Double = 1e-5 override def beforeAll(): Unit = { super.beforeAll() multinomialDataset = { val nPoints = 50 val coefficients = Array( -0.57997, 0.912083, -0.371077, -0.819866, 2.688191, -0.16624, -0.84355, -0.048509, -0.301789, 4.170682) val xMean = Array(5.843, 3.057, 3.758, 1.199) val xVariance = Array(0.6856, 0.1899, 3.116, 0.581) val testData = LogisticRegressionSuite.generateMultinomialLogisticInput( coefficients, xMean, xVariance, addIntercept = true, nPoints, seed) val df = sc.parallelize(testData, 4).toDF().withColumn("weight", rand(seed)) df.cache() println("softmax test data:") df.show(10, false) df } } test("test on multinomialDataset") { def b2s(b: Boolean): String = { if (b) "w/" else "w/o" } for (standardization <- Seq(false, true)) { for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (0.3, 0.05), (0.01, 1.0))) { println() println(s"# test ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}") val trainer = new LogisticRegression() .setFamily("multinomial") .setStandardization(standardization) .setWeightCol("weight") .setRegParam(reg) .setFitIntercept(false) .setElasticNetParam(elasticNet) val model = trainer.fit(multinomialDataset) val vtrainer = new VSoftmaxRegression() .setColsPerBlock(2) .setRowsPerBlock(5) .setColPartitions(2) .setRowPartitions(3) .setWeightCol("weight") .setGeneratingFeatureMatrixBuffer(2) .setStandardization(standardization) .setRegParam(reg) .setElasticNetParam(elasticNet) val vmodel = vtrainer.fit(multinomialDataset) println(s"VSoftmaxRegression coefficientMatrix:\n" + s"${vmodel.coefficientMatrix.asInstanceOf[SparseMatrix].toDense},\n" + s"ml.SoftmaxRegression coefficientMatrix:\n" + s"${model.coefficientMatrix}\n") assert(vmodel.coefficientMatrix ~== model.coefficientMatrix relTol eps) } } } }
Example 121
Source File: VLinearRegressionSuite.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression

import scala.language.existentials

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.DataFrame

class VLinearRegressionSuite extends SparkFunSuite with MLlibTestSparkContext {

  import testImplicits._

  var datasetWithWeight: DataFrame = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    datasetWithWeight = sc.parallelize(Seq(
      Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse),
      Instance(19.0, 2.0, Vectors.dense(1.0, 7.0)),
      Instance(23.0, 3.0, Vectors.dense(2.0, 11.0)),
      Instance(29.0, 4.0, Vectors.dense(3.0, 13.0))
    ), 2).toDF()
  }

  test("test on datasetWithWeight") {
    def b2s(b: Boolean): String = {
      if (b) "w/" else "w/o"
    }

    for (fitIntercept <- Seq(false, true)) {
      for (standardization <- Seq(false, true)) {
        for ((reg, elasticNet) <- Seq((0.0, 0.0), (2.3, 0.0), (2.3, 0.5))) {
          println()
          println(s"# test ${b2s(fitIntercept)} intercept, ${b2s(standardization)} standardization, reg=${reg}, elasticNet=${elasticNet}")

          val vtrainer = new VLinearRegression()
            .setColsPerBlock(1)
            .setRowsPerBlock(1)
            .setGeneratingFeatureMatrixBuffer(2)
            .setFitIntercept(fitIntercept)
            .setStandardization(standardization)
            .setRegParam(reg)
            .setWeightCol("weight")
            .setElasticNetParam(elasticNet)
          val vmodel = vtrainer.fit(datasetWithWeight)

          // Note that in ml.LinearRegression, when the number of instances is small,
          // the l-bfgs solver and the normal solver produce slightly different results
          // when reg is non-zero, because their std calculations differ by a factor of
          // numInstances / (numInstances - 1). This test stays consistent with the l-bfgs solver.
          val trainer = new LinearRegression()
            .setSolver("l-bfgs") // by default it may pick the normal solver, so force l-bfgs here.
            .setFitIntercept(fitIntercept)
            .setStandardization(standardization)
            .setRegParam(reg)
            .setWeightCol("weight")
            .setElasticNetParam(elasticNet)
          val model = trainer.fit(datasetWithWeight)

          logInfo(s"LinearRegression total iterations: ${model.summary.totalIterations}")
          println(s"VLinearRegression coefficients: ${vmodel.coefficients.toDense}, intercept: ${vmodel.intercept}\n" +
            s"LinearRegression coefficients: ${model.coefficients.toDense}, intercept: ${model.intercept}")

          def filterSmallValue(v: Vector) = {
            Vectors.dense(v.toArray.map(x => if (math.abs(x) < 1e-6) 0.0 else x))
          }
          assert(filterSmallValue(vmodel.coefficients) ~== filterSmallValue(model.coefficients) relTol 1e-3)
          assert(vmodel.intercept ~== model.intercept relTol 1e-3)
        }
      }
    }
  }
}
Example 122
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.{SparkSession, DataFrame} import org.apache.spark.sql.types.{ StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType } import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.PropSpec import com.holdenkarau.spark.testing.{ SharedSparkContext, DataframeGenerator, Column } abstract class FeaturePropSpec extends PropSpec with SharedSparkContext with DefaultReadWriteTest { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector] )) lazy val spark = SparkSession.builder().getOrCreate() def schema = StructType( List( StructField("integer", IntegerType), StructField("double", DoubleType), StructField("boolean", BooleanType), StructField("string", StringType) )) def integerGen = new Column("integer", Gen.choose(-100, 100)) def doubleGen = new Column("double", Gen.choose(-100.0, 100.0)) def stringGen = new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO")) def dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields( spark.sqlContext, schema)(integerGen, doubleGen, stringGen) def hasDistinctValues(df: DataFrame, columns: String*): Boolean = { columns.foldLeft(true) { (acc, col) => acc && df.select(col).distinct.count() > 1 } } }
Example 123
Source File: ReebDiagramTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vectors, EuclideanDistance, Vector} import org.apache.spark.sql.functions.{col, explode, udf} import org.scalatest.{PropSpec, Matchers, GivenWhenThen} import org.scalatest.prop.GeneratorDrivenPropertyChecks class ReebDiagramTest extends FeaturePropSpec with GivenWhenThen with GeneratorDrivenPropertyChecks with Matchers { val assembler = new VectorAssembler() .setInputCols(Array("double", "integer")) .setOutputCol("vector") val cover = new Cover() .setExploding(true) .setInputCols("double", "integer") .setOutputCol("cover_id") property("argument topTreeSize must be positive") { intercept[IllegalArgumentException] { val reeb = new ReebDiagram() // .setIdCol("id") // .setCoverCol("cover_id") // .setFeaturesCol("vector") // .setOutputCol("cluster_id") .setTopTreeSize(0) } } property("placeholder") { val reeb = new ReebDiagram() .setK(15) .setIdCol("id") .setCoverCol("cover_id") .setFeaturesCol("vector") .setOutputCol("cluster_id") forAll(dataframeGen.arbitrary) { df => val assembled = assembler.transform(df) whenever( assembled.count() > 0 && hasDistinctValues(assembled, "double", "integer")) { val transformed = cover .fit(assembled) .transform(assembled) val result = reeb .setTopTreeSize(1) .fit(transformed) .transform(transformed) // result.show() } } } }
Example 124
Source File: PartitionersTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class PartitionersTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTreesPartitioner can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val partitioner = new TopTreesPartitioner(TopTrees(IndexedSeq.empty[(Int, Tree)])) val vector = VectorEntry(0L, v) intercept[NoSuchElementException] { partitioner.getPartition((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val partitioner = new TopTreesPartitioner(TopTrees(indexedTrees)) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => partitioner.getPartition((index, entry)) } .toSet indices should contain theSameElementsAs (0 until partitioner.numPartitions) .toSet (0 until partitioner.numPartitions).toSet should contain theSameElementsAs indices intercept[IllegalArgumentException] { partitioner.getPartition(0) } } } }
Example 125
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import scala.reflect.ClassTag import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Gen.{choose, oneOf} import org.scalatest.PropSpec import org.apache.spark.ml.linalg.{ CosineDistance, EuclideanDistance, ManhattanDistance, JaccardDistance, HammingDistance } import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors} import com.holdenkarau.spark.testing.SharedSparkContext abstract class KNNPropSpec extends PropSpec with SharedSparkContext { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitrarySparseVector: Arbitrary[SparseVector] = Arbitrary { for (vec <- arbitrary[DenseVector]) yield vec.toSparse } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector], 1 -> arbitrary[SparseVector] )) private def arraysOfNM[T: ClassTag](numRows: Int, numCols: Int, gen: Gen[T]): Gen[Array[Array[T]]] = Gen.listOfN(numRows * numCols, gen).map { square => square.toArray.grouped(numCols).toArray } private def vectorsOfNM(numRows: Int, numCols: Int, gen: Gen[Double]): Gen[Array[DenseVector]] = for { arrays <- arraysOfNM(numRows, numCols, gen) } yield arrays.map(arr => new DenseVector(arr)) val treeGen = for { measure <- oneOf(CosineDistance, EuclideanDistance, ManhattanDistance, HammingDistance, JaccardDistance) numVectors <- choose(1, 100) vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0)) } yield vectors .scanLeft(Seq[Vector]())(_ :+ _) .tail .map( vs => VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10)) }
Example 126
Source File: IndicesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class IndicesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("TopTrees can be constructed with empty data") { forAll { (v: Vector, coverId: Int) => val topTrees = TopTrees(IndexedSeq.empty[(Int, Tree)]) val vector = VectorEntry(0L, v) topTrees.get((coverId, vector)) shouldBe None topTrees.isDefinedAt((coverId, vector)) shouldBe false intercept[NoSuchElementException] { topTrees((coverId, vector)) } } } property( "TopTrees can be constructed with non empty data and maintain its consistency") { forAll(treeGen) { case (trees) => val indexedTrees = trees.zipWithIndex.map { case (t, i) => (i, t) } val topTrees = TopTrees(indexedTrees) val indices = indexedTrees .flatMap { case (index, tree) => tree.iterator.map(d => (index, d)) } .map { case (index, entry) => topTrees((index, entry)) } .toSet indices should contain theSameElementsAs (0 until topTrees.numIndices) .toSet (0 until topTrees.numIndices).toSet should contain theSameElementsAs indices } } }
Example 127
Source File: TreesTest.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn import org.scalacheck.Prop.forAllNoShrink import org.scalatest.Matchers import org.scalatest.prop.GeneratorDrivenPropertyChecks import org.apache.spark.ml.linalg.{Vector, Vectors, EuclideanDistance} class TreesTest extends KNNPropSpec with GeneratorDrivenPropertyChecks with Matchers { property("VPTree can be constructed with empty data") { forAll { (v: Vector) => val tree = VPTree(IndexedSeq.empty[VectorWithId], EuclideanDistance, 0, 0) val vector = VectorEntry(0L, v) tree.iterator shouldBe empty tree.query(vector) shouldBe empty tree.numLeaves shouldBe 0 } } property("VPTree can be constructed with data not having any duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (-5 to 5).flatMap { i => (-5 to 5).map { j => VectorEntry(0L, Vectors.dense(i, j)) } } List(1, data.size / 2, data.size, data.size * 2).foreach { leafSize => val tree = VPTree(data, EuclideanDistance, 1, 1, leafSize) tree.size shouldBe data.size tree.iterator.toIterable should contain theSameElementsAs data data.foreach(v => tree.query(v, 1).head._1 shouldBe v) tree .query(origin, 5) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, 0), Vectors.dense(1, 0), Vectors.dense(0, -1), Vectors.dense(0, 1), Vectors.dense(0, 0) ) tree .query(origin, 9) .map(_._1.vector) should contain theSameElementsAs Set( Vectors.dense(-1, -1), Vectors.dense(-1, 0), Vectors.dense(-1, 1), Vectors.dense(0, -1), Vectors.dense(0, 0), Vectors.dense(0, 1), Vectors.dense(1, -1), Vectors.dense(1, 0), Vectors.dense(1, 1) ) tree.numLeaves shouldBe (tree.cardinality / leafSize.toDouble).ceil } } property("VPTree can be constructed with data having duplication") { val origin = VectorEntry(0L, Vectors.dense(0, 0)) val data = (Vectors.dense(2.0, 0.0) +: Array.fill(5)(Vectors.dense(0.0, 1.0))) .map(VectorEntry(0L, _)) val tree = VPTree(data, EuclideanDistance, 6, 6) val knn = tree.query(origin, 5) tree.numLeaves shouldBe 2 knn.size shouldBe 5 knn.map(_._1.vector).toSet should contain theSameElementsAs Array( Vectors.dense(0.0, 1.0)) } }
Example 128
Source File: LeapFrameBuilderSupport.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.runtime.javadsl import ml.combust.mleap.core.types.{BasicType, StructType} import ml.combust.mleap.core.util.VectorConverters import ml.combust.mleap.runtime.frame.ArrayRow import scala.collection.JavaConverters._ import scala.collection.mutable import org.apache.spark.ml.linalg.Vector import ml.combust.mleap.json.JsonSupport._ import spray.json._ class LeapFrameBuilderSupport { def createRowFromIterable(iterable: java.lang.Iterable[Any]): ArrayRow = { val values = iterable.asScala.map { case s: java.util.List[_] => s.asScala case vec: Vector => VectorConverters.sparkVectorToMleapTensor(vec) case v => v }.toArray new ArrayRow(mutable.WrappedArray.make[Any](values)) } def createBoolean(): BasicType = BasicType.Boolean def createByte(): BasicType = BasicType.Byte def createShort(): BasicType = BasicType.Short def createInt(): BasicType = BasicType.Int def createLong(): BasicType = BasicType.Long def createFloat(): BasicType = BasicType.Float def createDouble(): BasicType = BasicType.Double def createString(): BasicType = BasicType.String def createByteString(): BasicType = BasicType.ByteString def createTensorDimensions(dims : java.util.List[Integer]): Option[Seq[Int]] = { Some(dims.asScala.map(_.intValue())) } def createSchema(json: String): StructType = json.parseJson.convertTo[StructType] }
Example 129
Source File: XgbConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.xgboost.runtime import biz.k11i.xgboost.util.FVec import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import ml.combust.mleap.xgboost.runtime.struct.FVecFactory import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} trait XgbConverters { implicit class VectorOps(vector: Vector) { def asXGB: DMatrix = { vector match { case SparseVector(_, indices, values) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat)))) case DenseVector(values) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat)))) } } def asXGBPredictor: FVec = { vector match { case sparseVector: SparseVector => FVecFactory.fromSparseVector(sparseVector) case denseVector: DenseVector => FVecFactory.fromDenseVector(denseVector) } } } implicit class DoubleTensorOps(tensor: Tensor[Double]) { def asXGB: DMatrix = { tensor match { case SparseTensor(indices, values, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat)))) case DenseTensor(_, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat)))) } } def asXGBPredictor: FVec = { tensor match { case sparseTensor: SparseTensor[Double] => FVecFactory.fromSparseTensor(sparseTensor) case denseTensor: DenseTensor[Double] => FVecFactory.fromDenseTensor(denseTensor) } } } } object XgbConverters extends XgbConverters
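A usage sketch, assuming the XGBoost native library is available at runtime: importing the object brings the implicit conversions into scope, so a Spark vector can be handed directly to either booster path.

import ml.combust.mleap.xgboost.runtime.XgbConverters._
import org.apache.spark.ml.linalg.Vectors

val features = Vectors.sparse(4, Array(0, 2), Array(1.0, 3.0))

val dmatrix = features.asXGB        // DMatrix for the JVM xgboost4j API
val fvec    = features.asXGBPredictor // FVec for the pure-Java predictor path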
Example 130
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
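For instance (a minimal sketch), slicing a four-element vector down to indices 0 and 2:

import org.apache.spark.ml.linalg.Vectors

val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 4)
slicer(Vectors.dense(1.0, 2.0, 3.0, 4.0)) // [1.0, 3.0]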
Example 131
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
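For example (a minimal sketch), with a scaling vector of (2.0, 0.5, 1.0) the model multiplies element-wise, keeping sparse inputs sparse:

import org.apache.spark.ml.linalg.Vectors

val model = ElementwiseProductModel(Vectors.dense(2.0, 0.5, 1.0))
model(Vectors.dense(1.0, 4.0, 3.0))            // [2.0, 2.0, 3.0]
model(Vectors.sparse(3, Array(1), Array(4.0))) // (3,[1],[2.0])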
Example 132
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
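A worked sketch: with maxAbs = (4.0, 0.0, 2.0), the zero entry is replaced by 1 to avoid division by zero, and every rescaled value is clipped to [-1, 1].

import org.apache.spark.ml.linalg.Vectors

val scaler = MaxAbsScalerModel(Vectors.dense(4.0, 0.0, 2.0))
scaler(Vectors.dense(2.0, 1.0, -3.0)) // [0.5, 1.0, -1.0]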
Example 133
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala") case class ChiSqSelectorModel(filterIndices: Seq[Int], inputSize: Int) extends Model { def apply(features: Vector): Vector = { features match { case SparseVector(size, indices, values) => val newSize = filterIndices.length val newValues = mutable.ArrayBuilder.make[Double] val newIndices = mutable.ArrayBuilder.make[Int] var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 while (i < indices.length && j < filterIndices.length) { indicesIdx = indices(i) filterIndicesIdx = filterIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) j += 1 i += 1 } else { if (indicesIdx > filterIndicesIdx) { j += 1 } else { i += 1 } } } // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size) Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(values) => val values = features.toArray Vectors.dense(filterIndices.map(i => values(i)).toArray) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get }
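A small sketch of the selection: only the values at filterIndices survive, re-indexed into the new, smaller space.

import org.apache.spark.ml.linalg.Vectors

val selector = ChiSqSelectorModel(filterIndices = Seq(1, 3), inputSize = 5)
selector(Vectors.sparse(5, Array(1, 2, 3), Array(10.0, 20.0, 30.0))) // (2,[0,1],[10.0,30.0])
selector(Vectors.dense(1.0, 2.0, 3.0, 4.0, 5.0))                     // [2.0, 4.0]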
Example 134
Source File: FeatureHasherModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.core.util.Platform import ml.combust.mleap.core.util.Murmur3_x86_32.{hashInt, hashLong, hashUnsafeBytes2} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable object FeatureHasherModel { val seed = HashingTermFrequencyModel.seed def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes2(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("FeatureHasher with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/feature/FeatureHasher.scala") case class FeatureHasherModel(numFeatures: Int = 1 << 18, categoricalCols: Seq[String], inputNames: Seq[String], inputTypes: Seq[DataType] ) extends Model { assert(inputTypes.forall(dt ⇒ dt.shape.isScalar), "must provide scalar shapes as inputs") val schema = inputNames.zip(inputTypes) val realFields = schema.filter(t ⇒ t._2.base match { case BasicType.Short if !categoricalCols.contains(t._1) ⇒ true case BasicType.Double if !categoricalCols.contains(t._1) ⇒ true case BasicType.Float if !categoricalCols.contains(t._1) ⇒ true case BasicType.Int if !categoricalCols.contains(t._1) ⇒ true case BasicType.Long if !categoricalCols.contains(t._1) ⇒ true case _ ⇒ false }).toMap.keys.toSeq def getDouble(x: Any): Double = { x match { case n: java.lang.Number ⇒ n.doubleValue() // will throw ClassCastException if it cannot be cast, as would row.getDouble case other ⇒ other.asInstanceOf[Double] } } def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } def apply(things: Seq[Any]): Vector = { val map = new mutable.OpenHashMap[Int, Double]() schema.zip(things).foreach { case (sc, item) ⇒ if (item != null) { val (rawIdx, value) = if (realFields.contains(sc._1)) { // numeric values are kept as is, with vector index based on hash of "column_name" val value = getDouble(item) val hash = FeatureHasherModel.murmur3(sc._1) (hash, value) } else { // string, boolean and numeric values that are in catCols are treated as categorical, // with an indicator value of 1.0 and vector index based on hash of "column_name=value" val value = item.toString val fieldName = s"${sc._1}=$value" val hash = FeatureHasherModel.murmur3(fieldName) (hash, 1.0) } val idx = nonNegativeMod(rawIdx, numFeatures) map.+=((idx, map.getOrElse(idx, 0.0) + value)) } } Vectors.sparse(numFeatures, map.toSeq) } override def inputSchema: StructType = { val inputFields = inputTypes.zipWithIndex.map { case (dtype, i) => StructField(s"input$i", dtype) } StructType(inputFields).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
Example 135
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.mleap.VectorUtil._ import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} def apply(vector: Vector): Vector = { val scale = maxValue - minValue // 0 in sparse vector will probably be rescaled to non-zero val values = vector.copy.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values(i) = raw * scale + minValue } i += 1 } Vectors.dense(values) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get }
Example 136
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
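A minimal sketch with a two-word vocabulary and two-dimensional word vectors: the default kernel averages the looked-up vectors over the sentence length, and words missing from the index are skipped during lookup.

import org.apache.spark.ml.linalg.Vectors

val model = WordToVectorModel(
  wordIndex = Map("hello" -> 0, "world" -> 1),
  wordVectors = Array(1.0, 2.0, 3.0, 4.0) // "hello" -> [1,2], "world" -> [3,4]
)

model(Seq("hello", "world")) // [2.0, 3.0]
model(Seq.empty[String])     // empty sparse vector of size 2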
Example 137
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(features: Vector): Vector = { val norm = Vectors.norm(features, pNorm) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. features match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. features } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
Example 138
Source File: HashingTermFrequencyModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import ml.combust.mleap.core.util.Murmur3_x86_32._ import ml.combust.mleap.core.util.Platform import scala.collection.mutable object HashingTermFrequencyModel { val seed = 42 def murmur3(term: Any): Int = { term match { case null => seed case b: Boolean => hashInt(if (b) 1 else 0, seed) case b: Byte => hashInt(b, seed) case s: Short => hashInt(s, seed) case i: Int => hashInt(i, seed) case l: Long => hashLong(l, seed) case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed) case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed) case s: String => val utf8 = s.getBytes("UTF-8") hashUnsafeBytes(utf8, Platform.BYTE_ARRAY_OFFSET, utf8.length, seed) case _ => throw new IllegalStateException("HashingTF with murmur3 algorithm does not " + s"support type ${term.getClass.getCanonicalName} of input data.") } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/core/src/main/scala/org/apache/spark/util/Utils.scala") def nonNegativeMod(x: Int, mod: Int): Int = { val rawMod = x % mod rawMod + (if (rawMod < 0) mod else 0) } override def inputSchema: StructType = { StructType(StructField("input" -> ListType(BasicType.String))).get } override def outputSchema: StructType = { StructType(StructField("output" -> TensorType.Double(numFeatures))).get } }
Example 139
Source File: VectorIndexerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.util.NoSuchElementException import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala") case class VectorIndexerModel(numFeatures: Int, categoryMaps: Map[Int, Map[Double, Int]], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model { val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted val localVectorMap = categoryMaps val localNumFeatures = numFeatures val localHandleInvalid = handleInvalid def apply(features: Vector): Vector = predict(features) def predict(features: Vector): Vector = { assert(features.size == localNumFeatures, "VectorIndexerModel expected vector of length" + s" $numFeatures but found length ${features.size}") features match { case dv: DenseVector => var hasInvalid = false val tmpv = dv.copy localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) => try { tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv(featureIndex)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(featureIndex) = categoryMap.size case HandleInvalid.Skip => hasInvalid = true } } } if (hasInvalid) null else tmpv case sv: SparseVector => // We use the fact that categorical value 0 is always mapped to index 0. var hasInvalid = false val tmpv = sv.copy var catFeatureIdx = 0 // index into sortedCatFeatureIndices var k = 0 // index into non-zero elements of sparse vector while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) { val featureIndex = sortedCatFeatureIndices(catFeatureIdx) if (featureIndex < tmpv.indices(k)) { catFeatureIdx += 1 } else if (featureIndex > tmpv.indices(k)) { k += 1 } else { try { tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv.values(k)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(k) = localVectorMap(featureIndex).size case HandleInvalid.Skip => hasInvalid = true } } catFeatureIdx += 1 k += 1 } } if (hasInvalid) null else tmpv } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(localNumFeatures)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(localNumFeatures)).get }
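For example (a minimal sketch), with a single categorical feature at index 0 whose raw categories are 0.0 and 5.0, the raw value is replaced by its category index and the continuous features pass through unchanged:

import org.apache.spark.ml.linalg.Vectors

val indexer = VectorIndexerModel(
  numFeatures = 3,
  categoryMaps = Map(0 -> Map(0.0 -> 0, 5.0 -> 1))
)

indexer(Vectors.dense(5.0, 1.2, -3.0)) // [1.0, 1.2, -3.0]: category 5.0 becomes index 1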
Example 140
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(vector: Vector): Vector = { if (mean.nonEmpty) { val shift = mean.get.toArray val values = vector match { // specially handle DenseVector because its toArray does not clone already case d: DenseVector => d.values.clone() case v: SparseVector => v.toArray } val size = values.length if (std.nonEmpty) { val stdDev = std.get var i = 0 while (i < size) { values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0 i += 1 } } else { var i = 0 while (i < size) { values(i) -= shift(i) i += 1 } } Vectors.dense(values) } else if (std.nonEmpty) { val stdDev = std.get vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while(i < size) { values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0) i += 1 } Vectors.dense(values) case SparseVector(size, indices, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0) i += 1 } Vectors.sparse(size, indices, values) } } else { throw new IllegalStateException("need to scale with mean and/or with stdev") } } override def inputSchema: StructType = { StructType("input" -> TensorType.Double(size)).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get }
Example 141
Source File: CountVectorizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/CountVectorizer.scala") case class CountVectorizerModel(vocabulary: Array[String], binary: Boolean, minTf: Double) extends Model { val dict: Map[String, Int] = vocabulary.zipWithIndex.toMap def apply(document: Seq[String]): Vector = { val termCounts = mutable.Map[Int, Double]() var tokenCount = 0L document.foreach { term => dict.get(term) match { case Some(index) => termCounts += (index -> termCounts.get(index).map(_ + 1).getOrElse(1)) case None => // ignore terms not found in dictionary } tokenCount += 1 } val effectiveMinTF = if (minTf >= 1.0) minTf else tokenCount * minTf val effectiveCounts = if(binary) { termCounts.filter(_._2 >= effectiveMinTF).map(p => (p._1, 1.0)).toSeq } else { termCounts.filter(_._2 >= effectiveMinTF).toSeq } Vectors.sparse(dict.size, effectiveCounts) } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(dict.size)).get }
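A small sketch with a three-word vocabulary: tokens outside the vocabulary are ignored, and minTf filters out low-count terms.

val model = CountVectorizerModel(vocabulary = Array("a", "b", "c"), binary = false, minTf = 1.0)
model(Seq("a", "b", "a", "d")) // (3,[0,1],[2.0,1.0]): "a" twice, "b" once, "d" not in the vocabulary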
Example 142
Source File: OneHotEncoderModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} def apply(labels: Array[Double]): Array[Vector] = { if (labels.length != categorySizes.length) { throw new IllegalArgumentException(s"invalid input size: ${labels.length}, must be ${categorySizes.length}") } labels.zipWithIndex.map { case (label: Double, colIdx: Int) ⇒ encoder(label, colIdx) } } private def encoder(label: Double, colIdx: Int): Vector = { val labelInt = label.toInt if(label != labelInt) { throw new IllegalArgumentException(s"invalid label: $label, must be integer") } val origCategorySize = categorySizes(colIdx) val idx = if (label >= 0 && label < origCategorySize) { label } else { if (keepInvalid) { origCategorySize } else { if (label < 0) { throw new IllegalArgumentException(s"Negative value: $label. Input can't be negative. To handle invalid values, set Param handleInvalid to ${HandleInvalid.Keep}") } else { throw new IllegalArgumentException(s"Unseen value: $label. To handle unseen values, set Param handleInvalid to ${HandleInvalid.Keep}") } } } val size = configedCategorySizes(colIdx) if (idx < size) { Vectors.sparse(size, Array(idx.toInt), oneValue) } else { Vectors.sparse(size, emptyIndices, emptyValues) } } override def inputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (_, i) => StructField(s"input$i", ScalarType.Double.setNullable(false)) } StructType(f).get } override def outputSchema: StructType = { val f = categorySizes.zipWithIndex.map { case (size, i) => StructField(s"output$i", TensorType.Double(size)) } StructType(f).get } }
Example 143
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
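For example (a minimal sketch), applying per-dimension IDF weights of (1.0, 2.0, 0.5):

import org.apache.spark.ml.linalg.Vectors

val idfModel = IDFModel(Vectors.dense(1.0, 2.0, 0.5))
idfModel(Vectors.sparse(3, Array(0, 2), Array(3.0, 4.0))) // (3,[0,2],[3.0,2.0])
idfModel(Vectors.dense(1.0, 1.0, 1.0))                    // [1.0, 2.0, 0.5]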
Example 144
Source File: MinHashLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.{Vector, Vectors} object MinHashLSHModel { val HASH_PRIME = 2038074743 } case class MinHashLSHModel(randomCoefficients: Seq[(Int, Int)], inputSize: Int) extends LSHModel{ def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { require(features.numNonzeros > 0, "Must have at least 1 non zero entry.") val elemsList = features.toSparse.indices.toList val hashValues = randomCoefficients.map { case (a, b) => elemsList.map { elem: Int => ((1 + elem) * a + b) % MinHashLSHModel.HASH_PRIME }.min.toDouble } // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { val xSet = x.toSparse.indices.toSet val ySet = y.toSparse.indices.toSet val intersectionSize = xSet.intersect(ySet).size.toDouble val unionSize = xSet.size + ySet.size - intersectionSize assert(unionSize > 0, "The union of two input sets must have at least 1 elements") 1 - intersectionSize / unionSize } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. // TODO: This hashDistance function requires more discussion in SPARK-18454 x.zip(y).map(vectorPair => vectorPair._1.toArray.zip(vectorPair._2.toArray).count(pair => pair._1 != pair._2) ).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
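A worked sketch with two hash functions: each coefficient pair (a, b) maps every non-zero index elem to ((1 + elem) * a + b) % HASH_PRIME and keeps the minimum, while keyDistance is one minus the Jaccard similarity of the index sets.

import org.apache.spark.ml.linalg.Vectors

val lsh = MinHashLSHModel(randomCoefficients = Seq((1, 0), (2, 1)), inputSize = 10)

val x = Vectors.sparse(10, Array(0, 3), Array(1.0, 1.0))
lsh.predict(x) // DenseTensor([1.0, 3.0], shape 2 x 1): min of {1, 4} and min of {3, 9}

val y = Vectors.sparse(10, Array(3, 7), Array(1.0, 1.0))
lsh.keyDistance(x, y) // 1 - 1/3 ≈ 0.667: one shared index out of three distinct indices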
Example 145
Source File: VectorAssemblerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.{DenseTensor, SparseTensor} import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def apply(vv: Seq[Any]): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] var cur = 0 vv.foreach { case v: Double => if (v != 0.0) { indices += cur values += v } cur += 1 case tensor: DenseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[DenseTensor[Double]] dTensor.values.indices.foreach { i => val v = dTensor.values(i) if(v != 0.0) { indices += cur + i values += v } } cur += dTensor.values.length case tensor: SparseTensor[_] if tensor.dimensions.size == 1 => val dTensor = tensor.asInstanceOf[SparseTensor[Double]] var idx = 0 dTensor.indices.map(_.head).foreach { i => val v = dTensor.values(idx) if(v != 0.0) { indices += cur + i values += v } idx += 1 } cur += dTensor.dimensions.head case vec: Vector => vec.foreachActive { case (i, v) => if (v != 0.0) { indices += cur + i values += v } } cur += vec.size case v: java.math.BigDecimal => val d = v.doubleValue() if (d != 0.0) { indices += cur values += d } cur += 1 case Some(v: Double) => if(v != 0.0) { indices += cur values += v } cur += 1 } Vectors.sparse(cur, indices.result(), values.result()).compressed } override def inputSchema: StructType = { val inputFields = inputShapes.zipWithIndex.map { case (shape, i) => StructField(s"input$i", DataType(BasicType.Double, shape)) } StructType(inputFields).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(outputSize)).get }
Example 146
Source File: BinarizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/feature/Binarizer.scala") case class BinarizerModel(threshold: Double, inputShape: DataShape) extends Model { assert(inputShape.isScalar || inputShape.isTensor, "Must provide a tensor or scalar shape") def apply(value: Double): Double = { if (value > threshold) 1.0 else 0.0 } def apply(value: Vector): Vector = { val indices = mutable.ArrayBuilder.make[Int] val values = mutable.ArrayBuilder.make[Double] value.foreachActive { (index, value) => if (value > threshold) { indices += index values += 1.0 } } Vectors.sparse(value.size, indices.result(), values.result()).compressed } override def inputSchema: StructType = { StructType("input" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } override def outputSchema: StructType = { StructType("output" -> DataType(BasicType.Double, inputShape).setNullable(!inputShape.isScalar)).get } }
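A small sketch of the BinarizerModel above. ScalarShape() is assumed here to be an available scalar DataShape in ml.combust.mleap.core.types; the threshold and inputs are arbitrary.

import ml.combust.mleap.core.feature.BinarizerModel
import ml.combust.mleap.core.types.ScalarShape
import org.apache.spark.ml.linalg.Vectors

object BinarizerUsage extends App {
  // assumption: ScalarShape() produces a valid scalar DataShape
  val binarizer = BinarizerModel(threshold = 0.5, inputShape = ScalarShape())

  println(binarizer(0.7))  // 1.0, since 0.7 > threshold
  println(binarizer(0.5))  // 0.0, the comparison is strict
  // the Vector overload binarizes each active element and returns a compressed vector
  println(binarizer(Vectors.dense(0.1, 0.6, 0.9)))
}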
Example 147
Source File: InteractionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import ml.combust.mleap.tensor.Tensor import ml.combust.mleap.core.util.VectorConverters._ import org.apache.spark.ml.linalg.{Vector, Vectors} import scala.collection.mutable def foreachNonzeroOutput(v: Any, f: (Int, Double) => Unit): Unit = { val value = v match { case tensor: Tensor[_] => tensor.asInstanceOf[Tensor[Double]]: Vector case _ => v } value match { case d: Double => assert(numFeatures.length == 1, "DoubleType columns should only contain one feature.") val numOutputCols = numFeatures.head if (numOutputCols > 1) { assert( d >= 0.0 && d == d.toInt && d < numOutputCols, s"Values from column must be indices, but got $d.") f(d.toInt, 1.0) } else { f(0, d) } case vec: Vector => assert(numFeatures.length == vec.size, s"Vector column size was ${vec.size}, expected ${numFeatures.length}") vec.foreachActive { (i, v) => val numOutputCols = numFeatures(i) if (numOutputCols > 1) { assert( v >= 0.0 && v == v.toInt && v < numOutputCols, s"Values from column must be indices, but got $v.") f(outputOffsets(i) + v.toInt, 1.0) } else { f(outputOffsets(i), v) } } case null => throw new IllegalArgumentException("Values to interact cannot be null.") case o => throw new IllegalArgumentException(s"$o of type ${o.getClass.getName} is not supported.") } } }
Example 148
Source File: DCTModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import ml.combust.mleap.core.Model import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} case class DCTModel(inverse: Boolean, inputSize: Int) extends Model { def apply(features: Vector): Vector = { val result = features.toArray.clone() val jTransformer = new DoubleDCT_1D(result.length) if (inverse) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
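Because DCTModel only wraps JTransforms, it can be sanity-checked directly. A minimal sketch using only the constructor shown above; the input values are arbitrary.

import ml.combust.mleap.core.feature.DCTModel
import org.apache.spark.ml.linalg.Vectors

object DCTUsage extends App {
  val forward = DCTModel(inverse = false, inputSize = 4)
  val inverse = DCTModel(inverse = true, inputSize = 4)

  val input = Vectors.dense(1.0, 2.0, 3.0, 4.0)
  val coefficients = forward(input)

  println(coefficients)          // scaled DCT-II coefficients of the input
  println(inverse(coefficients)) // approximately recovers the original values
}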
Example 149
Source File: BucketedRandomProjectionLSHModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.types.{StructType, TensorType} import ml.combust.mleap.tensor.{DenseTensor, Tensor} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{Vector, Vectors} case class BucketedRandomProjectionLSHModel(randomUnitVectors: Seq[Vector], bucketLength: Double, inputSize: Int) extends LSHModel { def apply(features: Vector): Tensor[Double] = predict(features) def predict(features: Vector): Tensor[Double] = { val hashValues: Seq[Double] = randomUnitVectors.map({ randUnitVector => Math.floor(BLAS.dot(features, randUnitVector) / bucketLength) }) // TODO: Output vectors of dimension numHashFunctions in SPARK-18450 DenseTensor(hashValues.toArray, Seq(hashValues.length, 1)) } override def keyDistance(x: Vector, y: Vector): Double = { Math.sqrt(Vectors.sqdist(x, y)) } override def hashDistance(x: Seq[Vector], y: Seq[Vector]): Double = { // Since it's generated by hashing, it will be a pair of dense vectors. x.zip(y).map(vectorPair => Vectors.sqdist(vectorPair._1, vectorPair._2)).min } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize, 1)).get }
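A usage sketch for the BucketedRandomProjectionLSHModel above; the "random" unit vectors are hard-coded here so the bucket indices are predictable.

import ml.combust.mleap.core.feature.BucketedRandomProjectionLSHModel
import org.apache.spark.ml.linalg.Vectors

object BucketedRandomProjectionUsage extends App {
  val unitVectors = Seq(Vectors.dense(1.0, 0.0), Vectors.dense(0.0, 1.0))
  val model = BucketedRandomProjectionLSHModel(unitVectors, bucketLength = 2.0, inputSize = 2)

  val features = Vectors.dense(3.0, 5.0)
  println(model.predict(features))                              // buckets floor(3/2)=1 and floor(5/2)=2
  println(model.keyDistance(Vectors.dense(0.0, 0.0), features)) // Euclidean distance, sqrt(34)
}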
Example 150
Source File: PolynomialFeaturesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.sklearn import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} case class PolynomialFeaturesModel(combinations: String) extends Model { private val pattern = "x(\\d+)(?:[\\^](\\d+))?".r private val polynomials = extractPolynomials(combinations) private val indices = polynomials.flatMap(poly => poly.terms).map(term => term.index).toSet private def extractPolynomials(combinations: String): List[Polynomial] = { combinations.split(",") .map(combination => extractPolynomial(combination)) .toList } private def extractPolynomial(polynomial: String): Polynomial = { Polynomial(pattern.findAllIn(polynomial).matchData .map(matcher => {Term(matcher.group(1).toInt, Option(matcher.group(2)).getOrElse("1").toInt)}) .toList ) } def getPolyValue(poly: Polynomial, features: Vector): Double = { poly.terms.map(term => scala.math.pow(features(term.index), term.power)).product } def apply(features: Vector): Vector = { Vectors.dense(polynomials.map(poly => getPolyValue(poly, features)).toArray) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(indices.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(polynomials.size)).get } case class Term(index: Int, power: Int) case class Polynomial(terms: List[Term])
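The combinations string is the only state the model above needs, so the sklearn-style expansion can be sketched directly; the terms below follow the x&lt;index&gt;[^power] syntax matched by the regex in the snippet, and the feature values are arbitrary.

import ml.combust.mleap.core.sklearn.PolynomialFeaturesModel
import org.apache.spark.ml.linalg.Vectors

object PolynomialFeaturesUsage extends App {
  // degree-2 expansion of two features; "x0 x1" is the interaction term
  val model = PolynomialFeaturesModel("x0,x1,x0^2,x0 x1,x1^2")

  println(model(Vectors.dense(2.0, 3.0)))  // [2.0,3.0,4.0,6.0,9.0]
}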
Example 151
Source File: KMeansModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.linalg.LinalgUtils import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.VectorWithNorm import org.apache.spark.ml.linalg.Vector object KMeansModel { def apply(clusterCenters: Seq[Vector], numFeatures: Int): KMeansModel = { KMeansModel(clusterCenters.map(VectorWithNorm.apply).toArray, numFeatures) } } case class KMeansModel(clusterCenters: Array[VectorWithNorm], numFeatures: Int) extends Model { def clusterCount: Int = clusterCenters.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { findClosest(VectorWithNorm(features))._1 } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala") private def findClosest(point: VectorWithNorm): (Int, Double) = { var bestDistance = Double.PositiveInfinity var bestIndex = 0 var i = 0 clusterCenters.foreach { center => // Since `\|a - b\| \geq |\|a\| - \|b\||`, we can use this lower bound to avoid unnecessary // distance computation. var lowerBoundOfSqDist = center.norm - point.norm lowerBoundOfSqDist = lowerBoundOfSqDist * lowerBoundOfSqDist if (lowerBoundOfSqDist < bestDistance) { val distance: Double = LinalgUtils.fastSquaredDistance(center, point) if (distance < bestDistance) { bestDistance = distance bestIndex = i } } i += 1 } (bestIndex, bestDistance) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable).get }
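A minimal prediction sketch for the KMeansModel above, built through the companion apply that wraps the centers in VectorWithNorm; the centers themselves are arbitrary.

import ml.combust.mleap.core.clustering.KMeansModel
import org.apache.spark.ml.linalg.Vectors

object KMeansUsage extends App {
  val centers = Seq(Vectors.dense(0.0, 0.0), Vectors.dense(10.0, 10.0))
  val model = KMeansModel(centers, numFeatures = 2)

  println(model.predict(Vectors.dense(1.0, 2.0)))  // 0, closest to the first center
  println(model(Vectors.dense(9.0, 8.0)))          // 1, apply delegates to predict
}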
Example 152
Source File: GaussianMixtureModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.Utils._ import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian object GaussianMixtureModel { @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala") def computeProbabilities(features: DenseVector, dists: Array[MultivariateGaussian], weights: Array[Double]): Array[Double] = { val p = weights.zip(dists).map { case (weight, dist) => EPSILON + weight * dist.pdf(features) } val pSum = p.sum var i = 0 while (i < weights.length) { p(i) /= pSum i += 1 } p } } case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian], weights: Array[Double]) extends Model { val numClusters = gaussians.length val numFeatures: Int = weights.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { predictionFromProbability(predictProbability(features)) } def predictWithProbability(features: Vector): (Int, Double) = { val probability = predictProbability(features) val index = probability.argmax (index, probability(index)) } def predictionFromProbability(probabilities: Vector): Int = { probabilities.argmax } def predictProbability(features: Vector): Vector = { val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights) Vectors.dense(probs) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable, "probability" -> TensorType.Double(numClusters)).get }
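A scoring sketch for the GaussianMixtureModel above. It assumes org.apache.spark.ml.stat.distribution.MultivariateGaussian can be constructed directly from a mean vector and a covariance matrix; the two one-dimensional components are invented for illustration.

import ml.combust.mleap.core.clustering.GaussianMixtureModel
import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian

object GaussianMixtureUsage extends App {
  // two 1-d components with unit variance, centred at 0 and 5
  val gaussians = Array(
    new MultivariateGaussian(Vectors.dense(0.0), Matrices.dense(1, 1, Array(1.0))),
    new MultivariateGaussian(Vectors.dense(5.0), Matrices.dense(1, 1, Array(1.0))))
  val model = GaussianMixtureModel(gaussians, weights = Array(0.5, 0.5))

  val features = Vectors.dense(4.5)
  println(model.predict(features))             // 1, the component centred at 5.0
  println(model.predictProbability(features))  // posterior responsibilities of the two components
}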
Example 153
Source File: Node.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/tree/Node.scala") final case class InternalNode(left: Node, right: Node, split: Split) extends Node { override def predictImpl(features: Vector): LeafNode = { if(split.shouldGoLeft(features)) { left.predictImpl(features) } else { right.predictImpl(features) } } }
Example 154
Source File: Split.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.tree import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.Vector @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/tree/Split.scala") final case class ContinuousSplit(featureIndex: Int, threshold: Double) extends Split { override def shouldGoLeft(features: Vector): Boolean = features(featureIndex) <= threshold override def shouldGoLeft(binnedFeature: Int, splits: Array[Split]): Boolean = { if(binnedFeature == splits.length) { false } else { val featureUpperBound = splits(binnedFeature).asInstanceOf[ContinuousSplit].threshold featureUpperBound <= threshold } } }
Example 155
Source File: OneVsRestModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} def predictAll(features: Vector): (Double, Vector, Double) = { val predArray = Array.fill[Double](classifiers.length)(0.0) val (prediction, probability) = classifiers.zipWithIndex.map { case (c:ProbabilisticClassificationModel, i) => val raw = c.predictRaw(features) predArray(i) = raw(1) val probability = c.rawToProbabilityInPlace(raw)(1) (i.toDouble, probability) case (c,i) => val raw = c.predict(features) predArray(i) = raw (i.toDouble,raw) }.maxBy(_._2) (probability, Vectors.dense(predArray), prediction) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("probability" -> ScalarType.Double, "raw_prediction" -> TensorType.Double(classifiers.length), "prediction" -> ScalarType.Double).get }
Example 156
Source File: ClassificationModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} val numClasses: Int val numFeatures: Int def thresholds: Option[Array[Double]] = None def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features)) def predictWithProbability(features: Vector): (Double, Double) = { val probabilities = predictProbabilities(features) val index = probabilityToPredictionIndex(probabilities) (index.toDouble, probabilities(index)) } def predictProbabilities(features: Vector): Vector = { val raw = predictRaw(features) rawToProbabilityInPlace(raw) raw } def rawToProbability(raw: Vector): Vector = { val probabilities = raw.copy rawToProbabilityInPlace(probabilities) } def rawToPrediction(raw: Vector): Double = { thresholds match { case Some(t) => probabilityToPrediction(rawToProbability(raw)) case None => raw.argmax } } def probabilityToPrediction(probability: Vector): Double = { probabilityToPredictionIndex(probability).toDouble } def probabilityToPredictionIndex(probability: Vector): Int = { thresholds match { case Some(ts) => val scaledProbability: Array[Double] = probability.toArray.zip(ts).map { case (p, t) => if (t == 0.0) Double.PositiveInfinity else p / t } Vectors.dense(scaledProbability).argmax case None => probability.argmax } } def rawToProbabilityInPlace(raw: Vector): Vector override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "probability" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
Example 157
Source File: DecisionTreeClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.{DecisionTree, Node} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} case class DecisionTreeClassifierModel(override val rootNode: Node, numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with DecisionTree with Serializable { override def predictRaw(features: Vector): Vector = { rootNode.predictImpl(features).impurities } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in DecisionTreeClassifierModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 158
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.regression.DecisionTreeRegressionModel import ml.combust.mleap.core.tree.TreeEnsemble import ml.combust.mleap.core.tree.loss.LogLoss import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def margin(features: Vector): Double = { val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray) BLAS.dot(treePredictions, treeWeightsVector) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => dv.values(0) = loss.computeProbability(dv.values(0)) dv.values(1) = 1.0 - dv.values(0) dv case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector") } } override def predictRaw(features: Vector): Vector = { val prediction: Double = margin(features) Vectors.dense(Array(-prediction, prediction)) } }
Example 159
Source File: MultiLayerPerceptronClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.ann.FeedForwardTopology import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.feature.LabeledPoint import org.apache.spark.ml.linalg.{Vector, Vectors} def decodeLabel(output: Vector): Double = { output.argmax.toDouble } } } @SparkCode(uri = "https://github.com/apache/spark/blob/v2.3.0/mllib/src/main/scala/org/apache/spark/ml/classification/MultilayerPerceptronClassifier.scala") case class MultiLayerPerceptronClassifierModel(layers: Seq[Int], weights: Vector, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel { val numFeatures: Int = layers.head private val mlpModel = FeedForwardTopology .multiLayerPerceptron(layers.toArray) .model(weights) override def predictRaw(features: Vector): Vector = { mlpModel.predictRaw(features) } override def rawToProbabilityInPlace(raw: Vector): Vector = { mlpModel.raw2ProbabilityInPlace(raw) } override val numClasses: Int = layers.last }
Example 160
Source File: NaiveBayesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial} import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices} import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala") case class NaiveBayesModel(numFeatures: Int, numClasses: Int, pi: Vector, theta: Matrix, modelType: NaiveBayesModel.ModelType, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with Model { private def multinomialCalculation(raw: Vector) = { val prob = theta.multiply(raw) BLAS.axpy(1.0, pi, prob) prob } private def bernoulliCalculation(raw: Vector) = { val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value))) val ones = new DenseVector(Array.fill(theta.numCols) {1.0}) val thetaMinusNegTheta = Matrices.map(theta, value => value - math.log(1.0 - math.exp(value))) val negThetaSum = negTheta.multiply(ones) raw.foreachActive((_, value) => require(value == 0.0 || value == 1.0, s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.") ) val prob = thetaMinusNegTheta.multiply(raw) BLAS.axpy(1.0, pi, prob) BLAS.axpy(1.0, negThetaSum, prob) prob } override def predictRaw(raw: Vector): Vector = { modelType match { case Multinomial => multinomialCalculation(raw) case Bernoulli => bernoulliCalculation(raw) } } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => var i = 0 val size = dv.size val maxLog = dv.values.max while (i < size) { dv.values(i) = math.exp(dv.values(i) - maxLog) i += 1 } ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in NaiveBayesModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 161
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS case class SupportVectorMachineModel(coefficients: Vector, intercept: Double, override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds)) extends ProbabilisticClassificationModel with Serializable { private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept override val numClasses: Int = 2 override val numFeatures: Int = coefficients.size override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(Array(-m, m)) } override def rawToProbabilityInPlace(raw: Vector): Vector = raw }
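A scoring sketch for the SupportVectorMachineModel above. thresholds is passed explicitly as None because the companion object providing the default is not part of the snippet; the coefficients are arbitrary.

import ml.combust.mleap.core.classification.SupportVectorMachineModel
import org.apache.spark.ml.linalg.Vectors

object SVMUsage extends App {
  val svm = SupportVectorMachineModel(
    coefficients = Vectors.dense(0.5, -0.25),
    intercept = 0.1,
    thresholds = None)

  val features = Vectors.dense(1.0, 2.0)
  println(svm.predictRaw(features)) // [-m, m] with m = 0.5*1.0 - 0.25*2.0 + 0.1 = 0.1
  println(svm.predict(features))    // 1.0, because the margin is positive
}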
Example 162
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.TreeEnsemble import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel], override val treeWeights: Seq[Double], numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with TreeEnsemble with Serializable { override def predictRaw(raw: Vector): Vector = { val votes = Array.fill[Double](numClasses)(0.0) trees.view.foreach { tree => val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray val total = classCounts.sum if (total != 0) { var i = 0 while (i < numClasses) { votes(i) += classCounts(i) / total i += 1 } } } Vectors.dense(votes) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 163
Source File: LinearSVCModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS object LinearSVCModel { val defaultThreshold = 0.0 } @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala") case class LinearSVCModel(coefficients: Vector, intercept: Double, threshold: Double = LinearSVCModel.defaultThreshold ) extends ClassificationModel { val numClasses: Int = 2 val numFeatures: Int = coefficients.size private val margin: Vector => Double = features => { BLAS.dot(features, coefficients) + intercept } override def predict(features: Vector): Double = { if (margin(features) > threshold) 1.0 else 0.0 } override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(-m, m) } def rawToPrediction(rawPrediction: Vector): Double = { if (rawPrediction(1) > threshold) 1.0 else 0.0 } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses), "prediction" -> ScalarType.Double.nonNullable).get }
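The same pattern works for the LinearSVCModel above, which only needs coefficients, an intercept and the default threshold of 0.0; the values below are arbitrary.

import ml.combust.mleap.core.classification.LinearSVCModel
import org.apache.spark.ml.linalg.Vectors

object LinearSVCUsage extends App {
  val model = LinearSVCModel(coefficients = Vectors.dense(1.0, -2.0), intercept = 0.5)

  val features = Vectors.dense(2.0, 0.5)
  println(model.predictRaw(features)) // [-m, m] with m = 1.0*2.0 - 2.0*0.5 + 0.5 = 1.5
  println(model.predict(features))    // 1.0, since the margin exceeds the threshold 0.0
}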
Example 164
Source File: VectorConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.util import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors} import scala.language.implicitConversions trait VectorConverters { implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match { case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size)) case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)), values = vector.values, dimensions = Seq(vector.size)) } implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match { case tensor: DenseTensor[_] => Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => Vectors.sparse(tensor.dimensions.product, tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match { case matrix: DenseMatrix => DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols)) case matrix: SparseMatrix => val indices = matrix.rowIndices.zip(matrix.colPtrs).map { case (r, c) => Seq(r, c) }.toSeq SparseTensor(indices = indices, values = matrix.values, dimensions = Seq(matrix.numRows, matrix.numCols)) } implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match { case tensor: DenseTensor[_] => Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1), cols.toArray, rows.toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match { case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size)) case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size)) } implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match { case tensor: DenseTensor[_] => new BDV(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => new BSV(tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]], tensor.dimensions.product) } } object VectorConverters extends VectorConverters
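A round-trip sketch showing how the implicit conversions above are typically pulled in; nothing here goes beyond what the trait itself defines, and the vector values are arbitrary.

import ml.combust.mleap.core.util.VectorConverters._
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.linalg.{Vector, Vectors}

object VectorConvertersUsage extends App {
  // Spark vector -> mleap tensor via the implicit conversion
  val tensor: Tensor[Double] = Vectors.dense(1.0, 0.0, 3.0)

  // and back again, mleap tensor -> Spark vector
  val vector: Vector = tensor
  println(vector)  // [1.0,0.0,3.0], unchanged after the round trip
}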
Example 165
Source File: LinalgUtils.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.linalg import ml.combust.mleap.core.annotation.SparkCode import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.{BLAS, VectorWithNorm} val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON) if (precisionBound1 < precision) { sqDist = sumSquaredNorm - 2.0 * BLAS.dot(v1, v2) } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) { val dotValue = BLAS.dot(v1, v2) sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0) val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) / (sqDist + EPSILON) if (precisionBound2 > precision) { sqDist = Vectors.sqdist(v1, v2) } } else { sqDist = Vectors.sqdist(v1, v2) } sqDist } def log1pExp(x: Double): Double = { if (x > 0) { x + math.log1p(math.exp(-x)) } else { math.log1p(math.exp(x)) } } }
Example 166
Source File: IsotonicRegressionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import java.util.Arrays.binarySearch import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.Vector @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/regression/IsotonicRegression.scala") case class IsotonicRegressionModel(boundaries: Array[Double], predictions: Seq[Double], isotonic: Boolean, featureIndex: Option[Int]) extends Model { def apply(features: Vector): Double = apply(features(featureIndex.get)) def apply(feature: Double): Double = { val foundIndex = binarySearch(boundaries, feature) val insertIndex = -foundIndex - 1 // Find if the index was lower than all values, // higher than all values, in between two values or exact match. if (insertIndex == 0) { predictions.head } else if (insertIndex == boundaries.length) { predictions.last } else if (foundIndex < 0) { linearInterpolation( boundaries(insertIndex - 1), predictions(insertIndex - 1), boundaries(insertIndex), predictions(insertIndex), feature) } else { predictions(foundIndex) } } private def linearInterpolation(x1: Double, y1: Double, x2: Double, y2: Double, x: Double): Double = { y1 + (y2 - y1) * (x - x1) / (x2 - x1) } override def inputSchema: StructType = { this.featureIndex match { case Some(_) => StructType("features" -> TensorType.Double()).get case None => StructType("features" -> ScalarType.Double.nonNullable).get } } override def outputSchema: StructType = StructType("prediction" -> ScalarType.Double.nonNullable).get }
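The prediction logic above is pure interpolation over sorted boundaries, so it can be checked with a handful of points; the model values are invented for this sketch.

import ml.combust.mleap.core.regression.IsotonicRegressionModel

object IsotonicRegressionUsage extends App {
  val model = IsotonicRegressionModel(
    boundaries = Array(1.0, 2.0, 3.0),
    predictions = Seq(10.0, 20.0, 30.0),
    isotonic = true,
    featureIndex = None)

  println(model(1.0))  // 10.0, exact boundary match
  println(model(2.5))  // 25.0, linear interpolation between neighbouring boundaries
  println(model(0.0))  // 10.0, clamped to the first prediction
  println(model(9.9))  // 30.0, clamped to the last prediction
}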
Example 167
Source File: AFTSurvivalRegressionModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.regression import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types._ import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala") case class AFTSurvivalRegressionModel(coefficients: Vector, intercept: Double, quantileProbabilities: Array[Double], scale: Double) extends Model { def apply(features: Vector): Double = predict(features) def predictWithQuantiles(features: Vector): (Double, Vector) = { val quantiles = predictQuantiles(features) (predict(features), quantiles) } def predictQuantiles(features: Vector): Vector = { // scale parameter for the Weibull distribution of lifetime val lambda = math.exp(BLAS.dot(coefficients, features) + intercept) // shape parameter for the Weibull distribution of lifetime val k = 1 / scale val quantiles = quantileProbabilities.map { q => lambda * math.exp(math.log(-math.log(1 - q)) / k) } Vectors.dense(quantiles) } def predict(features: Vector): Double = { math.exp(BLAS.dot(coefficients, features) + intercept) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(coefficients.size)).get override def outputSchema: StructType = { StructType("prediction" -> ScalarType.Double.nonNullable, "quantiles" -> TensorType.Double(quantileProbabilities.length)).get } }
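A sketch of the AFTSurvivalRegressionModel above; the coefficients, scale and quantile probabilities are placeholders chosen only to show the two prediction paths.

import ml.combust.mleap.core.regression.AFTSurvivalRegressionModel
import org.apache.spark.ml.linalg.Vectors

object AFTSurvivalRegressionUsage extends App {
  val model = AFTSurvivalRegressionModel(
    coefficients = Vectors.dense(0.2, -0.1),
    intercept = 1.0,
    quantileProbabilities = Array(0.1, 0.5, 0.9),
    scale = 1.0)

  val features = Vectors.dense(2.0, 3.0)
  println(model.predict(features))              // exp(0.2*2.0 - 0.1*3.0 + 1.0) = exp(1.1)
  println(model.predictWithQuantiles(features)) // prediction plus the requested Weibull quantiles
}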
Example 168
Source File: LDAParitySpec.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.parity.clustering import org.apache.spark.ml.{Pipeline, Transformer} import org.apache.spark.ml.clustering.LDA import org.apache.spark.ml.feature.{CountVectorizer, StopWordsRemover, Tokenizer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.parity.SparkParityBase import org.apache.spark.sql.DataFrame import org.scalatest.Ignore @Ignore class LDAParitySpec extends SparkParityBase { override val dataset: DataFrame = textDataset.select("text") val tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words") val remover = new StopWordsRemover() .setInputCol(tokenizer.getOutputCol) .setOutputCol("words_filtered") val cv = new CountVectorizer().setInputCol("words_filtered").setOutputCol("features").setVocabSize(50000) val lda = new LDA().setK(5).setMaxIter(2) override val sparkTransformer: Transformer = new Pipeline().setStages(Array(tokenizer, remover, cv, lda)).fit(dataset) override def equalityTest(sparkDataset: DataFrame, mleapDataset: DataFrame): Unit = { val sparkPredictionCol = sparkDataset.schema.fieldIndex("topicDistribution") val mleapPredictionCol = mleapDataset.schema.fieldIndex("topicDistribution") sparkDataset.collect().zip(mleapDataset.collect()).foreach { case (sv, mv) => val sparkPrediction = sv.getAs[Vector](sparkPredictionCol) val mleapPrediction = mv.getAs[Vector](mleapPredictionCol) sparkPrediction.toArray.zip(mleapPrediction.toArray).foreach { case (s, m) => assert(Math.abs(m - s) < 0.001) } } } }
Example 169
Source File: XGBoostRegressionModelOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.dmlc.xgboost4j.scala.spark.mleap import java.nio.file.Files import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.{Model, NodeShape, Value} import ml.combust.bundle.op.OpModel import ml.dmlc.xgboost4j.scala.spark.XGBoostRegressionModel import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost} import org.apache.spark.ml.bundle._ import org.apache.spark.ml.linalg.Vector import resource.managed override val Model: OpModel[SparkBundleContext, XGBoostRegressionModel] = new OpModel[SparkBundleContext, XGBoostRegressionModel] { override val klazz: Class[XGBoostRegressionModel] = classOf[XGBoostRegressionModel] override def opName: String = "xgboost.regression" override def store(model: Model, obj: XGBoostRegressionModel) (implicit context: BundleContext[SparkBundleContext]): Model = { assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz)) Files.write(context.file("xgboost.model"), obj._booster.toByteArray) val numFeatures = context.context.dataset.get.select(obj.getFeaturesCol).first.getAs[Vector](0).size model.withValue("num_features", Value.int(numFeatures)). withValue("tree_limit", Value.int(obj.getOrDefault(obj.treeLimit))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): XGBoostRegressionModel = { val booster = (for(in <- managed(Files.newInputStream(context.file("xgboost.model")))) yield { SXGBoost.loadModel(in) }).tried.get new XGBoostRegressionModel("", booster) } } override def sparkLoad(uid: String, shape: NodeShape, model: XGBoostRegressionModel): XGBoostRegressionModel = { new XGBoostRegressionModel(uid, model._booster) } override def sparkInputs(obj: XGBoostRegressionModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: XGBoostRegressionModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol, "leaf_prediction" -> obj.leafPredictionCol, "contrib_prediction" -> obj.contribPredictionCol) } }
Example 170
Source File: XGBoostClassificationModelOp.scala From mleap with Apache License 2.0 | 5 votes |
package ml.dmlc.xgboost4j.scala.spark.mleap import java.nio.file.Files import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl.{Model, NodeShape, Value} import ml.combust.bundle.op.OpModel import ml.dmlc.xgboost4j.scala.spark.XGBoostClassificationModel import ml.dmlc.xgboost4j.scala.{XGBoost => SXGBoost} import org.apache.spark.ml.bundle._ import org.apache.spark.ml.linalg.Vector import resource._ override val Model: OpModel[SparkBundleContext, XGBoostClassificationModel] = new OpModel[SparkBundleContext, XGBoostClassificationModel] { override val klazz: Class[XGBoostClassificationModel] = classOf[XGBoostClassificationModel] override def opName: String = "xgboost.classifier" override def store(model: Model, obj: XGBoostClassificationModel) (implicit context: BundleContext[SparkBundleContext]): Model = { assert(context.context.dataset.isDefined, BundleHelper.sampleDataframeMessage(klazz)) val thresholds = if(obj.isSet(obj.thresholds)) { Some(obj.getThresholds) } else None val out = Files.newOutputStream(context.file("xgboost.model")) obj._booster.saveModel(out) val numFeatures = context.context.dataset.get.select(obj.getFeaturesCol).first.getAs[Vector](0).size model.withValue("thresholds", thresholds.map(_.toSeq).map(Value.doubleList)). withValue("num_classes", Value.int(obj.numClasses)). withValue("num_features", Value.int(numFeatures)). withValue("tree_limit", Value.int(obj.getOrDefault(obj.treeLimit))) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): XGBoostClassificationModel = { val booster = (for(in <- managed(Files.newInputStream(context.file("xgboost.model")))) yield { SXGBoost.loadModel(in) }).tried.get new XGBoostClassificationModel("", model.value("num_classes").getInt, booster) } } override def sparkLoad(uid: String, shape: NodeShape, model: XGBoostClassificationModel): XGBoostClassificationModel = { new XGBoostClassificationModel(uid, model.numClasses, model._booster) } override def sparkInputs(obj: XGBoostClassificationModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: XGBoostClassificationModel): Seq[SimpleParamSpec] = { Seq("raw_prediction" -> obj.rawPredictionCol, "prediction" -> obj.predictionCol, "probability" -> obj.probabilityCol, "leaf_prediction" -> obj.leafPredictionCol, "contrib_prediction" -> obj.contribPredictionCol) } }
Example 171
Source File: MultinomialLabeler.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.mleap.feature import ml.combust.mleap.core.feature.MultinomialLabelerModel import org.apache.spark.annotation.DeveloperApi import org.apache.spark.ml.Transformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.mleap.param.{HasLabelsCol, HasProbabilitiesCol} import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.param.shared.HasFeaturesCol import org.apache.spark.ml.util.Identifiable import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.types._ import org.apache.spark.sql.functions.{udf, col} import ml.combust.mleap.core.util.VectorConverters._ class MultinomialLabeler(override val uid: String = Identifiable.randomUID("math_unary"), val model: MultinomialLabelerModel) extends Transformer with HasFeaturesCol with HasProbabilitiesCol with HasLabelsCol { def setFeaturesCol(value: String): this.type = set(featuresCol, value) def setProbabilitiesCol(value: String): this.type = set(probabilitiesCol, value) def setLabelsCol(value: String): this.type = set(labelsCol, value) @org.apache.spark.annotation.Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { val probabilitiesUdf = udf { (vector: Vector) => model.top(vector).map(_._1).toArray } val labelsUdf = udf { (vector: Vector) => model.topLabels(vector).toArray } dataset.withColumn($(probabilitiesCol), probabilitiesUdf(col($(featuresCol)))). withColumn($(labelsCol), labelsUdf(col($(featuresCol)))) } override def copy(extra: ParamMap): Transformer = copyValues(new MultinomialLabeler(uid, model), extra) @DeveloperApi override def transformSchema(schema: StructType): StructType = { require(schema($(featuresCol)).dataType.isInstanceOf[VectorUDT], s"Features column must be of type NumericType but got ${schema($(featuresCol)).dataType}") val inputFields = schema.fields require(!inputFields.exists(_.name == $(probabilitiesCol)), s"Output column ${$(probabilitiesCol)} already exists.") require(!inputFields.exists(_.name == $(labelsCol)), s"Output column ${$(labelsCol)} already exists.") StructType(schema.fields ++ Seq(StructField($(probabilitiesCol), ArrayType(DoubleType)), StructField($(labelsCol), ArrayType(StringType)))) } }
Example 172
Source File: ProjectedGaussianProcessHelper.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons import breeze.linalg.{any, eigSym, inv, DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.ml.commons.kernel.Kernel import org.apache.spark.ml.linalg.Vector import org.apache.spark.rdd.RDD private[ml] trait ProjectedGaussianProcessHelper { class NotPositiveDefiniteException extends Exception("Some matrix which is supposed to be " + "positive definite is not. This probably happened due to `sigma2` parameter being too small." + " Try to gradually increase it.") def getMagicVector(kernel: Kernel, matrixKmnKnm: BDM[Double], vectorKmny: BDV[Double], activeSet: Array[Vector], optimalHyperparameter: BDV[Double]) = { val trainKernel = kernel.trainingKernel() val positiveDefiniteMatrix = kernel.whiteNoiseVar * trainKernel // sigma^2 K_mm positiveDefiniteMatrix += matrixKmnKnm // sigma^2 K_mm + K_mn * K_nm assertSymPositiveDefinite(positiveDefiniteMatrix) (positiveDefiniteMatrix \ vectorKmny, inv(positiveDefiniteMatrix) * kernel.whiteNoiseVar - inv(trainKernel)) } protected def assertSymPositiveDefinite(matrix: BDM[Double]): Unit = { if (any(eigSym(matrix).eigenvalues <:< 0d)) throw new NotPositiveDefiniteException } }
Example 173
Source File: ARDRBFKernel.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{norm, DenseMatrix => BDM, DenseVector => BDV, Vector => BV} import breeze.numerics.{exp, inf} import org.apache.spark.ml.linalg.Vector class ARDRBFKernel(private var beta: BDV[Double], private val lower: BDV[Double], private val upper: BDV[Double]) extends TrainDatasetBearingKernel with NoiselessKernel with SameOnDiagonalKernel { def this(beta: BDV[Double]) = this(beta, beta * 0d, beta * inf) def this(p : Int, beta: Double = 1, lower: Double = 0, upper : Double = inf) = this(BDV.zeros[Double](p) + beta, BDV.zeros[Double](p) + lower, BDV.zeros[Double](p) + upper) override def setHyperparameters(value: BDV[Double]): ARDRBFKernel.this.type = { beta = value this } override def getHyperparameters: BDV[Double] = beta override def numberOfHyperparameters: Int = beta.length override def hyperparameterBoundaries: (BDV[Double], BDV[Double]) = (lower, upper) private def kernelElement(a: BV[Double], b: BV[Double]) : Double = { val weightedDistance = norm((a - b) *:* beta) exp(- weightedDistance * weightedDistance) } override def trainingKernel(): BDM[Double] = { val train = getTrainingVectors val result = BDM.zeros[Double](train.length, train.length) for (i <- train.indices; j <- 0 to i) { val k = kernelElement(train(i).asBreeze, train(j).asBreeze) result(i, j) = k result(j, i) = k } result } override def trainingKernelAndDerivative(): (BDM[Double], Array[BDM[Double]]) = { val train = getTrainingVectors val K = trainingKernel() val minus2Kernel = -2d * K val result = Array.fill[BDM[Double]](beta.length)(BDM.zeros[Double](train.length, train.length)) for (i <- train.indices; j <- 0 to i) { val diff = train(i).asBreeze - train(j).asBreeze diff :*= diff diff :*= beta val betaXi_Xj = diff for (k <- 0 until beta.length) { result(k)(i, j) = betaXi_Xj(k) result(k)(j, i) = betaXi_Xj(k) } } (K, result.map(derivative => derivative *:* minus2Kernel)) } override def crossKernel(test: Array[Vector]): BDM[Double] = { val train = getTrainingVectors val result = BDM.zeros[Double](test.length, train.length) for (testIndx <- test.indices; trainIndex <- train.indices) result(testIndx, trainIndex) = kernelElement(train(trainIndex).asBreeze, test(testIndx).asBreeze) result } override def selfKernel(test: Vector): Double = 1d override def toString = "ARDRBFKernel(beta=" + BDV2String(beta) + ")" private def BDV2String(v : BDV[Double]) = v.valuesIterator.map(e => f"$e%1.1e").mkString("[", ", " , "]") }
Example 174
Source File: RBFKernel.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.numerics.{exp, inf} import org.apache.spark.ml.linalg.{Vector, Vectors} class RBFKernel(private var sigma: Double, private val lower: Double = 1e-6, private val upper: Double = inf) extends TrainDatasetBearingKernel with NoiselessKernel with SameOnDiagonalKernel { def this() = this(1) override def setHyperparameters(value: BDV[Double]): RBFKernel.this.type = { sigma = value(0) this } override def getHyperparameters: BDV[Double] = BDV[Double](sigma) override def numberOfHyperparameters: Int = 1 private def getSigma() = sigma private var squaredDistances: Option[BDM[Double]] = None override def hyperparameterBoundaries: (BDV[Double], BDV[Double]) = { (BDV[Double](lower), BDV[Double](upper)) } override def setTrainingVectors(vectors: Array[Vector]): this.type = { super.setTrainingVectors(vectors) val sqd = BDM.zeros[Double](vectors.length, vectors.length) for (i <- vectors.indices; j <- 0 to i) { val dist = Vectors.sqdist(vectors(i), vectors(j)) sqd(i, j) = dist sqd(j, i) = dist } squaredDistances = Some(sqd) this } override def trainingKernel(): BDM[Double] = { val result = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def trainingKernelAndDerivative(): (BDM[Double], Array[BDM[Double]]) = { val sqd = squaredDistances.getOrElse(throw new TrainingVectorsNotInitializedException) val kernel = trainingKernel() val derivative = sqd *:* kernel derivative /= cube(getSigma()) (kernel, Array(derivative)) } override def crossKernel(test: Array[Vector]): BDM[Double] = { val train = getTrainingVectors val result = BDM.zeros[Double](test.length, train.length) for (i <- test.indices; j <- train.indices) result(i, j) = Vectors.sqdist(test(i), train(j)) / (-2d * sqr(getSigma())) exp.inPlace(result) result } override def selfKernel(test: Vector): Double = 1d private def sqr(x: Double) = x * x private def cube(x: Double) = x * x * x override def toString = f"RBFKernel(sigma=$sigma%1.1e)" }
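A small sketch exercising the RBFKernel above outside of any regression pipeline; the training vectors and sigma are arbitrary, and only the methods visible in the snippet are used.

import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.linalg.Vectors

object RBFKernelUsage extends App {
  val kernel = new RBFKernel(1.0)
  kernel.setTrainingVectors(Array(Vectors.dense(0.0, 0.0), Vectors.dense(1.0, 1.0)))

  println(kernel.trainingKernel())                             // 2x2 Gram matrix, ones on the diagonal
  println(kernel.crossKernel(Array(Vectors.dense(0.5, 0.5))))  // 1x2 cross kernel against the training set

  // the single hyperparameter is sigma, carried as a one-element Breeze vector
  kernel.setHyperparameters(BDV(2.0))
  println(kernel.getHyperparameters)
}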
Example 175
Source File: GaussianProcessCommons.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import breeze.optimize.LBFGSB import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _} import org.apache.spark.ml.commons.util.DiffFunctionMemoized import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.util.Instrumentation import org.apache.spark.ml.{PredictionModel, Predictor} import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.col import org.apache.spark.sql.{Dataset, Row} private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]] extends ProjectedGaussianProcessHelper { this: Predictor[F, E, M] with GaussianProcessParams => protected val getKernel : () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel protected def getPoints(dataset: Dataset[_]) = { dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map { case Row(label: Double, features: Vector) => LabeledPoint(label, features) } } protected def groupForExperts(points: RDD[LabeledPoint]) = { val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert)) points.zipWithIndex.map { case(instance, index) => (index % numberOfExperts, instance) }.groupByKey().map(_._2) } protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = { groupForExperts(points).map { chunk => val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors)) } } protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)], points: RDD[LabeledPoint], optimalHyperparameters: BDV[Double]) = { val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points, getKernel, optimalHyperparameters, $(seed)) points.unpersist() val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet) expertLabelsAndKernels.unpersist() val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet) // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y val (magicVector, magicMatrix) = getMagicVector(optimalKernel, matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters) new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel) } protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor) : M } class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double], val magicMatrix: BDM[Double], val kernel: Kernel) extends Serializable { def predict(features: Vector): (Double, Double) = { val cross = kernel.crossKernel(features) val selfKernel = kernel.selfKernel(features) (cross * magicVector, selfKernel + cross * magicMatrix * cross.t) } }
Example 176
Source File: GaussianProcessRegression.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.regression import breeze.linalg.{DenseVector => BDV, _} import org.apache.spark.internal.Logging import org.apache.spark.ml.commons._ import org.apache.spark.ml.commons.kernel.Kernel import org.apache.spark.ml.commons.util._ import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.param.ParamMap import org.apache.spark.ml.util.{Identifiable, Instrumentation} import org.apache.spark.rdd.RDD import org.apache.spark.sql.Dataset class GaussianProcessRegression(override val uid: String) extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with GaussianProcessParams with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel] with Logging { def this() = this(Identifiable.randomUID("gaussProcessReg")) override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = { val instr = Instrumentation.create(this, dataset) val points: RDD[LabeledPoint] = getPoints(dataset).cache() val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] = getExpertLabelsAndKernels(points).cache() val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient) expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters)) produceModel(instr, points, expertLabelsAndKernels, optimalHyperparameters) } private def likelihoodAndGradient(yAndK : (BDV[Double], Kernel), x : BDV[Double]) = { val (y: BDV[Double], kernel : Kernel) = yAndK kernel.setHyperparameters(x) val (k, derivative) = kernel.trainingKernelAndDerivative() val (_, logdet, kinv) = logDetAndInv(k) val alpha = kinv * y val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet val alphaAlphaTMinusKinv = alpha * alpha.t alphaAlphaTMinusKinv -= kinv val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv)) (likelihood, BDV(gradient:_*)) } override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra) override protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel = new GaussianProcessRegressionModel(uid, rawPredictor) } class GaussianProcessRegressionModel private[regression](override val uid: String, private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor) extends RegressionModel[Vector, GaussianProcessRegressionModel] { override protected def predict(features: Vector): Double = { gaussianProjectedProcessRawPredictor.predict(features)._1 } override def copy(extra: ParamMap): GaussianProcessRegressionModel = { val newModel = copyValues(new GaussianProcessRegressionModel(uid, gaussianProjectedProcessRawPredictor), extra) newModel.setParent(parent) } }
Example 177
Source File: StreamingMLUtils.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib import scala.language.implicitConversions import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector} import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors} import org.apache.spark.mllib.util.MLUtils object StreamingMLUtils { implicit def mlToMllibVector(v: Vector): OldVector = v match { case dv: DenseVector => OldVectors.dense(dv.toArray) case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values) case _ => throw new IllegalArgumentException } def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = { MLUtils.fastSquaredDistance(x, xNorm, y, yNorm) } }
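A short sketch of the helper above; the conversion method is called explicitly rather than relying on implicit resolution, and the vectors are arbitrary.

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.StreamingMLUtils.mlToMllibVector

object StreamingMLUtilsUsage extends App {
  // new ml.linalg vectors converted to the old mllib.linalg type
  val oldDense  = mlToMllibVector(Vectors.dense(1.0, 2.0))
  val oldSparse = mlToMllibVector(Vectors.sparse(3, Array(1), Array(4.0)))

  println(oldDense)   // [1.0,2.0] as an org.apache.spark.mllib.linalg.DenseVector
  println(oldSparse)  // (3,[1],[4.0]) as a SparseVector
}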
Example 178
Source File: TextClassificationPipeline.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package org.textclassifier import org.apache.spark.ml.{Pipeline, PipelineModel} import org.apache.spark.ml.classification.LogisticRegression import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row import org.utils.StandaloneSpark object TextClassificationPipeline { def main(args: Array[String]): Unit = { val spark = StandaloneSpark.getSparkInstance() // Prepare training documents from a list of (id, text, label) tuples. val training = spark.createDataFrame(Seq( (0L, "a b c d e spark", 1.0), (1L, "b d", 0.0), (2L, "spark f g h", 1.0), (3L, "hadoop mapreduce", 0.0) )).toDF("id", "text", "label") // Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr. val tokenizer = new Tokenizer() .setInputCol("text") .setOutputCol("words") val hashingTF = new HashingTF() .setNumFeatures(1000) .setInputCol(tokenizer.getOutputCol) .setOutputCol("features") val lr = new LogisticRegression() .setMaxIter(10) .setRegParam(0.001) val pipeline = new Pipeline() .setStages(Array(tokenizer, hashingTF, lr)) // Fit the pipeline to training documents. val model = pipeline.fit(training) // Now we can optionally save the fitted pipeline to disk model.write.overwrite().save("/tmp/spark-logistic-regression-model") // We can also save this unfit pipeline to disk pipeline.write.overwrite().save("/tmp/unfit-lr-model") // And load it back in during production val sameModel = PipelineModel.load("/tmp/spark-logistic-regression-model") // Prepare test documents, which are unlabeled (id, text) tuples. val test = spark.createDataFrame(Seq( (4L, "spark i j k"), (5L, "l m n"), (6L, "spark hadoop spark"), (7L, "apache hadoop") )).toDF("id", "text") // Make predictions on test documents. model.transform(test) .select("id", "text", "probability", "prediction") .collect() .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) => println(s"($id, $text) --> prob=$prob, prediction=$prediction") } } }
Example 179
Source File: SparkVector.scala From Machine-Learning-with-Spark-Second-Edition with MIT License | 5 votes |
package linalg.vector import org.apache.spark.ml.linalg.{Vector, Vectors} object SparkVector { def main(args: Array[String]): Unit = { // Create a dense vector (1.0, 0.0, 3.0). val dVectorOne: Vector = Vectors.dense(1.0, 0.0, 2.0) println("dVectorOne:" + dVectorOne) // Sparse vector (1.0, 0.0, 2.0, 3.0) // corresponding to nonzero entries. val sVectorOne: Vector = Vectors.sparse(4, Array(0, 2,3), Array(1.0, 2.0, 3.0)) // Create a sparse vector (1.0, 0.0, 2.0, 2.0) by specifying its // nonzero entries. val sVectorTwo: Vector = Vectors.sparse(4, Seq((0, 1.0), (2, 2.0), (3, 3.0))) println("sVectorOne:" + sVectorOne) println("sVectorTwo:" + sVectorTwo) val sVectorOneMax = sVectorOne.argmax val sVectorOneNumNonZeros = sVectorOne.numNonzeros val sVectorOneSize = sVectorOne.size val sVectorOneArray = sVectorOne.toArray println("sVectorOneMax:" + sVectorOneMax) println("sVectorOneNumNonZeros:" + sVectorOneNumNonZeros) println("sVectorOneSize:" + sVectorOneSize) println("sVectorOneArray:" + sVectorOneArray) val dVectorOneToSparse = dVectorOne.toSparse println("dVectorOneToSparse:" + dVectorOneToSparse) } }
Example 180
Source File: DataFrameExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml import java.io.File import scopt.OptionParser import org.apache.spark.examples.mllib.AbstractParams import org.apache.spark.ml.linalg.Vector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer import org.apache.spark.sql.{DataFrame, Row, SparkSession} import org.apache.spark.util.Utils object DataFrameExample { case class Params(input: String = "data/mllib/sample_libsvm_data.txt") extends AbstractParams[Params] def main(args: Array[String]) { val defaultParams = Params() val parser = new OptionParser[Params]("DataFrameExample") { head("DataFrameExample: an example app using DataFrame for ML.") opt[String]("input") .text(s"input path to dataframe") .action((x, c) => c.copy(input = x)) checkConfig { params => success } } parser.parse(args, defaultParams) match { case Some(params) => run(params) case _ => sys.exit(1) } } def run(params: Params): Unit = { val spark = SparkSession .builder .appName(s"DataFrameExample with $params") .getOrCreate() // Load input data println(s"Loading LIBSVM file with UDT from ${params.input}.") val df: DataFrame = spark.read.format("libsvm").load(params.input).cache() println("Schema from LIBSVM:") df.printSchema() println(s"Loaded training data as a DataFrame with ${df.count()} records.") // Show statistical summary of labels. val labelSummary = df.describe("label") labelSummary.show() // Convert features column to an RDD of vectors. val features = df.select("features").rdd.map { case Row(v: Vector) => v } val featureSummary = features.aggregate(new MultivariateOnlineSummarizer())( (summary, feat) => summary.add(Vectors.fromML(feat)), (sum1, sum2) => sum1.merge(sum2)) println(s"Selected features column with average values:\n ${featureSummary.mean.toString}") // Save the records in a parquet file. val tmpDir = Utils.createTempDir() val outputDir = new File(tmpDir, "dataframe").toString println(s"Saving to $outputDir as Parquet file.") df.write.parquet(outputDir) // Load the records back. println(s"Loading Parquet file with UDT from $outputDir.") val newDF = spark.read.parquet(outputDir) println(s"Schema from Parquet:") newDF.printSchema() spark.stop() } } // scalastyle:on println
Example 181
Source File: Word2VecExample.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.feature.Word2Vec import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row // $example off$ import org.apache.spark.sql.SparkSession object Word2VecExample { def main(args: Array[String]) { val spark = SparkSession .builder .appName("Word2Vec example") .getOrCreate() // $example on$ // Input data: Each row is a bag of words from a sentence or document. val documentDF = spark.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text") // Learn a mapping from words to Vectors. val word2Vec = new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) result.collect().foreach { case Row(text: Seq[_], features: Vector) => println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() } } // scalastyle:on println
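A hedged follow-up that is not in the original example: the fitted Word2VecModel can also be inspected directly; getVectors returns the learned word vectors and findSynonyms returns the nearest words in the embedding space.

model.getVectors.show()                 // one row per vocabulary word with its learned vector
model.findSynonyms("Spark", 2).show()   // the two words closest to "Spark"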
Example 182
Source File: ElementwiseProduct.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.Param import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.VectorImplicits._ import org.apache.spark.sql.types.DataType @Since("2.0.0") def getScalingVec: Vector = getOrDefault(scalingVec) override protected def createTransformFunc: Vector => Vector = { require(params.contains(scalingVec), s"transformation requires a weight vector") val elemScaler = new feature.ElementwiseProduct($(scalingVec)) v => elemScaler.transform(v) } override protected def outputDataType: DataType = new VectorUDT() } @Since("2.0.0") object ElementwiseProduct extends DefaultParamsReadable[ElementwiseProduct] { @Since("2.0.0") override def load(path: String): ElementwiseProduct = super.load(path) }
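A minimal usage sketch (not part of the excerpt above), assuming an active SparkSession named spark: ElementwiseProduct multiplies each input vector coordinate-wise by the configured scaling vector.

import org.apache.spark.ml.feature.ElementwiseProduct
import org.apache.spark.ml.linalg.Vectors

val dataFrame = spark.createDataFrame(Seq(
  ("a", Vectors.dense(1.0, 2.0, 3.0)),
  ("b", Vectors.dense(4.0, 5.0, 6.0))
)).toDF("id", "vector")
val transformer = new ElementwiseProduct()
  .setScalingVec(Vectors.dense(0.0, 1.0, 2.0))   // per-coordinate weights (Hadamard product)
  .setInputCol("vector")
  .setOutputCol("transformedVector")
transformer.transform(dataFrame).show()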
Example 183
Source File: Normalizer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param.{DoubleParam, ParamValidators} import org.apache.spark.ml.util._ import org.apache.spark.mllib.feature import org.apache.spark.mllib.linalg.{Vectors => OldVectors} import org.apache.spark.sql.types.DataType @Since("1.4.0") def setP(value: Double): this.type = set(p, value) override protected def createTransformFunc: Vector => Vector = { val normalizer = new feature.Normalizer($(p)) vector => normalizer.transform(OldVectors.fromML(vector)).asML } override protected def outputDataType: DataType = new VectorUDT() } @Since("1.6.0") object Normalizer extends DefaultParamsReadable[Normalizer] { @Since("1.6.0") override def load(path: String): Normalizer = super.load(path) }
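A minimal usage sketch, assuming a DataFrame df with a vector column named "features" (df is a placeholder, not defined in the excerpt): each row is rescaled to unit p-norm.

import org.apache.spark.ml.feature.Normalizer

val normalizer = new Normalizer()
  .setInputCol("features")
  .setOutputCol("normFeatures")
  .setP(1.0)                      // L^1 norm; the default is p = 2
val l1NormData = normalizer.transform(df)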
Example 184
Source File: DCT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import edu.emory.mathcs.jtransforms.dct._ import org.apache.spark.annotation.Since import org.apache.spark.ml.UnaryTransformer import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.BooleanParam import org.apache.spark.ml.util._ import org.apache.spark.sql.types.DataType @Since("1.5.0") def getInverse: Boolean = $(inverse) setDefault(inverse -> false) override protected def createTransformFunc: Vector => Vector = { vec => val result = vec.toArray val jTransformer = new DoubleDCT_1D(result.length) if ($(inverse)) jTransformer.inverse(result, true) else jTransformer.forward(result, true) Vectors.dense(result) } override protected def validateInputType(inputType: DataType): Unit = { require(inputType.isInstanceOf[VectorUDT], s"Input type must be VectorUDT but got $inputType.") } override protected def outputDataType: DataType = new VectorUDT } @Since("1.6.0") object DCT extends DefaultParamsReadable[DCT] { @Since("1.6.0") override def load(path: String): DCT = super.load(path) }
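A minimal usage sketch of the transformer above, again assuming a placeholder DataFrame df with a "features" vector column: the DCT is applied row by row.

import org.apache.spark.ml.feature.DCT

val dct = new DCT()
  .setInputCol("features")
  .setOutputCol("featuresDCT")
  .setInverse(false)              // forward DCT-II; setInverse(true) applies the inverse transform
dct.transform(df).select("featuresDCT").show(truncate = false)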
Example 185
Source File: BinaryClassificationEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.linalg.{Vector, VectorUDT} import org.apache.spark.ml.param._ import org.apache.spark.ml.param.shared._ import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable, SchemaUtils} import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.functions._ import org.apache.spark.sql.types.DoubleType @Since("1.2.0") def setLabelCol(value: String): this.type = set(labelCol, value) setDefault(metricName -> "areaUnderROC") @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { val schema = dataset.schema SchemaUtils.checkColumnTypes(schema, $(rawPredictionCol), Seq(DoubleType, new VectorUDT)) SchemaUtils.checkNumericType(schema, $(labelCol)) // TODO: When dataset metadata has been implemented, check rawPredictionCol vector length = 2. val scoreAndLabels = dataset.select(col($(rawPredictionCol)), col($(labelCol)).cast(DoubleType)).rdd.map { case Row(rawPrediction: Vector, label: Double) => (rawPrediction(1), label) case Row(rawPrediction: Double, label: Double) => (rawPrediction, label) } val metrics = new BinaryClassificationMetrics(scoreAndLabels) val metric = $(metricName) match { case "areaUnderROC" => metrics.areaUnderROC() case "areaUnderPR" => metrics.areaUnderPR() } metrics.unpersist() metric } @Since("1.5.0") override def isLargerBetter: Boolean = $(metricName) match { case "areaUnderROC" => true case "areaUnderPR" => true } @Since("1.4.1") override def copy(extra: ParamMap): BinaryClassificationEvaluator = defaultCopy(extra) } @Since("1.6.0") object BinaryClassificationEvaluator extends DefaultParamsReadable[BinaryClassificationEvaluator] { @Since("1.6.0") override def load(path: String): BinaryClassificationEvaluator = super.load(path) }
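A minimal usage sketch (not in the excerpt), assuming a placeholder DataFrame predictions that carries "rawPrediction" and "label" columns, e.g. the output of a fitted classifier.

import org.apache.spark.ml.evaluation.BinaryClassificationEvaluator

val evaluator = new BinaryClassificationEvaluator()
  .setLabelCol("label")
  .setRawPredictionCol("rawPrediction")
  .setMetricName("areaUnderPR")   // or "areaUnderROC", the default
val areaUnderPR = evaluator.evaluate(predictions)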
Example 186
Source File: VectorSlicerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.{Attribute, AttributeGroup, NumericAttribute} import org.apache.spark.ml.linalg.{Vector, Vectors, VectorUDT} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} import org.apache.spark.sql.types.{StructField, StructType} class VectorSlicerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { test("params") { val slicer = new VectorSlicer().setInputCol("feature") ParamsSuite.checkParams(slicer) assert(slicer.getIndices.length === 0) assert(slicer.getNames.length === 0) withClue("VectorSlicer should not have any features selected by default") { intercept[IllegalArgumentException] { slicer.transformSchema(StructType(Seq(StructField("feature", new VectorUDT, true)))) } } } test("feature validity checks") { import VectorSlicer._ assert(validIndices(Array(0, 1, 8, 2))) assert(validIndices(Array.empty[Int])) assert(!validIndices(Array(-1))) assert(!validIndices(Array(1, 2, 1))) assert(validNames(Array("a", "b"))) assert(validNames(Array.empty[String])) assert(!validNames(Array("", "b"))) assert(!validNames(Array("a", "b", "a"))) } test("Test vector slicer") { val data = Array( Vectors.sparse(5, Seq((0, -2.0), (1, 2.3))), Vectors.dense(-2.0, 2.3, 0.0, 0.0, 1.0), Vectors.dense(0.0, 0.0, 0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0, 4.5, 3.3), Vectors.sparse(5, Seq()) ) // Expected after selecting indices 1, 4 val expected = Array( Vectors.sparse(2, Seq((0, 2.3))), Vectors.dense(2.3, 1.0), Vectors.dense(0.0, 0.0), Vectors.dense(-1.1, 3.3), Vectors.sparse(2, Seq()) ) val defaultAttr = NumericAttribute.defaultAttr val attrs = Array("f0", "f1", "f2", "f3", "f4").map(defaultAttr.withName) val attrGroup = new AttributeGroup("features", attrs.asInstanceOf[Array[Attribute]]) val resultAttrs = Array("f1", "f4").map(defaultAttr.withName) val resultAttrGroup = new AttributeGroup("expected", resultAttrs.asInstanceOf[Array[Attribute]]) val rdd = sc.parallelize(data.zip(expected)).map { case (a, b) => Row(a, b) } val df = spark.createDataFrame(rdd, StructType(Array(attrGroup.toStructField(), resultAttrGroup.toStructField()))) val vectorSlicer = new VectorSlicer().setInputCol("features").setOutputCol("result") def validateResults(df: DataFrame): Unit = { df.select("result", "expected").collect().foreach { case Row(vec1: Vector, vec2: Vector) => assert(vec1 === vec2) } val resultMetadata = AttributeGroup.fromStructField(df.schema("result")) val expectedMetadata = AttributeGroup.fromStructField(df.schema("expected")) assert(resultMetadata.numAttributes === expectedMetadata.numAttributes) resultMetadata.attributes.get.zip(expectedMetadata.attributes.get).foreach { case (a, b) => assert(a === b) } } vectorSlicer.setIndices(Array(1, 4)).setNames(Array.empty) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array(1)).setNames(Array("f4")) validateResults(vectorSlicer.transform(df)) vectorSlicer.setIndices(Array.empty).setNames(Array("f1", "f4")) validateResults(vectorSlicer.transform(df)) } test("read/write") { val t = new VectorSlicer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setIndices(Array(1, 3)) .setNames(Array("a", "d")) testDefaultReadWrite(t) } }
Example 187
Source File: MaxAbsScalerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row class MaxAbsScalerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("MaxAbsScaler fit basic case") { val data = Array( Vectors.dense(1, 0, 100), Vectors.dense(2, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-2, -100)), Vectors.sparse(3, Array(0), Array(-1.5))) val expected: Array[Vector] = Array( Vectors.dense(0.5, 0, 1), Vectors.dense(1, 0, 0), Vectors.sparse(3, Array(0, 2), Array(-1, -1)), Vectors.sparse(3, Array(0), Array(-0.75))) val df = data.zip(expected).toSeq.toDF("features", "expected") val scaler = new MaxAbsScaler() .setInputCol("features") .setOutputCol("scaled") val model = scaler.fit(df) model.transform(df).select("expected", "scaled").collect() .foreach { case Row(vector1: Vector, vector2: Vector) => assert(vector1.equals(vector2), s"MaxAbsScaler ut error: $vector2 should be $vector1") } // copied model must have the same parent. MLTestingUtils.checkCopy(model) } test("MaxAbsScaler read/write") { val t = new MaxAbsScaler() .setInputCol("myInputCol") .setOutputCol("myOutputCol") testDefaultReadWrite(t) } test("MaxAbsScalerModel read/write") { val instance = new MaxAbsScalerModel( "myMaxAbsScalerModel", Vectors.dense(1.0, 10.0)) .setInputCol("myInputCol") .setOutputCol("myOutputCol") val newInstance = testDefaultReadWrite(instance) assert(newInstance.maxAbs === instance.maxAbs) } }
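Outside the test suite, the same estimator is typically used as sketched below, assuming a placeholder DataFrame df with a "features" vector column: fitting learns the per-feature maximum absolute value, and transform rescales into [-1, 1].

import org.apache.spark.ml.feature.MaxAbsScaler

val scaler = new MaxAbsScaler()
  .setInputCol("features")
  .setOutputCol("scaledFeatures")
val scalerModel = scaler.fit(df)
scalerModel.transform(df).select("features", "scaledFeatures").show()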
Example 188
Source File: ChiSqSelectorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Dataset, Row} class ChiSqSelectorSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { @transient var dataset: Dataset[_] = _ override def beforeAll(): Unit = { super.beforeAll() // Toy dataset, including the top feature for a chi-squared test. // These data are chosen such that each feature's test has a distinct p-value. val allParamSettings: Map[String, Any] = Map( "selectorType" -> "percentile", "numTopFeatures" -> 1, "percentile" -> 0.12, "outputCol" -> "myOutput" ) }
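The excerpt above is truncated; as a hedged complement, a minimal ChiSqSelector usage sketch (assuming a placeholder labeled DataFrame df with "features" and "label" columns) looks like this:

import org.apache.spark.ml.feature.ChiSqSelector

val selector = new ChiSqSelector()
  .setNumTopFeatures(1)           // keep the single most predictive feature by the chi-squared test
  .setFeaturesCol("features")
  .setLabelCol("label")
  .setOutputCol("selectedFeatures")
val selected = selector.fit(df).transform(df)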
Example 189
Source File: DCTSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import scala.beans.BeanInfo import edu.emory.mathcs.jtransforms.dct.DoubleDCT_1D import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.Row @BeanInfo case class DCTTestData(vec: Vector, wantedVec: Vector) class DCTSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("forward transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = false testDCT(data, inverse) } test("inverse transform of discrete cosine matches jTransforms result") { val data = Vectors.dense((0 until 128).map(_ => 2D * math.random - 1D).toArray) val inverse = true testDCT(data, inverse) } test("read/write") { val t = new DCT() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setInverse(true) testDefaultReadWrite(t) } private def testDCT(data: Vector, inverse: Boolean): Unit = { val expectedResultBuffer = data.toArray.clone() if (inverse) { new DoubleDCT_1D(data.size).inverse(expectedResultBuffer, true) } else { new DoubleDCT_1D(data.size).forward(expectedResultBuffer, true) } val expectedResult = Vectors.dense(expectedResultBuffer) val dataset = Seq(DCTTestData(data, expectedResult)).toDF() val transformer = new DCT() .setInputCol("vec") .setOutputCol("resultVec") .setInverse(inverse) transformer.transform(dataset) .select("resultVec", "wantedVec") .collect() .foreach { case Row(resultVec: Vector, wantedVec: Vector) => assert(Vectors.sqdist(resultVec, wantedVec) < 1e-6) } } }
Example 190
Source File: BinarizerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{DataFrame, Row} class BinarizerSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Double] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array(0.1, -0.5, 0.2, -0.3, 0.8, 0.7, -0.1, -0.4) } test("params") { ParamsSuite.checkParams(new Binarizer) } test("Binarize continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(defaultBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize continuous features with setter") { val threshold: Double = 0.2 val thresholdBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = data.zip(thresholdBinarized).toSeq.toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Double, y: Double) => assert(x === y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with default parameter") { val defaultBinarized: Array[Double] = data.map(x => if (x > 0.0) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("Binarize vector of continuous features with setter") { val threshold: Double = 0.2 val defaultBinarized: Array[Double] = data.map(x => if (x > threshold) 1.0 else 0.0) val dataFrame: DataFrame = Seq( (Vectors.dense(data), Vectors.dense(defaultBinarized)) ).toDF("feature", "expected") val binarizer: Binarizer = new Binarizer() .setInputCol("feature") .setOutputCol("binarized_feature") .setThreshold(threshold) binarizer.transform(dataFrame).select("binarized_feature", "expected").collect().foreach { case Row(x: Vector, y: Vector) => assert(x == y, "The feature value is not correct after binarization.") } } test("read/write") { val t = new Binarizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setThreshold(0.1) testDefaultReadWrite(t) } }
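A minimal usage sketch mirroring the suite above, with a placeholder DataFrame df that has a numeric (or vector) column "feature":

import org.apache.spark.ml.feature.Binarizer

val binarizer = new Binarizer()
  .setInputCol("feature")
  .setOutputCol("binarized_feature")
  .setThreshold(0.5)              // values > 0.5 become 1.0, the rest 0.0
val binarized = binarizer.transform(df)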
Example 191
Source File: HashingTFSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.attribute.AttributeGroup import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.param.ParamsSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.mllib.feature.{HashingTF => MLlibHashingTF} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.util.Utils class HashingTFSuite extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest { import testImplicits._ test("params") { ParamsSuite.checkParams(new HashingTF) } test("hashingTF") { val df = Seq((0, "a a b b c d".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) val output = hashingTF.transform(df) val attrGroup = AttributeGroup.fromStructField(output.schema("features")) require(attrGroup.numAttributes === Some(n)) val features = output.select("features").first().getAs[Vector](0) // Assume perfect hash on "a", "b", "c", and "d". def idx: Any => Int = murmur3FeatureIdx(n) val expected = Vectors.sparse(n, Seq((idx("a"), 2.0), (idx("b"), 2.0), (idx("c"), 1.0), (idx("d"), 1.0))) assert(features ~== expected absTol 1e-14) } test("applying binary term freqs") { val df = Seq((0, "a a b c c c".split(" ").toSeq)).toDF("id", "words") val n = 100 val hashingTF = new HashingTF() .setInputCol("words") .setOutputCol("features") .setNumFeatures(n) .setBinary(true) val output = hashingTF.transform(df) val features = output.select("features").first().getAs[Vector](0) def idx: Any => Int = murmur3FeatureIdx(n) // Assume perfect hash on input features val expected = Vectors.sparse(n, Seq((idx("a"), 1.0), (idx("b"), 1.0), (idx("c"), 1.0))) assert(features ~== expected absTol 1e-14) } test("read/write") { val t = new HashingTF() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setNumFeatures(10) testDefaultReadWrite(t) } private def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = { Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures) } }
Example 192
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
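The options exercised by the suite can be combined when reading LIBSVM data directly; a minimal sketch, assuming an active SparkSession spark and a placeholder path:

val df = spark.read.format("libsvm")
  .option("numFeatures", "6")          // fix the dimensionality instead of inferring it
  .option("vectorType", "dense")       // default is "sparse"
  .load("/path/to/data.libsvm")        // placeholder path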
Example 193
Source File: ProbabilisticClassifierSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.classification import org.apache.spark.SparkFunSuite import org.apache.spark.ml.linalg.{Vector, Vectors} final class TestProbabilisticClassificationModel( override val uid: String, override val numFeatures: Int, override val numClasses: Int) extends ProbabilisticClassificationModel[Vector, TestProbabilisticClassificationModel] { override def copy(extra: org.apache.spark.ml.param.ParamMap): this.type = defaultCopy(extra) override protected def predictRaw(input: Vector): Vector = { input } override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { rawPrediction } def friendlyPredict(values: Double*): Double = { predict(Vectors.dense(values.toArray)) } } class ProbabilisticClassifierSuite extends SparkFunSuite { test("test thresholding") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.5, 0.2)) assert(testModel.friendlyPredict(1.0, 1.0) === 1.0) assert(testModel.friendlyPredict(1.0, 0.2) === 0.0) } test("test thresholding not required") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) assert(testModel.friendlyPredict(1.0, 2.0) === 1.0) } test("test tiebreak") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.4, 0.4)) assert(testModel.friendlyPredict(0.6, 0.6) === 0.0) } test("test one zero threshold") { val testModel = new TestProbabilisticClassificationModel("myuid", 2, 2) .setThresholds(Array(0.0, 0.1)) assert(testModel.friendlyPredict(1.0, 10.0) === 0.0) assert(testModel.friendlyPredict(0.0, 10.0) === 1.0) } test("bad thresholds") { intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(0.0, 0.0)) } intercept[IllegalArgumentException] { new TestProbabilisticClassificationModel("myuid", 2, 2).setThresholds(Array(-0.1, 0.1)) } } } object ProbabilisticClassifierSuite { val allParamSettings: Map[String, Any] = ClassifierSuite.allParamSettings ++ Map( "probabilityCol" -> "myProbability", "thresholds" -> Array(0.4, 0.6) ) }
Example 194
Source File: Word2VecSuite.scala From spark-word2vec with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.embedding import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.ml.linalg.Vector object Word2VecSuite { def main(args: Array[String]) { val spark = SparkSession .builder .appName("Word2Vec example") .master("local[*]") .getOrCreate() // $example on$ // Input data: Each row is a bag of words from a sentence or document. val documentDF = spark.createDataFrame(Seq( "Hi I heard about Spark".split(" "), "I wish Java could use case classes".split(" "), "Logistic regression models are neat".split(" ") ).map(Tuple1.apply)).toDF("text") // Learn a mapping from words to Vectors. val word2Vec = new Word2Vec() .setInputCol("text") .setOutputCol("result") .setVectorSize(3) .setMinCount(0) val model = word2Vec.fit(documentDF) val result = model.transform(documentDF) result.collect().foreach { case Row(text: Seq[_], features: Vector) => println(s"Text: [${text.mkString(", ")}] => \nVector: $features\n") } // $example off$ spark.stop() } }
Example 195
Source File: Tokenizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object Tokenizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ DataGenerator.generateDoc( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, docLength, inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.feature.Tokenizer() .setInputCol(inputCol) } }
Example 196
Source File: Bucketizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import scala.util.Random import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object Bucketizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ val rng = ctx.newGenerator() // For a bucketizer, training data consists of a single column of random doubles DataGenerator.generateContinuousFeatures(ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures = 1).rdd.map { case Row(vec: Vector) => vec(0) // extract the single generated double value for each row }.toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ val rng = ctx.newGenerator() // Generate an array of (finite) splitting points in [-1, 1) for the Bucketizer val splitPoints = 0.until(bucketizerNumBuckets - 1).map { _ => 2 * rng.nextDouble() - 1 }.sorted.toArray // Final array of splits contains +/- infinity val splits = Array(Double.NegativeInfinity) ++ splitPoints ++ Array(Double.PositiveInfinity) new ml.feature.Bucketizer() .setSplits(splits) .setInputCol(inputCol) } }
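Outside the benchmark harness, the generated splits array is used the same way as in a plain Bucketizer; a minimal sketch with a placeholder DataFrame df holding a numeric column "value":

import org.apache.spark.ml.feature.Bucketizer

val splits = Array(Double.NegativeInfinity, -0.5, 0.0, 0.5, Double.PositiveInfinity)
val bucketizer = new Bucketizer()
  .setSplits(splits)                   // n+1 split points define n buckets
  .setInputCol("value")
  .setOutputCol("bucket")
val bucketed = bucketizer.transform(df)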
Example 197
Source File: StringIndexer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object StringIndexer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ DataGenerator.generateRandString(ctx.sqlContext, numExamples, ctx.seed(), numPartitions, vocabSize, inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ import ctx.sqlContext.implicits._ new ml.feature.StringIndexer() .setInputCol(inputCol) .setHandleInvalid("skip") } }
Example 198
Source File: OneHotEncoder.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object OneHotEncoder extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ DataGenerator.generateMixedFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, Array.fill(1)(featureArity.get) ).rdd.map { case Row(vec: Vector) => vec(0) // extract the single generated double value for each row }.toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { new ml.feature.OneHotEncoder() .setInputCol(inputCol) } }
Example 199
Source File: VectorAssembler.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.{Vector, Vectors} import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import org.apache.spark.sql.functions._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object VectorAssembler extends BenchmarkAlgorithm with TestFromTraining { private def getInputCols(numInputCols: Int): Array[String] = { Array.tabulate(numInputCols)(i => s"c${i}") } override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ require(numInputCols.get <= numFeatures.get, s"numInputCols (${numInputCols}) cannot be greater than numFeatures (${numFeatures}).") val df = DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, numFeatures) val slice = udf { (v: Vector, numSlices: Int) => val data = v.toArray val n = data.length.toLong (0 until numSlices).map { i => val start = ((i * n) / numSlices).toInt val end = ((i + 1) * n / numSlices).toInt Vectors.dense(data.slice(start, end)) } } val inputCols = getInputCols(numInputCols.get) df.select(slice(col("features"), lit(numInputCols.get)).as("slices")) .select((0 until numInputCols.get).map(i => col("slices")(i).as(inputCols(i))): _*) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ val inputCols = getInputCols(numInputCols.get) new ml.feature.VectorAssembler() .setInputCols(inputCols) } }
Example 200
Source File: QuantileDiscretizer.scala From spark-sql-perf with Apache License 2.0 | 5 votes |
package com.databricks.spark.sql.perf.mllib.feature import org.apache.spark.ml import org.apache.spark.ml.linalg.Vector import org.apache.spark.ml.PipelineStage import org.apache.spark.sql._ import com.databricks.spark.sql.perf.mllib.OptionImplicits._ import com.databricks.spark.sql.perf.mllib.data.DataGenerator import com.databricks.spark.sql.perf.mllib.{BenchmarkAlgorithm, MLBenchContext, TestFromTraining} object QuantileDiscretizer extends BenchmarkAlgorithm with TestFromTraining with UnaryTransformer { override def trainingDataSet(ctx: MLBenchContext): DataFrame = { import ctx.params._ import ctx.sqlContext.implicits._ DataGenerator.generateContinuousFeatures( ctx.sqlContext, numExamples, ctx.seed(), numPartitions, 1 ).rdd.map { case Row(vec: Vector) => vec(0) // extract the single generated double value for each row }.toDF(inputCol) } override def getPipelineStage(ctx: MLBenchContext): PipelineStage = { import ctx.params._ new ml.feature.QuantileDiscretizer() .setInputCol(inputCol) .setOutputCol(outputCol) .setNumBuckets(bucketizerNumBuckets) .setRelativeError(relativeError) } }