org.apache.spark.ml.feature.LabeledPoint Scala Examples
The following examples show how to use org.apache.spark.ml.feature.LabeledPoint.
Each example is drawn from an open-source project; the source file name and license are noted above each listing.
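Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name, session settings, and sample values are illustrative). It shows what LabeledPoint is: a case class pairing a Double label with an ml.linalg.Vector of features, which Spark converts to a DataFrame with label and features columns — the input shape expected by the estimators in the examples that follow.

import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession

object LabeledPointQuickStart extends App {
  val spark = SparkSession.builder()
    .appName("labeled-point-demo")
    .master("local[2]")
    .getOrCreate()
  import spark.implicits._

  // Build a few labeled examples: label first, then the feature vector.
  val points = Seq(
    LabeledPoint(1.0, Vectors.dense(0.5, 1.2)),
    LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(3.4))))

  // The case class maps to a DataFrame with "label" and "features" columns.
  val df = points.toDF()
  df.show()

  spark.stop()
}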
Example 1
Source File: PerformanceBenchmark.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.regression.benchmark

import breeze.linalg.{sum, DenseMatrix => BDM, DenseVector => BDV, _}
import breeze.numerics.sin
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression
import org.apache.spark.sql.SparkSession

import scala.util.Random

object PerformanceBenchmark extends App {
  val spark = SparkSession.builder()
    .appName("bench")
    .master(s"local[${args(0)}]").getOrCreate()

  import spark.sqlContext.implicits._

  val sampleSize = args(2).toInt
  val nFeatures = 3
  val parallelism = args(0).toInt * 4
  val expertSampleSize = args(1).toInt

  val instancesRDD = spark.sparkContext.parallelize(0 until parallelism).flatMap(index => {
    val random = new Random(13 * index)
    val X = BDM.create(sampleSize / parallelism, nFeatures,
      Array.fill(sampleSize * nFeatures / parallelism)(random.nextDouble()))
    val Y = sin(sum(X(*, ::)) / 1000d).toArray

    (0 until X.rows).map { i =>
      val x = X(i, ::)
      val y = Y(i)
      LabeledPoint(y, Vectors.dense(x.t.toArray))
    }
  })

  val instances = instancesRDD.toDF.cache()
  instances.count()

  val gp = new GaussianProcessRegression()
    .setKernel(() => new RBFKernel(0.1))
    .setDatasetSizeForExpert(expertSampleSize)
    .setActiveSetSize(expertSampleSize)
    .setSeed(13)
    .setSigma2(1e-3)

  time(gp.fit(instances))

  def time[T](f: => T): T = {
    val start = System.currentTimeMillis()
    val result = f
    println("TIME: " + (System.currentTimeMillis() - start))
    result
  }
}
Example 2
Source File: SparkMLTestUtils.scala From aardpfark with Apache License 2.0

package com.ibm.aardpfark.spark.ml

import scala.util.Random

import breeze.linalg.DenseVector
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.random.{GammaGenerator, PoissonGenerator, StandardNormalGenerator}

object SparkMLTestUtils {

  def generateGeneralizedLinearRegressionInput(
      intercept: Double,
      coefficients: Array[Double],
      xMean: Array[Double],
      xVariance: Array[Double],
      nPoints: Int,
      seed: Int,
      noiseLevel: Double,
      family: String,
      link: String): Seq[LabeledPoint] = {

    val rnd = new Random(seed)
    def rndElement(i: Int) = {
      (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i)
    }
    val (generator, mean) = family match {
      case "gaussian" => (new StandardNormalGenerator, 0.0)
      case "poisson" => (new PoissonGenerator(1.0), 1.0)
      case "gamma" => (new GammaGenerator(1.0, 1.0), 1.0)
    }
    generator.setSeed(seed)

    (0 until nPoints).map { _ =>
      val x = DenseVector(coefficients.indices.map(rndElement).toArray)
      val w = DenseVector(coefficients)
      val eta = w.dot(x) + intercept
      val mu = link match {
        case "identity" => eta
        case "log" => math.exp(eta)
        case "sqrt" => math.pow(eta, 2.0)
        case "inverse" => 1.0 / eta
      }
      val label = mu + noiseLevel * (generator.nextValue() - mean)
      // Return LabeledPoints with DenseVector
      LabeledPoint(label, Vectors.dense(x.data))
    }
  }
}
Example 3
Source File: VectorUDTSuite.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.sql.catalyst.JavaTypeInference
import org.apache.spark.sql.types._

class VectorUDTSuite extends SparkFunSuite {

  test("preloaded VectorUDT") {
    val dv1 = Vectors.dense(Array.empty[Double])
    val dv2 = Vectors.dense(1.0, 2.0)
    val sv1 = Vectors.sparse(2, Array.empty, Array.empty)
    val sv2 = Vectors.sparse(2, Array(1), Array(2.0))

    for (v <- Seq(dv1, dv2, sv1, sv2)) {
      val udt = UDTRegistration.getUDTFor(v.getClass.getName).get.newInstance()
        .asInstanceOf[VectorUDT]
      assert(v === udt.deserialize(udt.serialize(v)))
      assert(udt.typeName == "vector")
      assert(udt.simpleString == "vector")
    }
  }

  test("JavaTypeInference with VectorUDT") {
    val (dataType, _) = JavaTypeInference.inferDataType(classOf[LabeledPoint])
    assert(dataType.asInstanceOf[StructType].fields.map(_.dataType)
      === Seq(new VectorUDT, DoubleType))
  }
}
Example 4
Source File: ChiSquareTestSuite.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.stat

import java.util.Random

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.mllib.stat.test.ChiSqTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

class ChiSquareTestSuite
  extends SparkFunSuite with MLlibTestSparkContext with DefaultReadWriteTest {

  import testImplicits._

  test("test DataFrame of labeled points") {
    // labels: 1.0 (2 / 6), 0.0 (4 / 6)
    // feature1: 0.5 (1 / 6), 1.5 (2 / 6), 3.5 (3 / 6)
    // feature2: 10.0 (1 / 6), 20.0 (1 / 6), 30.0 (2 / 6), 40.0 (2 / 6)
    val data = Seq(
      LabeledPoint(0.0, Vectors.dense(0.5, 10.0)),
      LabeledPoint(0.0, Vectors.dense(1.5, 20.0)),
      LabeledPoint(1.0, Vectors.dense(1.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 30.0)),
      LabeledPoint(0.0, Vectors.dense(3.5, 40.0)),
      LabeledPoint(1.0, Vectors.dense(3.5, 40.0)))
    for (numParts <- List(2, 4, 6, 8)) {
      val df = spark.createDataFrame(sc.parallelize(data, numParts))
      val chi = ChiSquareTest.test(df, "features", "label")
      val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
        chi.select("pValues", "degreesOfFreedom", "statistics")
          .as[(Vector, Array[Int], Vector)].head()
      assert(pValues ~== Vectors.dense(0.6873, 0.6823) relTol 1e-4)
      assert(degreesOfFreedom === Array(2, 3))
      assert(statistics ~== Vectors.dense(0.75, 1.5) relTol 1e-4)
    }
  }

  test("large number of features (SPARK-3087)") {
    // Test that the right number of results is returned
    val numCols = 1001
    val sparseData = Array(
      LabeledPoint(0.0, Vectors.sparse(numCols, Seq((100, 2.0)))),
      LabeledPoint(0.1, Vectors.sparse(numCols, Seq((200, 1.0)))))
    val df = spark.createDataFrame(sparseData)
    val chi = ChiSquareTest.test(df, "features", "label")
    val (pValues: Vector, degreesOfFreedom: Array[Int], statistics: Vector) =
      chi.select("pValues", "degreesOfFreedom", "statistics")
        .as[(Vector, Array[Int], Vector)].head()
    assert(pValues.size === numCols)
    assert(degreesOfFreedom.length === numCols)
    assert(statistics.size === numCols)
    assert(pValues(1000) !== null)  // SPARK-3087
  }

  test("fail on continuous features or labels") {
    val tooManyCategories: Int = 100000
    assert(tooManyCategories > ChiSqTest.maxCategories, "This unit test requires that " +
      "tooManyCategories be large enough to cause ChiSqTest to throw an exception.")

    val random = new Random(11L)
    val continuousLabel = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextDouble(), Vectors.dense(random.nextInt(2))))
    withClue("ChiSquare should throw an exception when given a continuous-valued label") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousLabel)
        ChiSquareTest.test(df, "features", "label")
      }
    }
    val continuousFeature = Seq.fill(tooManyCategories)(
      LabeledPoint(random.nextInt(2), Vectors.dense(random.nextDouble())))
    withClue("ChiSquare should throw an exception when given continuous-valued features") {
      intercept[SparkException] {
        val df = spark.createDataFrame(continuousFeature)
        ChiSquareTest.test(df, "features", "label")
      }
    }
  }
}
Example 5
Source File: GradientBoostedTreesSuite.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L, "all")
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L, "all")
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
Example 6
Source File: VectorUDTSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.sql.catalyst.JavaTypeInference
import org.apache.spark.sql.types._

class VectorUDTSuite extends SparkFunSuite {

  test("preloaded VectorUDT") {
    val dv1 = Vectors.dense(Array.empty[Double])
    val dv2 = Vectors.dense(1.0, 2.0)
    val sv1 = Vectors.sparse(2, Array.empty, Array.empty)
    val sv2 = Vectors.sparse(2, Array(1), Array(2.0))

    for (v <- Seq(dv1, dv2, sv1, sv2)) {
      val udt = UDTRegistration.getUDTFor(v.getClass.getName).get.newInstance()
        .asInstanceOf[VectorUDT]
      assert(v === udt.deserialize(udt.serialize(v)))
      assert(udt.typeName == "vector")
      assert(udt.simpleString == "vector")
    }
  }

  test("JavaTypeInference with VectorUDT") {
    val (dataType, _) = JavaTypeInference.inferDataType(classOf[LabeledPoint])
    assert(dataType.asInstanceOf[StructType].fields.map(_.dataType)
      === Seq(new VectorUDT, DoubleType))
  }
}
Example 7
Source File: GradientBoostedTreesSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
Example 8
Source File: UberXGBoostModel.scala From uberdata with Apache License 2.0

package com.cloudera.sparkts.models

import ml.dmlc.xgboost4j.java.Rabit
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.spark.{XGBoost, XGBoostModel}
import org.apache.spark.TaskContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

object UberXGBoostModel {
  def train(trainLabel: RDD[LabeledPoint],
            configMap: Map[String, Any],
            round: Int,
            nWorkers: Int): XGBoostModel = {
    val trainData = trainLabel.cache
    XGBoost.trainWithRDD(trainData, configMap, round, nWorkers, useExternalMemory = true,
      missing = Float.NaN)
  }

  def labelPredict(testSet: RDD[XGBLabeledPoint],
                   useExternalCache: Boolean,
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    testSet.mapPartitions { testData =>
      val (toPredict, toLabel) = testData.duplicate
      val dMatrix = new DMatrix(toPredict)
      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
      toLabel.map(_.label).zip(prediction)
    }
  }

  def labelPredict(testSet: RDD[DenseVector],
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    val rdd = testSet.cache
    broadcastBooster.value.predict(testSet, missingValue = Float.NaN)
      .map(value => (value(0), value(1)))
    // testSet.mapPartitions { testData =>
    //   val (toPredict, toLabel) = testData.duplicate
    //   val dMatrix = new DMatrix(toPredict)
    //
    //   val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
    //   toLabel.map(_.label).zip(prediction)
    // }
  }
}
Example 9
Source File: VectorUDTSuite.scala From sparkoscope with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.sql.catalyst.JavaTypeInference
import org.apache.spark.sql.types._

class VectorUDTSuite extends SparkFunSuite {

  test("preloaded VectorUDT") {
    val dv1 = Vectors.dense(Array.empty[Double])
    val dv2 = Vectors.dense(1.0, 2.0)
    val sv1 = Vectors.sparse(2, Array.empty, Array.empty)
    val sv2 = Vectors.sparse(2, Array(1), Array(2.0))

    for (v <- Seq(dv1, dv2, sv1, sv2)) {
      val udt = UDTRegistration.getUDTFor(v.getClass.getName).get.newInstance()
        .asInstanceOf[VectorUDT]
      assert(v === udt.deserialize(udt.serialize(v)))
      assert(udt.typeName == "vector")
      assert(udt.simpleString == "vector")
    }
  }

  test("JavaTypeInference with VectorUDT") {
    val (dataType, _) = JavaTypeInference.inferDataType(classOf[LabeledPoint])
    assert(dataType.asInstanceOf[StructType].fields.map(_.dataType)
      === Seq(new VectorUDT, DoubleType))
  }
}
Example 10
Source File: GradientBoostedTreesSuite.scala From sparkoscope with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}
Example 11
Source File: GaussianProcessRegression.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.regression

import breeze.linalg.{DenseVector => BDV, _}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.commons._
import org.apache.spark.ml.commons.kernel.Kernel
import org.apache.spark.ml.commons.util._
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.util.{Identifiable, Instrumentation}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset

class GaussianProcessRegression(override val uid: String)
  extends Regressor[Vector, GaussianProcessRegression, GaussianProcessRegressionModel]
    with GaussianProcessParams
    with GaussianProcessCommons[Vector, GaussianProcessRegression, GaussianProcessRegressionModel]
    with Logging {

  def this() = this(Identifiable.randomUID("gaussProcessReg"))

  override protected def train(dataset: Dataset[_]): GaussianProcessRegressionModel = {
    val instr = Instrumentation.create(this, dataset)

    val points: RDD[LabeledPoint] = getPoints(dataset).cache()

    val expertLabelsAndKernels: RDD[(BDV[Double], Kernel)] =
      getExpertLabelsAndKernels(points).cache()

    val optimalHyperparameters = optimizeHypers(instr, expertLabelsAndKernels, likelihoodAndGradient)

    expertLabelsAndKernels.foreach(_._2.setHyperparameters(optimalHyperparameters))

    produceModel(instr, points, expertLabelsAndKernels, optimalHyperparameters)
  }

  private def likelihoodAndGradient(yAndK: (BDV[Double], Kernel), x: BDV[Double]) = {
    val (y: BDV[Double], kernel: Kernel) = yAndK
    kernel.setHyperparameters(x)
    val (k, derivative) = kernel.trainingKernelAndDerivative()
    val (_, logdet, kinv) = logDetAndInv(k)
    val alpha = kinv * y

    val likelihood = 0.5 * (y.t * alpha) + 0.5 * logdet

    val alphaAlphaTMinusKinv = alpha * alpha.t
    alphaAlphaTMinusKinv -= kinv

    val gradient = derivative.map(derivative => -0.5 * sum(derivative *= alphaAlphaTMinusKinv))
    (likelihood, BDV(gradient: _*))
  }

  override def copy(extra: ParamMap): GaussianProcessRegression = defaultCopy(extra)

  override protected def createModel(uid: String,
      rawPredictor: GaussianProjectedProcessRawPredictor): GaussianProcessRegressionModel =
    new GaussianProcessRegressionModel(uid, rawPredictor)
}

class GaussianProcessRegressionModel private[regression](override val uid: String,
    private val gaussianProjectedProcessRawPredictor: GaussianProjectedProcessRawPredictor)
  extends RegressionModel[Vector, GaussianProcessRegressionModel] {

  override protected def predict(features: Vector): Double = {
    gaussianProjectedProcessRawPredictor.predict(features)._1
  }

  override def copy(extra: ParamMap): GaussianProcessRegressionModel = {
    val newModel = copyValues(new GaussianProcessRegressionModel(uid,
      gaussianProjectedProcessRawPredictor), extra)
    newModel.setParent(parent)
  }
}
Example 12
Source File: Airfoil.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.regression.examples

import org.apache.spark.ml.commons.kernel.{ARDRBFKernel, _}
import org.apache.spark.ml.commons.util.Scaling
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression

object Airfoil extends App with GPExample with Scaling {

  import spark.sqlContext.implicits._

  override def name = "Airfoil"

  val airfoil = readCSV("data/airfoil.csv")

  val scaled = scale(airfoil).toDF

  val gp = new GaussianProcessRegression()
    .setActiveSetSize(1000)
    .setSigma2(1e-4)
    .setKernel(() => 1 * new ARDRBFKernel(5) + 1.const * new EyeKernel)

  cv(gp, scaled, 2.1)

  def readCSV(path: String) = {
    spark.read.format("csv").load(path).rdd.map(row => {
      val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3", "_c4")
        .map(col => row.getAs[String](col).toDouble))

      LabeledPoint(row.getAs[String]("_c5").toDouble, features)
    })
  }
}
Example 13
Source File: Synthetics.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.regression.examples

import breeze.linalg._
import breeze.numerics._
import org.apache.spark.ml.commons.KMeansActiveSetProvider
import org.apache.spark.ml.commons.kernel.{RBFKernel, WhiteNoiseKernel, _}
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.GaussianProcessRegression

object Synthetics extends App with GPExample {

  import spark.sqlContext.implicits._

  override def name = "Synthetics"

  val noiseVar = 0.01
  val g = breeze.stats.distributions.Gaussian(0, math.sqrt(noiseVar))

  val X = linspace(0, 1, length = 2000).toDenseMatrix
  val Y = sin(X).toArray.map(y => y + g.sample())

  val instances = spark.sparkContext.parallelize(X.toArray.zip(Y).map {
    case (v, y) => LabeledPoint(y, Vectors.dense(Array(v)))
  }).toDF

  val gp = new GaussianProcessRegression()
    .setKernel(() => 1 * new RBFKernel(0.1, 1e-6, 10) + WhiteNoiseKernel(0.5, 0, 1))
    .setDatasetSizeForExpert(100)
    .setActiveSetProvider(new KMeansActiveSetProvider())
    .setActiveSetSize(100)
    .setSeed(13)
    .setSigma2(1e-3)

  cv(gp, instances, 0.11)
}
Example 14
Source File: TreePoint.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.tree.{ContinuousSplit, Split}
import org.apache.spark.rdd.RDD

  private def findBin(
      featureIndex: Int,
      labeledPoint: LabeledPoint,
      featureArity: Int,
      thresholds: Array[Double]): Int = {
    val featureValue = labeledPoint.features(featureIndex)

    if (featureArity == 0) {
      val idx = java.util.Arrays.binarySearch(thresholds, featureValue)
      if (idx >= 0) {
        idx
      } else {
        -idx - 1
      }
    } else {
      // Categorical feature bins are indexed by feature values.
      if (featureValue < 0 || featureValue >= featureArity) {
        throw new IllegalArgumentException(
          s"DecisionTree given invalid data:" +
          s" Feature $featureIndex is categorical with values in {0,...,${featureArity - 1}," +
          s" but a data point gives it value $featureValue.\n" +
          "  Bad data point: " + labeledPoint.toString)
      }
      featureValue.toInt
    }
  }
}
Example 15
Source File: Iris.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.{GaussianProcessClassifier, OneVsRest}
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
import org.apache.spark.sql.SparkSession

object Iris extends App {
  val name = "Iris"
  val spark = SparkSession.builder().appName(name).master("local[4]").getOrCreate()

  import spark.sqlContext.implicits._

  val name2indx = Map("Iris-versicolor" -> 0, "Iris-setosa" -> 1, "Iris-virginica" -> 2)

  val dataset = spark.read.format("csv").load("data/iris.csv").rdd.map(row => {
    val features = Vectors.dense(Array("_c0", "_c1", "_c2", "_c3")
      .map(col => row.getAs[String](col).toDouble))

    val label = name2indx(row.getAs[String]("_c4"))

    LabeledPoint(label, features)
  }).toDF

  val gp = new GaussianProcessClassifier().setDatasetSizeForExpert(20).setActiveSetSize(30)
  val ovr = new OneVsRest().setClassifier(gp)

  val cv = new CrossValidator()
    .setEstimator(ovr)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setNumFolds(10)

  println("Accuracy: " + cv.fit(dataset).avgMetrics.toList)
}
Example 16
Source File: MNIST.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.classification.examples

import org.apache.spark.ml.classification.GaussianProcessClassifier
import org.apache.spark.ml.commons.kernel.RBFKernel
import org.apache.spark.ml.commons.util.Scaling
import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object MNIST extends App with Scaling {
  val name = "MNIST"
  val spark = SparkSession.builder().appName(name).master(s"local[${args(0)}]").getOrCreate()
  val path = args(1)
  val parallelism = args(0).toInt * 4
  val forExpert = args(2).toInt
  val activeSet = args(3).toInt

  import spark.sqlContext.implicits._

  val dataset = (scale _ andThen labels201 _) (spark.read.format("csv").load(path).rdd.map(row => {
    val features = Vectors.dense((1 until row.length).map("_c" + _)
      .map(row.getAs[String]).map(_.toDouble).toArray)
    val label = row.getAs[String]("_c0").toDouble
    LabeledPoint(label, features)
  }).cache()).toDF.repartition(parallelism).cache()

  val gp = new GaussianProcessClassifier()
    .setDatasetSizeForExpert(forExpert)
    .setActiveSetSize(activeSet)
    .setKernel(() => new RBFKernel(10))
    .setTol(1e-3)

  val cv = new TrainValidationSplit()
    .setEstimator(gp)
    .setEvaluator(new MulticlassClassificationEvaluator().setMetricName("accuracy"))
    .setEstimatorParamMaps(new ParamGridBuilder().build())
    .setTrainRatio(0.8)

  println("Accuracy: " + cv.fit(dataset).validationMetrics.toList)

  def labels201(data: RDD[LabeledPoint]): RDD[LabeledPoint] = {
    val old2new = data.map(_.label).distinct().collect().zipWithIndex.toMap
    data.map(lp => LabeledPoint(old2new(lp.label), lp.features))
  }
}
Example 17
Source File: Scaling.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.commons.util

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.rdd.RDD

private[ml] trait Scaling {
  def scale(data: RDD[LabeledPoint]) = {
    val x = data.map(x => DenseVector(x.features.toArray)).cache()
    val y = data.map(_.label)
    val n = x.count().toDouble
    val mean = x.reduce(_ + _) / n
    val centered = x.map(_ - mean).cache()
    val variance = centered.map(xx => xx *:* xx).reduce(_ + _) / n
    x.unpersist()
    val varianceNoZeroes = variance.map(v => if (v > 0d) v else 1d)
    val scaled = centered.map(_ /:/ sqrt(varianceNoZeroes))
      .map(_.toArray)
      .map(Vectors.dense)
      .zip(y)
      .map { case (f, y) => LabeledPoint(y, f) }
      .cache()
    if (scaled.count() > 0) // ensure scaled is materialized
      centered.unpersist()
    scaled
  }
}
Example 18
Source File: GaussianProcessCommons.scala From spark-gp with Apache License 2.0

package org.apache.spark.ml.commons

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.optimize.LBFGSB
import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _}
import org.apache.spark.ml.commons.util.DiffFunctionMemoized
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.Instrumentation
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]]
  extends ProjectedGaussianProcessHelper { this: Predictor[F, E, M] with GaussianProcessParams =>

  protected val getKernel: () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel

  protected def getPoints(dataset: Dataset[_]) = {
    dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
  }

  protected def groupForExperts(points: RDD[LabeledPoint]) = {
    val numberOfExperts = Math.round(points.count().toDouble / $(datasetSizeForExpert))
    points.zipWithIndex.map { case (instance, index) =>
      (index % numberOfExperts, instance)
    }.groupByKey().map(_._2)
  }

  protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = {
    groupForExperts(points).map { chunk =>
      val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip
      (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors))
    }
  }

  protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)],
                                 points: RDD[LabeledPoint],
                                 optimalHyperparameters: BDV[Double]) = {
    val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points,
      getKernel, optimalHyperparameters, $(seed))

    points.unpersist()

    val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet)

    expertLabelsAndKernels.unpersist()

    val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters)
      .setTrainingVectors(activeSet)

    // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y
    val (magicVector, magicMatrix) = getMagicVector(optimalKernel,
      matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters)

    new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel)
  }

  protected def createModel(uid: String,
      rawPredictor: GaussianProjectedProcessRawPredictor): M
}

class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double],
    val magicMatrix: BDM[Double],
    val kernel: Kernel) extends Serializable {

  def predict(features: Vector): (Double, Double) = {
    val cross = kernel.crossKernel(features)
    val selfKernel = kernel.selfKernel(features)
    (cross * magicVector, selfKernel + cross * magicMatrix * cross.t)
  }
}
Example 19
Source File: OptimizedRandomForestRegressorSuite.scala From oraf with Apache License 2.0

package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.tree.impl.{OptimizedRandomForestSuite, OptimizedTreeTests}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest, MLTestingUtils}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

  def compareAPIs(
      data: RDD[Instance],
      rf: RandomForestRegressor,
      orf: OptimizedRandomForestRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val numFeatures = data.first().features.size
    val oldPoints = data.map(i => LabeledPoint(i.label, i.features))

    val newData: DataFrame = OptimizedTreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val oldData: DataFrame = OptimizedTreeTests.setMetadataForLabeledPoints(oldPoints,
      categoricalFeatures, numClasses = 0)

    val oldModel = rf.fit(oldData)
    val optimizedModel = orf.fit(newData)

    // Use parent from newTree since this is not checked anyways.
    OptimizedTreeTests.checkEqualOldRegression(oldModel, optimizedModel)
    assert(oldModel.numFeatures === numFeatures)
  }
}
Example 20
Source File: OptimizedDecisionTreeIntegrationSuite.scala From oraf with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.classification.{DecisionTreeClassifier, OptimizedDecisionTreeClassifier}
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.{DecisionTreeRegressor, OptimizedDecisionTreeRegressor}
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeRegressor(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeRegressor(), testParams)
    val newModel = newTree.fit(train)
    val oldModel = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(oldModel, newModel)
  }

  private def testClassifierEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val oldTree = setParams(new DecisionTreeClassifier(), testParams)
    val newTree = setParams(new OptimizedDecisionTreeClassifier(), testParams)
    val newModel = newTree.fit(train)
    val model = oldTree.fit(train)
    OptimizedTreeTests.checkEqual(model, newModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree with two feature values") {
    val data = sc.parallelize(Range(0, 8).map(x => {
      if (x > 3) {
        Instance(x, 1.0, Vectors.dense(0.0))
      } else {
        Instance(x, 1.0, Vectors.dense(1.0))
      }
    }))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
    testClassifierEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 10).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
        nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 21
Source File: LocalTreeUnitSuite.scala From oraf with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.tree._
import org.apache.spark.ml.util.DefaultReadWriteTest
import org.apache.spark.mllib.util.MLlibTestSparkContext

    def deepTreeTest(depth: Int): Unit = {
      val deepTreeData = OptimizedTreeTests.deepTreeData(sc, depth)
      val df = spark.createDataFrame(deepTreeData)
      // Construct estimators; single-tree random forest & decision tree regressor.
      val localTree = new LocalDecisionTreeRegressor()
        .setFeaturesCol("features") // indexedFeatures
        .setLabelCol("label")
        .setMaxDepth(depth)
        .setMinInfoGain(0.0)

      // Fit model, check depth...
      val localModel = localTree.fit(df)
      assert(localModel.rootNode.subtreeDepth == depth)
    }

    // Test small depth tree
    deepTreeTest(10)
    // Test medium depth tree
    deepTreeTest(40)
    // Test high depth tree
    deepTreeTest(200)
  }
}
Example 22
Source File: LocalTreeIntegrationSuite.scala From oraf with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.Estimator
import org.apache.spark.ml.feature.{Instance, LabeledPoint}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.ml.regression.DecisionTreeRegressor
import org.apache.spark.mllib.tree.DecisionTreeSuite
import org.apache.spark.mllib.util.{LogisticRegressionDataGenerator, MLlibTestSparkContext}
import org.apache.spark.sql.DataFrame

  private def testEquivalence(train: DataFrame, testParams: Map[String, Any]): Unit = {
    val distribTree = setParams(new DecisionTreeRegressor(), testParams)
    val localTree = setParams(new LocalDecisionTreeRegressor(), testParams)
    val localModel = localTree.fit(train)
    val model = distribTree.fit(train)
    OptimizedTreeTests.checkEqual(model, localModel)
  }

  test("Local & distributed training produce the same tree on a toy dataset") {
    val data = sc.parallelize(Range(0, 8).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a slightly larger toy dataset") {
    val data = sc.parallelize(Range(0, 16).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce the same tree on a larger toy dataset") {
    val data = sc.parallelize(Range(0, 64).map(x => Instance(x, 1.0, Vectors.dense(x))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, medDepthTreeSettings)
  }

  test("Local & distributed training produce same tree on a dataset of categorical features") {
    val data = sc.parallelize(OptimizedRandomForestSuite.generateCategoricalInstances())
    // Create a map of categorical feature index to arity; each feature has arity nclasses
    val featuresMap: Map[Int, Int] = Map(0 -> 3, 1 -> 3)
    // Convert the data RDD to a DataFrame with metadata indicating the arity of each of its
    // categorical features
    val df = OptimizedTreeTests.setMetadata(data, featuresMap, numClasses = 2)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }

  test("Local & distributed training produce the same tree on a dataset of continuous features") {
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._
    // Use maxDepth = 5 and default params
    val params = medDepthTreeSettings
    val data = LogisticRegressionDataGenerator.generateLogisticRDD(spark.sparkContext,
        nexamples = 1000, nfeatures = 5, eps = 2.0, nparts = 1, probOne = 0.2)
      .map(lp => Instance(lp.label, 1.0, Vectors.dense(lp.features.toArray)))
      .toDF().cache()
    testEquivalence(data, params)
  }

  test("Local & distributed training produce the same tree on a dataset of constant features") {
    // Generate constant, continuous data
    val data = sc.parallelize(Range(0, 8).map(_ => Instance(1, 1.0, Vectors.dense(1))))
    val df = spark.createDataFrame(data)
    testEquivalence(df, OptimizedTreeTests.allParamSettings)
  }
}
Example 23
Source File: RandomForestRegressorSuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.regression

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.tree.impl.TreeTests
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTestingUtils}
import org.apache.spark.mllib.regression.{LabeledPoint => OldLabeledPoint}
import org.apache.spark.mllib.tree.{EnsembleTestHelper, RandomForest => OldRandomForest}
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

  def compareAPIs(
      data: RDD[LabeledPoint],
      rf: RandomForestRegressor,
      categoricalFeatures: Map[Int, Int]): Unit = {
    val numFeatures = data.first().features.size
    val oldStrategy =
      rf.getOldStrategy(categoricalFeatures, numClasses = 0, OldAlgo.Regression, rf.getOldImpurity)
    val oldModel = OldRandomForest.trainRegressor(data.map(OldLabeledPoint.fromML), oldStrategy,
      rf.getNumTrees, rf.getFeatureSubsetStrategy, rf.getSeed.toInt)
    val newData: DataFrame = TreeTests.setMetadata(data, categoricalFeatures, numClasses = 0)
    val newModel = rf.fit(newData)
    // Use parent from newTree since this is not checked anyways.
    val oldModelAsNew = RandomForestRegressionModel.fromOld(
      oldModel, newModel.parent.asInstanceOf[RandomForestRegressor], categoricalFeatures)
    TreeTests.checkEqual(oldModelAsNew, newModel)
    assert(newModel.numFeatures === numFeatures)
  }
}
Example 24
Source File: VectorUDTSuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.sql.catalyst.JavaTypeInference
import org.apache.spark.sql.types._

class VectorUDTSuite extends SparkFunSuite {

  test("preloaded VectorUDT") {
    val dv1 = Vectors.dense(Array.empty[Double])
    val dv2 = Vectors.dense(1.0, 2.0)
    val sv1 = Vectors.sparse(2, Array.empty, Array.empty)
    val sv2 = Vectors.sparse(2, Array(1), Array(2.0))

    for (v <- Seq(dv1, dv2, sv1, sv2)) {
      val udt = UDTRegistration.getUDTFor(v.getClass.getName).get.newInstance()
        .asInstanceOf[VectorUDT]
      assert(v === udt.deserialize(udt.serialize(v)))
      assert(udt.typeName == "vector")
      assert(udt.simpleString == "vector")
    }
  }

  test("JavaTypeInference with VectorUDT") {
    val (dataType, _) = JavaTypeInference.inferDataType(classOf[LabeledPoint])
    assert(dataType.asInstanceOf[StructType].fields.map(_.dataType)
      === Seq(new VectorUDT, DoubleType))
  }
}
Example 25
Source File: GradientBoostedTreesSuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.mllib.tree.{GradientBoostedTreesSuite => OldGBTSuite}
import org.apache.spark.mllib.tree.configuration.{BoostingStrategy, Strategy}
import org.apache.spark.mllib.tree.configuration.Algo._
import org.apache.spark.mllib.tree.impurity.Variance
import org.apache.spark.mllib.tree.loss.{AbsoluteError, LogLoss, SquaredError}
import org.apache.spark.mllib.util.MLlibTestSparkContext

class GradientBoostedTreesSuite extends SparkFunSuite with MLlibTestSparkContext with Logging {

  import testImplicits._

  test("runWithValidation stops early and performs better on a validation dataset") {
    // Set numIterations large enough so that it stops early.
    val numIterations = 20
    val trainRdd = sc.parallelize(OldGBTSuite.trainData, 2).map(_.asML)
    val validateRdd = sc.parallelize(OldGBTSuite.validateData, 2).map(_.asML)
    val trainDF = trainRdd.toDF()
    val validateDF = validateRdd.toDF()

    val algos = Array(Regression, Regression, Classification)
    val losses = Array(SquaredError, AbsoluteError, LogLoss)
    algos.zip(losses).foreach { case (algo, loss) =>
      val treeStrategy = new Strategy(algo = algo, impurity = Variance, maxDepth = 2,
        categoricalFeaturesInfo = Map.empty)
      val boostingStrategy =
        new BoostingStrategy(treeStrategy, loss, numIterations, validationTol = 0.0)
      val (validateTrees, validateTreeWeights) = GradientBoostedTrees
        .runWithValidation(trainRdd, validateRdd, boostingStrategy, 42L)
      val numTrees = validateTrees.length
      assert(numTrees !== numIterations)

      // Test that it performs better on the validation dataset.
      val (trees, treeWeights) = GradientBoostedTrees.run(trainRdd, boostingStrategy, 42L)
      val (errorWithoutValidation, errorWithValidation) = {
        if (algo == Classification) {
          val remappedRdd = validateRdd.map(x => new LabeledPoint(2 * x.label - 1, x.features))
          (GradientBoostedTrees.computeError(remappedRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(remappedRdd, validateTrees,
              validateTreeWeights, loss))
        } else {
          (GradientBoostedTrees.computeError(validateRdd, trees, treeWeights, loss),
            GradientBoostedTrees.computeError(validateRdd, validateTrees,
              validateTreeWeights, loss))
        }
      }
      assert(errorWithValidation <= errorWithoutValidation)

      // Test that results from evaluateEachIteration comply with runWithValidation.
      // Note that convergenceTol is set to 0.0
      val evaluationArray = GradientBoostedTrees
        .evaluateEachIteration(validateRdd, trees, treeWeights, loss, algo)
      assert(evaluationArray.length === numIterations)
      assert(evaluationArray(numTrees) > evaluationArray(numTrees - 1))
      var i = 1
      while (i < numTrees) {
        assert(evaluationArray(i) <= evaluationArray(i - 1))
        i += 1
      }
    }
  }
}