org.apache.spark.ml.Predictor Scala Examples
The following examples show how to use org.apache.spark.ml.Predictor.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source file: GaussianProcessCommons.scala, from the spark-gp project (Apache License 2.0) | 5 votes
package org.apache.spark.ml.commons

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import breeze.optimize.LBFGSB
import org.apache.spark.ml.commons.kernel.{EyeKernel, Kernel, _}
import org.apache.spark.ml.commons.util.DiffFunctionMemoized
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.util.Instrumentation
import org.apache.spark.ml.{PredictionModel, Predictor}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.{Dataset, Row}

/**
  * Shared plumbing for Gaussian-process estimators: extracting labelled points from a
  * dataset, splitting them into per-expert chunks, and assembling the projected-process
  * raw predictor.
  *
  * Must be mixed into a [[Predictor]] that also provides `GaussianProcessParams`
  * (see the self-type below).
  *
  * @tparam F features type of the underlying predictor
  * @tparam E concrete estimator type
  * @tparam M concrete model type produced by the estimator
  */
private[ml] trait GaussianProcessCommons[F, E <: Predictor[F, E, M], M <: PredictionModel[F, M]]
  extends ProjectedGaussianProcessHelper {
  this: Predictor[F, E, M] with GaussianProcessParams =>

  /**
    * Factory that builds a new kernel on every call: the kernel produced by the `kernel`
    * param plus `sigma2` (as a constant) times an [[EyeKernel]].
    * NOTE(review): the `EyeKernel` term presumably models observation noise - confirm.
    */
  protected val getKernel: () => Kernel = () => $(kernel)() + $(sigma2).const * new EyeKernel

  /**
    * Selects the label and features columns of `dataset` and converts every row into a
    * [[LabeledPoint]].
    *
    * Rows whose label is not a `Double` or whose features are not an ML `Vector` do not
    * match the pattern below and fail with a `MatchError` when the RDD is evaluated.
    *
    * @param dataset input dataset containing the columns named by `labelCol` and `featuresCol`
    * @return the dataset's rows as labelled points
    */
  protected def getPoints(dataset: Dataset[_]): RDD[LabeledPoint] = {
    dataset.select(col($(labelCol)), col($(featuresCol))).rdd.map {
      case Row(label: Double, features: Vector) => LabeledPoint(label, features)
    }
  }

  /**
    * Distributes the points round-robin (by their `zipWithIndex` index) across experts and
    * returns one group of points per expert.
    *
    * The expert count is `round(count / datasetSizeForExpert)`, clamped to at least 1.
    * Without the clamp a dataset smaller than half of `datasetSizeForExpert` rounds to
    * zero experts and `index % 0` throws an `ArithmeticException` inside the Spark task.
    *
    * @param points all training points
    * @return one collection of points per expert
    */
  protected def groupForExperts(points: RDD[LabeledPoint]): RDD[Iterable[LabeledPoint]] = {
    val roundedExperts = math.round(points.count().toDouble / $(datasetSizeForExpert))
    // Always keep at least one expert so the modulo below is well defined.
    val numberOfExperts = math.max(1L, roundedExperts)

    points.zipWithIndex.map { case (instance, index) =>
      (index % numberOfExperts, instance)
    }.groupByKey().map(_._2)
  }

  /**
    * Builds, for every expert chunk, the vector of its labels together with a fresh kernel
    * whose training vectors are that chunk's feature vectors.
    *
    * @param points all training points
    * @return one `(labels, kernel)` pair per expert
    */
  protected def getExpertLabelsAndKernels(points: RDD[LabeledPoint]): RDD[(BDV[Double], Kernel)] = {
    groupForExperts(points).map { chunk =>
      val (labels, trainingVectors) = chunk.map(lp => (lp.label, lp.features)).toArray.unzip
      (BDV(labels: _*), getKernel().setTrainingVectors(trainingVectors))
    }
  }

  /**
    * Selects an active set, accumulates the projected-process statistics over the experts
    * and wraps the result in a [[GaussianProjectedProcessRawPredictor]].
    *
    * Side effects: `points` is unpersisted once the active set has been chosen, and
    * `expertLabelsAndKernels` is unpersisted once the statistics have been computed.
    *
    * @param expertLabelsAndKernels per-expert labels and kernels
    * @param points all training points, handed to the active set provider
    * @param optimalHyperparameters kernel hyperparameters to use for the final kernel
    * @return raw predictor built on the active set
    */
  protected def projectedProcess(expertLabelsAndKernels: RDD[(BDV[Double], Kernel)],
                                 points: RDD[LabeledPoint],
                                 optimalHyperparameters: BDV[Double]): GaussianProjectedProcessRawPredictor = {
    val activeSet = $(activeSetProvider)($(activeSetSize), expertLabelsAndKernels, points,
      getKernel, optimalHyperparameters, $(seed))

    points.unpersist()

    val (matrixKmnKnm, vectorKmny) = getMatrixKmnKnmAndVectorKmny(expertLabelsAndKernels, activeSet)

    expertLabelsAndKernels.unpersist()

    val optimalKernel = getKernel().setHyperparameters(optimalHyperparameters).setTrainingVectors(activeSet)

    // inv(sigma^2 K_mm + K_mn * K_nm) * K_mn * y
    val (magicVector, magicMatrix) = getMagicVector(optimalKernel,
      matrixKmnKnm, vectorKmny, activeSet, optimalHyperparameters)

    new GaussianProjectedProcessRawPredictor(magicVector, magicMatrix, optimalKernel)
  }

  /**
    * Creates the concrete model for the given uid from a fitted raw predictor.
    *
    * @param uid identifier to give the model
    * @param rawPredictor fitted projected-process predictor
    */
  protected def createModel(uid: String, rawPredictor: GaussianProjectedProcessRawPredictor): M
}

/**
  * Serializable predictor holding the precomputed vector and matrix of a projected
  * Gaussian process together with the kernel they were computed for.
  *
  * @param magicVector vector multiplied with the cross-kernel row to form the first output
  * @param magicMatrix matrix used in the quadratic form added to the self-kernel value
  * @param kernel kernel used to evaluate `crossKernel` and `selfKernel` for new features
  */
class GaussianProjectedProcessRawPredictor private[commons] (val magicVector: BDV[Double],
                                                             val magicMatrix: BDM[Double],
                                                             val kernel: Kernel) extends Serializable {

  /**
    * Evaluates the predictor at a single feature vector.
    *
    * @param features point to predict at
    * @return the pair `(cross * magicVector, selfKernel + cross * magicMatrix * cross.t)`;
    *         NOTE(review): presumably the predictive mean and variance - confirm.
    */
  def predict(features: Vector): (Double, Double) = {
    val cross = kernel.crossKernel(features)
    val selfKernel = kernel.selfKernel(features)
    (cross * magicVector, selfKernel + cross * magicMatrix * cross.t)
  }
}