org.apache.spark.ml.linalg.DenseVector Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.DenseVector.
You can go to the original project or source file by following the links above each example.
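Before the project-specific examples, here is a minimal, self-contained sketch of the basic DenseVector API the examples rely on (construction, element access, and conversion to a SparseVector). It is not taken from any of the projects below, and the object name is illustrative only.

import org.apache.spark.ml.linalg.{DenseVector, Vectors}

// Illustrative object name; not part of any project on this page.
object DenseVectorQuickStart {
  def main(args: Array[String]): Unit = {
    val v1 = new DenseVector(Array(1.0, 0.0, 3.0)) // direct construction
    val v2 = Vectors.dense(1.0, 0.0, 3.0)          // factory method, backed by a DenseVector

    println(v1.size)        // 3
    println(v1(2))          // 3.0 -- element access by index
    println(v1.toArray.sum) // 4.0 -- underlying values as Array[Double]
    println(v1.toSparse)    // (3,[0,2],[1.0,3.0]) -- conversion to SparseVector
    println(v1 == v2)       // true -- structural equality across ml.linalg vectors
  }
}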
Example 1
Source File: DatasetExtensions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

import scala.collection.mutable

// Enclosing object restored for compilation; other members of the original file are elided in this snippet.
object DatasetExtensions {

  def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = {
    var counter = 2
    var unusedColumnName = prefix
    while (columnNames.contains(unusedColumnName)) {
      unusedColumnName += "_" + counter
      counter += 1
    }
    unusedColumnName
  }

  def findUnusedColumnName(prefix: String, schema: StructType): String = {
    findUnusedColumnName(prefix)(schema.fieldNames.toSet)
  }

  def findUnusedColumnName(prefix: String, df: Dataset[_]): String = {
    findUnusedColumnName(prefix, df.schema)
  }
}
Example 2
Source File: DecisionTreeClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.{DecisionTree, Node}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

case class DecisionTreeClassifierModel(override val rootNode: Node,
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with DecisionTree with Serializable {

  override def predictRaw(features: Vector): Vector = {
    rootNode.predictImpl(features).impurities
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in DecisionTreeClassifierModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 3
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.regression.DecisionTreeRegressionModel
import ml.combust.mleap.core.tree.TreeEnsemble
import ml.combust.mleap.core.tree.loss.LogLoss
import org.apache.spark.ml.linalg.mleap.BLAS
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

// NOTE: the enclosing GBTClassifierModel class declaration (defining trees, treeWeightsVector and loss)
// is elided in this snippet; the members below belong to it.

  def margin(features: Vector): Double = {
    val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray)
    BLAS.dot(treePredictions, treeWeightsVector)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        dv.values(0) = loss.computeProbability(dv.values(0))
        dv.values(1) = 1.0 - dv.values(0)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("GBTClassificationModel encountered SparseVector")
    }
  }

  override def predictRaw(features: Vector): Vector = {
    val prediction: Double = margin(features)
    Vectors.dense(Array(-prediction, prediction))
  }
} // closes the elided class declaration
Example 4
Source File: NaiveBayesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial}
import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices}
import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector}

@SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala")
case class NaiveBayesModel(numFeatures: Int,
                           numClasses: Int,
                           pi: Vector,
                           theta: Matrix,
                           modelType: NaiveBayesModel.ModelType,
                           override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with Model {

  private def multinomialCalculation(raw: Vector) = {
    val prob = theta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    prob
  }

  private def bernoulliCalculation(raw: Vector) = {
    val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value)))
    val ones = new DenseVector(Array.fill(theta.numCols) {1.0})
    val thetaMinusNegTheta = Matrices.map(theta, value => value - math.log(1.0 - math.exp(value)))
    val negThetaSum = negTheta.multiply(ones)

    raw.foreachActive((_, value) =>
      require(value == 0.0 || value == 1.0,
        s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.")
    )
    val prob = thetaMinusNegTheta.multiply(raw)
    BLAS.axpy(1.0, pi, prob)
    BLAS.axpy(1.0, negThetaSum, prob)
    prob
  }

  override def predictRaw(raw: Vector): Vector = {
    modelType match {
      case Multinomial => multinomialCalculation(raw)
      case Bernoulli => bernoulliCalculation(raw)
    }
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        var i = 0
        val size = dv.size
        val maxLog = dv.values.max
        while (i < size) {
          dv.values(i) = math.exp(dv.values(i) - maxLog)
          i += 1
        }
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in NaiveBayesModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 5
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.BLAS

case class SupportVectorMachineModel(coefficients: Vector,
                                     intercept: Double,
                                     override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds))
  extends ProbabilisticClassificationModel with Serializable {

  private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept

  override val numClasses: Int = 2
  override val numFeatures: Int = coefficients.size

  override def predictRaw(features: Vector): Vector = {
    val m = margin(features)
    Vectors.dense(Array(-m, m))
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = raw
}
Example 6
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.tree.TreeEnsemble
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel],
                                       override val treeWeights: Seq[Double],
                                       numFeatures: Int,
                                       override val numClasses: Int,
                                       override val thresholds: Option[Array[Double]] = None)
  extends ProbabilisticClassificationModel with TreeEnsemble with Serializable {

  override def predictRaw(raw: Vector): Vector = {
    val votes = Array.fill[Double](numClasses)(0.0)
    trees.view.foreach { tree =>
      val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray
      val total = classCounts.sum
      if (total != 0) {
        var i = 0
        while (i < numClasses) {
          votes(i) += classCounts(i) / total
          i += 1
        }
      }
    }
    Vectors.dense(votes)
  }

  override def rawToProbabilityInPlace(raw: Vector): Vector = {
    raw match {
      case dv: DenseVector =>
        ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv)
        dv
      case sv: SparseVector =>
        throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" +
          " raw2probabilityInPlace encountered SparseVector")
    }
  }
}
Example 7
Source File: VectorConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.util

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV}
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors}

import scala.language.implicitConversions

trait VectorConverters {

  implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match {
    case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)),
      values = vector.values,
      dimensions = Seq(vector.size))
  }

  implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match {
    case tensor: DenseTensor[_] =>
      Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      Vectors.sparse(tensor.dimensions.product,
        tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match {
    case matrix: DenseMatrix =>
      DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols))
    case matrix: SparseMatrix =>
      val indices = matrix.rowIndices.zip(matrix.colPtrs).map {
        case (r, c) => Seq(r, c)
      }.toSeq
      SparseTensor(indices = indices,
        values = matrix.values,
        dimensions = Seq(matrix.numRows, matrix.numCols))
  }

  implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match {
    case tensor: DenseTensor[_] =>
      Matrices.dense(tensor.dimensions.head,
        tensor.dimensions(1),
        tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip
      Matrices.sparse(tensor.dimensions.head,
        tensor.dimensions(1),
        cols.toArray,
        rows.toArray,
        tensor.values.asInstanceOf[Array[Double]])
  }

  implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match {
    case vector: BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size))
    case vector: BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size))
  }

  implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match {
    case tensor: DenseTensor[_] =>
      new BDV(tensor.rawValues.asInstanceOf[Array[Double]])
    case tensor: SparseTensor[_] =>
      new BSV(tensor.indices.map(_.head).toArray,
        tensor.values.asInstanceOf[Array[Double]],
        tensor.dimensions.product)
  }
}

object VectorConverters extends VectorConverters
Example 8
Source File: PcaOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.feature

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.DenseTensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.feature.PCAModel
import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector}

class PcaOp extends SimpleSparkOp[PCAModel] {
  override val Model: OpModel[SparkBundleContext, PCAModel] = new OpModel[SparkBundleContext, PCAModel] {
    override val klazz: Class[PCAModel] = classOf[PCAModel]

    override def opName: String = Bundle.BuiltinOps.feature.pca

    override def store(model: Model, obj: PCAModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("principal_components",
        Value.tensor[Double](DenseTensor(obj.pc.values, Seq(obj.pc.numRows, obj.pc.numCols))))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): PCAModel = {
      val values = model.value("principal_components").getTensor[Double]
      new PCAModel(uid = "",
        pc = new DenseMatrix(values.dimensions.head, values.dimensions(1), values.toArray),
        explainedVariance = new DenseVector(Array()))
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: PCAModel): PCAModel = {
    new PCAModel(uid = uid, pc = model.pc, explainedVariance = model.explainedVariance)
  }

  override def sparkInputs(obj: PCAModel): Seq[ParamSpec] = {
    Seq("input" -> obj.inputCol)
  }

  override def sparkOutputs(obj: PCAModel): Seq[SimpleParamSpec] = {
    Seq("output" -> obj.outputCol)
  }
}
Example 9
Source File: KMeansOp.scala From mleap with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.bundle.ops.clustering

import ml.combust.bundle.BundleContext
import ml.combust.bundle.dsl._
import ml.combust.bundle.op.{OpModel, OpNode}
import ml.combust.mleap.tensor.Tensor
import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext}
import org.apache.spark.ml.clustering.KMeansModel
import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.mllib.clustering
import org.apache.spark.mllib.linalg.Vectors

class KMeansOp extends SimpleSparkOp[KMeansModel] {
  override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] {
    override val klazz: Class[KMeansModel] = classOf[KMeansModel]

    override def opName: String = Bundle.BuiltinOps.clustering.k_means

    override def store(model: Model, obj: KMeansModel)
                      (implicit context: BundleContext[SparkBundleContext]): Model = {
      model.withValue("cluster_centers",
        Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))).
        withValue("num_features", Value.long(obj.clusterCenters.head.size))
    }

    override def load(model: Model)
                     (implicit context: BundleContext[SparkBundleContext]): KMeansModel = {
      val clusterCenters = model.value("cluster_centers").
        getTensorList[Double].toArray.
        map(t => Vectors.dense(t.toArray))
      val mllibModel = new clustering.KMeansModel(clusterCenters)

      new KMeansModel(uid = "", parentModel = mllibModel)
    }
  }

  override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = {
    val clusterCenters = model.clusterCenters.map {
      case DenseVector(values) => Vectors.dense(values)
      case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values)
    }
    new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters))
  }

  override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = {
    Seq("features" -> obj.featuresCol)
  }

  override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = {
    Seq("prediction" -> obj.predictionCol)
  }
}
Example 10
Source File: StreamingMLUtils.scala From spark-structured-streaming-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib

import scala.language.implicitConversions

import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils

object StreamingMLUtils {
  implicit def mlToMllibVector(v: Vector): OldVector = v match {
    case dv: DenseVector => OldVectors.dense(dv.toArray)
    case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values)
    case _ => throw new IllegalArgumentException
  }

  def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = {
    MLUtils.fastSquaredDistance(x, xNorm, y, yNorm)
  }
}
Example 11
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
}
Example 12
Source File: UberXGBoostModel.scala From uberdata with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts.models

import ml.dmlc.xgboost4j.java.Rabit
import ml.dmlc.xgboost4j.scala.DMatrix
import ml.dmlc.xgboost4j.{LabeledPoint => XGBLabeledPoint}
import ml.dmlc.xgboost4j.scala.spark.{XGBoost, XGBoostModel}
import org.apache.spark.TaskContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.rdd.RDD

import scala.collection.JavaConverters._

object UberXGBoostModel {
  def train(trainLabel: RDD[LabeledPoint],
            configMap: Map[String, Any],
            round: Int,
            nWorkers: Int): XGBoostModel = {
    val trainData = trainLabel.cache
    XGBoost.trainWithRDD(trainData, configMap, round, nWorkers,
      useExternalMemory = true, missing = Float.NaN)
  }

  def labelPredict(testSet: RDD[XGBLabeledPoint],
                   useExternalCache: Boolean,
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    testSet.mapPartitions { testData =>
      val (toPredict, toLabel) = testData.duplicate
      val dMatrix = new DMatrix(toPredict)
      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
      toLabel.map(_.label).zip(prediction)
    }
  }

  def labelPredict(testSet: RDD[DenseVector],
                   booster: XGBoostModel): RDD[(Float, Float)] = {
    val broadcastBooster = testSet.sparkContext.broadcast(booster)
    val rdd = testSet.cache
    broadcastBooster.value.predict(testSet, missingValue = Float.NaN).map(value => (value(0), value(1)))
    //    testSet.
    //    testSet.mapPartitions { testData =>
    //      val (toPredict, toLabel) = testData.duplicate
    //      val dMatrix = new DMatrix(toPredict)
    //
    //      val prediction = broadcastBooster.value.booster.predict(dMatrix).flatten.toIterator
    //      toLabel.map(_.label).zip(prediction)
    //    }
  }
}
Example 13
Source File: OptimizedCKNNFitting.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.types.injections

import com.microsoft.ml.spark.nn._
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.Dataset
import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.sql.types._

trait OptimizedCKNNFitting extends ConditionalKNNParams {

  private def fitGeneric[V, L](dataset: Dataset[_]): ConditionalKNNModel = {
    val kvlTriples = dataset.toDF().select(getFeaturesCol, getValuesCol, getLabelCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        val label = row.getAs[L](getLabelCol)
        (bdv, value, label)
      }
    val ballTree = ConditionalBallTree(
      kvlTriples.map(_._1), kvlTriples.map(_._2), kvlTriples.map(_._3), getLeafSize)
    new ConditionalKNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setLabelCol(getLabelCol)
      .setConditionerCol(getConditionerCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): ConditionalKNNModel = {
    val vt = dataset.schema(getValuesCol).dataType
    val lt = dataset.schema(getLabelCol).dataType
    (vt, lt) match {
      case (avt: AtomicType, alt: AtomicType) => fitGeneric[avt.InternalType, alt.InternalType](dataset)
      case (avt: AtomicType, _) => fitGeneric[avt.InternalType, Any](dataset)
      case (_, alt: AtomicType) => fitGeneric[Any, alt.InternalType](dataset)
      case _ => fitGeneric[Any, Any](dataset)
    }
  }
}

trait OptimizedKNNFitting extends KNNParams {

  private def fitGeneric[V](dataset: Dataset[_]): KNNModel = {
    val kvlTuples = dataset.toDF().select(getFeaturesCol, getValuesCol).collect()
      .map { row =>
        val bdv = new BDV(row.getAs[DenseVector](getFeaturesCol).values)
        val value = row.getAs[V](getValuesCol)
        (bdv, value)
      }
    val ballTree = BallTree(
      kvlTuples.map(_._1), kvlTuples.map(_._2), getLeafSize)
    new KNNModel()
      .setFeaturesCol(getFeaturesCol)
      .setValuesCol(getValuesCol)
      .setBallTree(ballTree)
      .setOutputCol(getOutputCol)
      .setK(getK)
  }

  protected def fitOptimized(dataset: Dataset[_]): KNNModel = {
    dataset.schema(getValuesCol).dataType match {
      case avt: AtomicType => fitGeneric[avt.InternalType](dataset)
      case _ => fitGeneric[Any](dataset)
    }
  }
}
Example 14
Source File: VectorFeaturizer.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw.featurizer

import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable

// NOTE: the enclosing VectorFeaturizer class declaration (defining fieldIdx and mask) is elided in this snippet;
// the member below belongs to it.

  override def featurize(row: Row,
                         indices: mutable.ArrayBuilder[Int],
                         values: mutable.ArrayBuilder[Double]): Unit = {
    row.getAs[Vector](fieldIdx) match {
      case v: DenseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= 0 until v.size
        else
          indices ++= (0 until v.size).map { mask & _ }
        values ++= v.values
      case v: SparseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= v.indices
        else
          indices ++= v.indices.map { mask & _ }
        values ++= v.values
    }
    ()
  }
} // closes the elided class declaration
Example 15
Source File: ClassificationModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors}

// NOTE: the enclosing trait declaration is elided in this snippet; the members below belong to it.

  val numClasses: Int
  val numFeatures: Int

  def thresholds: Option[Array[Double]] = None

  def predict(features: Vector): Double = probabilityToPrediction(predictProbabilities(features))

  def predictWithProbability(features: Vector): (Double, Double) = {
    val probabilities = predictProbabilities(features)
    val index = probabilityToPredictionIndex(probabilities)
    (index.toDouble, probabilities(index))
  }

  def predictProbabilities(features: Vector): Vector = {
    val raw = predictRaw(features)
    rawToProbabilityInPlace(raw)
    raw
  }

  def rawToProbability(raw: Vector): Vector = {
    val probabilities = raw.copy
    rawToProbabilityInPlace(probabilities)
  }

  def rawToPrediction(raw: Vector): Double = {
    thresholds match {
      case Some(t) => probabilityToPrediction(rawToProbability(raw))
      case None => raw.argmax
    }
  }

  def probabilityToPrediction(probability: Vector): Double = {
    probabilityToPredictionIndex(probability).toDouble
  }

  def probabilityToPredictionIndex(probability: Vector): Int = {
    thresholds match {
      case Some(ts) =>
        val scaledProbability: Array[Double] = probability.toArray.zip(ts).map {
          case (p, t) => if (t == 0.0) Double.PositiveInfinity else p / t
        }
        Vectors.dense(scaledProbability).argmax
      case None => probability.argmax
    }
  }

  def rawToProbabilityInPlace(raw: Vector): Vector

  override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get

  override def outputSchema: StructType = StructType("raw_prediction" -> TensorType.Double(numClasses),
    "probability" -> TensorType.Double(numClasses),
    "prediction" -> ScalarType.Double.nonNullable).get
} // closes the elided trait declaration
Example 16
Source File: EnsembleByKeySuite.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.stages

import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing}
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class EnsembleByKeySuite extends TestBase with TransformerFuzzing[EnsembleByKey] {

  test("Should work on Dataframes doubles or vectors") {
    val scoreDF = session.createDataFrame(Seq(
      (0, "foo", 1.0, .1),
      (1, "bar", 4.0, -2.0),
      (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1")
    val df1 = t.transform(scoreDF2)
    df1.printSchema()
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(1))).toSet === Set((1, 2.0), (0, 1.0)))

    val t2 = new EnsembleByKey().setKeys("label1", "label2").setCols("score1", "score2", "v1")
    val df2 = t2.transform(scoreDF2)
    val res2 = df2.select("mean(score1)", "mean(v1)").collect()
      .map(r => (r.getDouble(0), r.getAs[DenseVector](1)))
    val true2 = Set(
      (2.0, new DenseVector(Array(2.0, -2.5))),
      (1.0, new DenseVector(Array(1.0, 0.1))))
    assert(res2.toSet === true2)
  }

  test("should support collapsing or not") {
    val scoreDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)

    val t = new EnsembleByKey().setKey("label1").setCol("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == scoreDF.count())
    df1.show()
  }

  lazy val testDF: DataFrame = {
    val initialTestDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    new VectorAssembler().setInputCols(Array("score1", "score2"))
      .setOutputCol("v1").transform(initialTestDF)
  }

  lazy val testModel: EnsembleByKey = new EnsembleByKey().setKey("label1").setCol("score1")
    .setCollapseGroup(false).setVectorDims(Map("v1" -> 2))

  test("should support passing the vector dims to avoid materialization") {
    val df1 = testModel.transform(testDF)
    assert(df1.collect().map(r => (r.getInt(0), r.getDouble(5))).toSet === Set((1, 2.0), (0, 1.0)))
    assert(df1.count() == testDF.count())
    df1.show()
  }

  test("should overwrite a column if instructed") {
    val scoreDF = session.createDataFrame(
      Seq((0, "foo", 1.0, .1),
        (1, "bar", 4.0, -2.0),
        (1, "bar", 0.0, -3.0)))
      .toDF("label1", "label2", "score1", "score2")
    val va = new VectorAssembler().setInputCols(Array("score1", "score2")).setOutputCol("v1")
    val scoreDF2 = va.transform(scoreDF)
    val t = new EnsembleByKey().setKey("label1").setCol("score1").setColName("score1").setCollapseGroup(false)
    val df1 = t.transform(scoreDF2)
    assert(scoreDF2.columns.toSet === df1.columns.toSet)
  }

  test("should roundtrip serialize") {
    testSerialization()
  }

  def testObjects(): Seq[TestObject[EnsembleByKey]] = Seq(new TestObject(testModel, testDF))

  def reader: EnsembleByKey.type = EnsembleByKey
}
Example 17
Source File: CNTKTestUtils.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.cntk

import java.io.File

import com.microsoft.ml.spark.build.BuildInfo
import com.microsoft.ml.spark.core.env.FileUtilities
import com.microsoft.ml.spark.core.test.base.TestBase
import com.microsoft.ml.spark.image.UnrollImage
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql._
import com.microsoft.ml.spark.io.IOImplicits._

trait CNTKTestUtils extends TestBase {

  val filesRoot = BuildInfo.datasetDir.toString
  val imagePath = FileUtilities.join(filesRoot, "Images", "CIFAR").toString
  val modelPath = FileUtilities.join(filesRoot, "CNTKModel", "ConvNet_CIFAR10.model").toString

  val inputCol = "cntk_images"
  val outputCol = "out"
  val labelCol = "labels"

  val featureVectorLength = 3 * 32 * 32
  lazy val saveFile = new File(tmpDir.toFile, "spark-z.model").toString

  def testModelDF(spark: SparkSession): DataFrame = {
    import spark.implicits._
    spark.sparkContext.parallelize(Seq(
      Array(1.32165250, -2.1215112, 0.63150704, 0.77315974, -1.28163720,
        -0.20210080, -2.2839167, -2.08691480, 5.08418200, -1.33741090),
      Array(3.44079640, 1.4877119, -0.74059330, -0.34381202, -2.48724990,
        -2.62866950, -3.1693816, -3.14182600, 4.76314800, 0.68712880),
      Array(-1.88747900, -4.7685330, 0.15169683, 6.80547570, -0.38405967,
        3.41065170, 1.3302778, -0.87714905, -2.18046050, -4.16661830),
      Array(5.01010300, 3.9860306, -1.36795600, -0.89830830, -4.49545430,
        -4.19537070, -4.4045380, -5.81759450, 6.93805700, 1.49001510),
      Array(-4.70754600, -6.0414960, 1.20658250, 5.40738300, 1.07661690,
        4.71566440, 4.3834330, -1.57187440, -2.96569730, -5.43208270),
      Array(-1.23873880, -3.2042341, 2.54533000, 5.51954800, 2.89042470,
        0.12380804, 3.8639085, -4.79466800, -2.41463420, -5.17418430))).toDF
  }

  def testImages(spark: SparkSession): DataFrame = {
    val images = spark.read.image.load(imagePath)

    val unroll = new UnrollImage().setInputCol("image").setOutputCol(inputCol)

    unroll.transform(images).select(inputCol)
  }

  def makeFakeData(spark: SparkSession, rows: Int, size: Int, outputDouble: Boolean = false): DataFrame = {
    import spark.implicits._
    if (outputDouble) {
      List
        .fill(rows)(List.fill(size)(0.0).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    } else {
      List
        .fill(rows)(List.fill(size)(0.0.toFloat).toArray)
        .zip(List.fill(rows)(0.0))
        .toDF(inputCol, labelCol)
    }
  }

  protected def compareToTestModel(result: DataFrame) = {
    //TODO improve checks
    assert(result.columns.toSet == Set(inputCol, outputCol))
    assert(result.count() == testModelDF(result.sparkSession).count())
    val max = result
      .select(outputCol)
      .collect()
      .map(row => row.getAs[DenseVector](0).toArray.max)
      .max
    assert(max < 10 & max > -10)
  }
}
Example 18
Source File: Word2VecSpec.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark.ml.feature.Word2Vec
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class Word2VecSpec extends TestBase {

  def genTokenizedText(): DataFrame = {
    session.createDataFrame(Seq(
      (0, Array("I", "walked", "the", "dog", "down", "the", "street")),
      (1, Array("I", "walked", "with", "the", "dog")),
      (2, Array("I", "walked", "the", "pup"))
    )).toDF("label", "words")
  }

  def genW2V(): Word2Vec = new Word2Vec().setSeed(1234).setMinCount(0)

  test("operation on tokenized strings") {
    val df = genTokenizedText()
    val df2 = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df).transform(df)
    val lines = df2.getDVCol("features")
    assert(lines.forall(_.size == 2))
  }

  test("return vectors") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val vectors = model.getVectors.getDVCol("vector")
    assert(vectors(0).size == 2)
  }

  test("return synonyms") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2)
      .setInputCol("words").setOutputCol("features").fit(df)
    val synonyms = model.findSynonyms("dog", 2).getColAs[String]("word")
    assert(synonyms.length === 2)
  }

  test("raise an error when applied to a null array") {
    val tokenDataFrame = session.createDataFrame(Seq(
      (0, Some(Array("Hi", "I", "can", "not", "foo"))),
      (1, None))
    ).toDF("label", "tokens")
    assertSparkException[org.apache.spark.SparkException](genW2V().setInputCol("tokens"), tokenDataFrame)
  }

  test("raise an error when given strange values of parameters") {
    def base(): Word2Vec = genW2V().setInputCol("words")
    def assertIllegalArgument[T](f: T => Any, args: T*): Unit = args.foreach { n =>
      interceptWithoutLogging[IllegalArgumentException] { f(n) }
    }
    assertIllegalArgument[Int](base.setMinCount, -1, -10)
    assertIllegalArgument[Int](base.setMaxIter, -1, -10)
    assertIllegalArgument[Int](base.setVectorSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setWindowSize, 0, -1, -10)
    assertIllegalArgument[Int](base.setMaxSentenceLength, 0, -1, -10)
    assertIllegalArgument[Int](base.setNumPartitions, 0, -1, -10)
    assertIllegalArgument[Double](base.setStepSize, 0.0, -1.0, -10.0)
  }

  test("return a vector of zeros when it encounters an OOV word") {
    val df = genTokenizedText()
    val model = genW2V().setVectorSize(2).setMinCount(1).setInputCol("words").setOutputCol("features").fit(df)
    val df2 = session.createDataFrame(Seq(
      (0, Array("ketchup")))).toDF("label", "words")
    val results = model.transform(df2)
    val lines = results.getDVCol("features")
    val trueLines = List(new DenseVector(Array(0.0, 0.0)))
    assert(lines === trueLines)
  }

  test("be able to set vector size") {
    val df = genTokenizedText()
    val vectorSizes = List(1, 10, 100)
    vectorSizes.foreach { n =>
      val results = genW2V().setVectorSize(n)
        .setInputCol("words").setOutputCol("features").fit(df).transform(df)
        .getDVCol("features")
      assert(results(0).size === n)
    }
  }
}
Example 19
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm

import java.io.File
import java.nio.charset.StandardCharsets

import com.google.common.io.Files

import org.apache.spark.{SparkException, SparkFunSuite}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.sql.{Row, SaveMode}
import org.apache.spark.util.Utils

class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext {
  // Path for dataset
  var path: String = _

  override def beforeAll(): Unit = {
    super.beforeAll()
    val lines =
      """
        |1 1:1.0 3:2.0 5:3.0
        |0
        |0 2:4.0 4:5.0 6:6.0
      """.stripMargin
    val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data")
    val file = new File(dir, "part-00000")
    Files.write(lines, file, StandardCharsets.UTF_8)
    path = dir.toURI.toString
  }

  override def afterAll(): Unit = {
    try {
      Utils.deleteRecursively(new File(path))
    } finally {
      super.afterAll()
    }
  }

  test("select as sparse vector") {
    val df = spark.read.format("libsvm").load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("select as dense vector") {
    val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense"))
      .load(path)
    assert(df.columns(0) == "label")
    assert(df.columns(1) == "features")
    assert(df.count() == 3)
    val row1 = df.first()
    assert(row1.getDouble(0) == 1.0)
    val v = row1.getAs[DenseVector](1)
    assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0))
  }

  test("select a vector with specifying the longer dimension") {
    val df = spark.read.option("numFeatures", "100").format("libsvm")
      .load(path)
    val row1 = df.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data and read it again") {
    val df = spark.read.format("libsvm").load(path)
    val tempDir2 = new File(tempDir, "read_write_test")
    val writepath = tempDir2.toURI.toString
    // TODO: Remove requirement to coalesce by supporting multiple reads.
    df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath)

    val df2 = spark.read.format("libsvm").load(writepath)
    val row1 = df2.first()
    val v = row1.getAs[SparseVector](1)
    assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0))))
  }

  test("write libsvm data failed due to invalid schema") {
    val df = spark.read.format("text").load(path)
    intercept[SparkException] {
      df.write.format("libsvm").save(path + "_2")
    }
  }

  test("select features from libsvm relation") {
    val df = spark.read.format("libsvm").load(path)
    df.select("features").rdd.map { case Row(d: Vector) => d }.first
    df.select("features").collect
  }
}
Example 20
Source File: RichVector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.utils.spark

import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable.ArrayBuffer

// NOTE: the enclosing object declaration (and its other members) is elided in this snippet.

  def combine(vectors: Seq[Vector]): Vector = {
    val indices = ArrayBuffer.empty[Int]
    val values = ArrayBuffer.empty[Double]

    val size = vectors.foldLeft(0)((size, vector) => {
      vector.foreachActive {
        case (i, v) =>
          if (v != 0.0) {
            indices += size + i
            values += v
          }
      }
      size + vector.size
    })
    Vectors.sparse(size, indices.toArray, values.toArray).compressed
  }

  implicit class RichSparseVector(val v: SparseVector) extends AnyVal {
    def updated(index: Int, indexVal: Int, value: Double): SparseVector = {
      require(v.indices(index) == indexVal,
        s"Invalid index: indices($index)==${v.indices(index)}, expected: $indexVal")
      v.values(index) = value
      v
    }
  }
} // closes the elided object declaration
Example 21
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License | 5 votes |
package com.salesforce.op.stages.impl.feature

import com.salesforce.op._
import com.salesforce.op.features.types._
import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext}
import com.salesforce.op.utils.spark.RichDataset._
import org.apache.spark.ml.feature.IDF
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.{Estimator, Transformer}
import org.junit.runner.RunWith
import org.scalatest.junit.JUnitRunner
import org.scalatest.{Assertions, FlatSpec, Matchers}

@RunWith(classOf[JUnitRunner])
class IDFTest extends FlatSpec with TestSparkContext {

  val data = Seq(
    Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)),
    Vectors.dense(0.0, 1.0, 2.0, 3.0),
    Vectors.sparse(4, Array(1), Array(1.0))
  )

  lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector))

  Spec[IDF] should "compute inverted document frequency" in {
    val idf = f1.idf()
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      math.log((data.length + 1.0) / (x + 1.0))
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  it should "compute inverted document frequency when minDocFreq is 1" in {
    val idf = f1.idf(minDocFreq = 1)
    val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds)
    val transformedData = model.asInstanceOf[Transformer].transform(ds)
    val results = transformedData.select(idf.name).collect(idf)

    idf.name shouldBe idf.originStage.getOutputFeatureName

    val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x =>
      if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0
    })
    val expected = scaleDataWithIDF(data, expectedIdf)

    for {
      (res, exp) <- results.zip(expected)
      (x, y) <- res.value.toArray.zip(exp.toArray)
    } assert(math.abs(x - y) <= 1e-5)
  }

  private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = {
    dataSet.map {
      case data: DenseVector =>
        val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y }
        Vectors.dense(res)
      case data: SparseVector =>
        val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) }
        Vectors.sparse(data.size, res)
    }
  }
}
Example 22
Source File: HasNetlibBlas.scala From pravda-ml with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.odkl

import com.github.fommil.netlib.BLAS.{getInstance => NativeBLAS}
import com.github.fommil.netlib.{F2jBLAS, BLAS => NetlibBLAS}
import org.apache.spark.ml.linalg.{DenseVector, Matrices, Vector, Vectors}

trait HasNetlibBlas {
  // For level-1 routines, we use Java implementation.
  def f2jBLAS: NetlibBLAS = HasNetlibBlas._f2jBLAS

  def blas: NetlibBLAS = HasNetlibBlas._nativeBLAS

  def dscal(a: Double, data: Array[Double]): Unit = f2jBLAS.dscal(data.length, a, data, 1)

  def axpy(a: Double, x: Array[Double], y: Array[Double]): Unit = f2jBLAS.daxpy(x.length, a, x, 1, y, 1)

  def axpy(a: Double, x: Vector, y: Array[Double]): Unit = x match {
    case dense: DenseVector => axpy(a, dense.values, y)
    case _ => x.foreachActive((i, v) => y(i) += a * v)
  }

  def copy(x: Array[Double], y: Array[Double]): Unit = f2jBLAS.dcopy(x.length, x, 1, y, 1)
}

object HasNetlibBlas extends Serializable {
  @transient private lazy val _f2jBLAS: NetlibBLAS = {
    initSparkBlas
    new F2jBLAS
  }

  private def initSparkBlas = synchronized {
    org.apache.spark.ml.linalg.BLAS.dot(Vectors.zeros(2), Vectors.zeros(2))
    org.apache.spark.ml.linalg.BLAS.gemv(1.0, Matrices.zeros(2, 2), Vectors.zeros(2), 0.5, Vectors.zeros(2).toDense)
  }

  @transient private lazy val _nativeBLAS: NetlibBLAS = {
    initSparkBlas
    NativeBLAS
  }
}
Example 23
Source File: NormalizerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}
import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest}
import org.apache.spark.ml.util.TestingUtils._
import org.apache.spark.sql.{DataFrame, Row}

class NormalizerSuite extends MLTest with DefaultReadWriteTest {

  import testImplicits._

  @transient var data: Array[Vector] = _
  @transient var l1Normalized: Array[Vector] = _
  @transient var l2Normalized: Array[Vector] = _

  override def beforeAll(): Unit = {
    super.beforeAll()

    data = Array(
      Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.6, -1.1, -3.0),
      Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))),
      Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))),
      Vectors.sparse(3, Seq())
    )
    l1Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.12765957, -0.23404255, -0.63829787),
      Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))),
      Vectors.dense(0.625, 0.07894737, 0.29605263),
      Vectors.sparse(3, Seq())
    )
    l2Normalized = Array(
      Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))),
      Vectors.dense(0.0, 0.0, 0.0),
      Vectors.dense(0.184549876, -0.3383414, -0.922749378),
      Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))),
      Vectors.dense(0.897906166, 0.113419726, 0.42532397),
      Vectors.sparse(3, Seq())
    )
  }

  def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = {
    assert((lhs, rhs) match {
      case (v1: DenseVector, v2: DenseVector) => true
      case (v1: SparseVector, v2: SparseVector) => true
      case _ => false
    }, "The vector type should be preserved after normalization.")
  }

  def assertValues(lhs: Vector, rhs: Vector): Unit = {
    assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.")
  }

  test("Normalization with default parameter") {
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized")
    val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected")

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("Normalization with setter") {
    val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected")
    val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1)

    testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") {
      case Row(features: Vector, normalized: Vector, expected: Vector) =>
        assertTypeOfVector(normalized, features)
        assertValues(normalized, expected)
    }
  }

  test("read/write") {
    val t = new Normalizer()
      .setInputCol("myInputCol")
      .setOutputCol("myOutputCol")
      .setP(3.0)
    testDefaultReadWrite(t)
  }
}
Example 24
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter

class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val labelColumnName = "label"
  val featuresColumnName = "features"
  val schema = StructType(Array(StructField(labelColumnName, DoubleType),
    StructField(featuresColumnName, VectorType)))

  it should "serialize a dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "serialize a sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new ProtobufRequestRowSerializer(Some(schema))
    val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty)
    val serialized = rrs.serializeRow(row)
    val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized)
    val protobufFromRecordIO = protobufIterator.next

    assert(!protobufIterator.hasNext)
    assert(protobuf.equals(protobufFromRecordIO))
  }

  it should "fail to set schema on invalid features name" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    intercept[IllegalArgumentException] {
      val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new ProtobufRequestRowSerializer(Some(validSchema))
  }
}
Example 25
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StringType, StructField, StructType}

class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    val serialized = new String(rrs.serializeRow(row))
    assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features")
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "fail on invalid features type" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema)
    val rrs = new UnlabeledLibSVMRequestRowSerializer()
    intercept[RuntimeException] {
      rrs.serializeRow(row)
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema))
  }

  it should "fail to validate incorrect schema" in {
    val invalidSchema = StructType(Array(
      StructField("features", StringType, nullable = false)))
    intercept[IllegalArgumentException] {
      new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema))
    }
  }
}
Example 26
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest._
import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer

class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema = new LibSVMResponseRowDeserializer(10).schema

  "LibSVMRequestRowSerializer" should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:-100.0 11:100.1\n" == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema)
    val rrs = new LibSVMRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "ignore other columns" in {
    val schemaWithExtraColumns = StructType(Array(
      StructField("name", StringType, nullable = false),
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false),
      StructField("favorite activity", StringType, nullable = false)))

    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray,
      schema = schemaWithExtraColumns)

    val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns))
    val serialized = new String(rrs.serializeRow(row))
    assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized)
  }

  it should "fail on invalid features column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!")
    }
  }

  it should "fail on invalid label column name" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schema), labelColumnName = "Sir! I must protest! I do not exist!")
    }
  }

  it should "fail on invalid types" in {
    val schemaWithInvalidLabelType = StructType(Array(
      StructField("label", StringType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType))
    }
    val schemaWithInvalidFeaturesType = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", StringType, nullable = false)))
    intercept[RuntimeException] {
      new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType))
    }
  }

  it should "validate correct schema" in {
    val validSchema = StructType(Array(
      StructField("label", DoubleType, nullable = false),
      StructField("features", SQLDataTypes.VectorType, nullable = false)))
    new LibSVMRequestRowSerializer(Some(validSchema))
  }
}
Example 27
Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers

import org.scalatest.{FlatSpec, Matchers}
import org.scalatest.mock.MockitoSugar

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer

class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar {

  val schema: StructType =
    StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false)))

  it should "serialize sparse vector" in {
    val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n"
    assert(sparseString == serialized)
  }

  it should "serialize dense vector" in {
    val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray)
    val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema)
    val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema))
    val serialized = new String(rrs.serializeRow(row))
    assert("10.0,-100.0,2.0\n" == serialized)
  }
}
Example 28
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.util.knn

import scala.reflect.ClassTag

import org.scalacheck.{Arbitrary, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen.{choose, oneOf}
import org.scalatest.PropSpec
import org.apache.spark.ml.linalg.{
  CosineDistance,
  EuclideanDistance,
  ManhattanDistance,
  JaccardDistance,
  HammingDistance
}
import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors}
import com.holdenkarau.spark.testing.SharedSparkContext

abstract class KNNPropSpec extends PropSpec with SharedSparkContext {

  implicit def arbitraryDenseVector: Arbitrary[DenseVector] =
    Arbitrary {
      for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr)
    }

  implicit def arbitrarySparseVector: Arbitrary[SparseVector] =
    Arbitrary {
      for (vec <- arbitrary[DenseVector]) yield vec.toSparse
    }

  implicit def arbitraryVector: Arbitrary[Vector] =
    Arbitrary(
      Gen.frequency(
        1 -> arbitrary[DenseVector],
        1 -> arbitrary[SparseVector]
      ))

  private def arraysOfNM[T: ClassTag](numRows: Int,
                                      numCols: Int,
                                      gen: Gen[T]): Gen[Array[Array[T]]] =
    Gen.listOfN(numRows * numCols, gen).map { square =>
      square.toArray.grouped(numCols).toArray
    }

  private def vectorsOfNM(numRows: Int,
                          numCols: Int,
                          gen: Gen[Double]): Gen[Array[DenseVector]] =
    for {
      arrays <- arraysOfNM(numRows, numCols, gen)
    } yield arrays.map(arr => new DenseVector(arr))

  val treeGen = for {
    measure <- oneOf(CosineDistance,
                     EuclideanDistance,
                     ManhattanDistance,
                     HammingDistance,
                     JaccardDistance)
    numVectors <- choose(1, 100)
    vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0))
  } yield
    vectors
      .scanLeft(Seq[Vector]())(_ :+ _)
      .tail
      .map(vs =>
        VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10))
}
Example 29
Source File: LocalPCAModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.PCAModel import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Vectors} import org.apache.spark.mllib.linalg.{DenseMatrix => OldDenseMatrix, Matrices => OldMatrices} class LocalPCAModel(override val sparkTransformer: PCAModel) extends LocalTransformer[PCAModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val pc = OldMatrices.fromML(sparkTransformer.pc).asInstanceOf[OldDenseMatrix] val newData = column.data.mapToMlLibVectors.map(pc.transpose.multiply).map(_.toList) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalPCAModel extends SimpleModelLoader[PCAModel] with TypedTransformerConverter[PCAModel] { override def build(metadata: Metadata, data: LocalData): PCAModel = { val constructor = classOf[PCAModel].getDeclaredConstructor( classOf[String], classOf[DenseMatrix], classOf[DenseVector] ) constructor.setAccessible(true) val pcMap = data.column("pc").get.data.head.asInstanceOf[Map[String, Any]] val pcMat = DataUtils.constructMatrix(pcMap).asInstanceOf[DenseMatrix] data.column("explainedVariance") match { case Some(ev) => // NOTE: Spark >= 2 val evParams = ev.data.head.asInstanceOf[Map[String, Any]] val explainedVariance = DataUtils.constructVector(evParams).toDense constructor .newInstance(metadata.uid, pcMat, explainedVariance) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) case None => // NOTE: Spark < 2 constructor .newInstance( metadata.uid, pcMat, Vectors.dense(Array.empty[Double]).asInstanceOf[DenseVector] ) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } } override implicit def toLocal(transformer: PCAModel) = new LocalPCAModel(transformer) }
Example 30
Source File: LocalMaxAbsScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common.utils.DataUtils._ import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MaxAbsScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} class LocalMaxAbsScalerModel(override val sparkTransformer: MaxAbsScalerModel) extends LocalTransformer[MaxAbsScalerModel] { override def transform(localData: LocalData): LocalData = { localData.column(sparkTransformer.getInputCol) match { case Some(column) => val maxAbsUnzero = Vectors.dense(sparkTransformer.maxAbs.toArray.map(x => if (x == 0) 1 else x)) val newData = column.data.map(r => { val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMaxAbsScaler: $d") } val brz = DataUtils.asBreeze(vec.toArray) / DataUtils.asBreeze(maxAbsUnzero.toArray) DataUtils.fromBreeze(brz).toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMaxAbsScalerModel extends SimpleModelLoader[MaxAbsScalerModel] with TypedTransformerConverter[MaxAbsScalerModel] { override def build(metadata: Metadata, data: LocalData): MaxAbsScalerModel = { val maxAbsParams = data.column("maxAbs").get.data.head.asInstanceOf[Map[String, Any]] val maxAbs = DataUtils.constructVector(maxAbsParams) val constructor = classOf[MaxAbsScalerModel].getDeclaredConstructor(classOf[String], classOf[Vector]) constructor.setAccessible(true) constructor .newInstance(metadata.uid, maxAbs) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) } override implicit def toLocal( transformer: MaxAbsScalerModel ): LocalMaxAbsScalerModel = new LocalMaxAbsScalerModel(transformer) }
Example 31
Source File: LocalMinMaxScalerModel.scala From spark-ml-serving with Apache License 2.0 | 5 votes |
package io.hydrosphere.spark_ml_serving.preprocessors import io.hydrosphere.spark_ml_serving.TypedTransformerConverter import io.hydrosphere.spark_ml_serving.common._ import io.hydrosphere.spark_ml_serving.common.utils.DataUtils import org.apache.spark.ml.feature.MinMaxScalerModel import org.apache.spark.ml.linalg.{DenseVector, Vector} class LocalMinMaxScalerModel(override val sparkTransformer: MinMaxScalerModel) extends LocalTransformer[MinMaxScalerModel] { override def transform(localData: LocalData): LocalData = { val originalRange = (DataUtils.asBreeze(sparkTransformer.originalMax.toArray) - DataUtils.asBreeze( sparkTransformer.originalMin.toArray )).toArray val minArray = sparkTransformer.originalMin.toArray val min = sparkTransformer.getMin val max = sparkTransformer.getMax localData.column(sparkTransformer.getInputCol) match { case Some(column) => val newData = column.data.map(r => { val scale = max - min val vec = r match { case d: Seq[Number @unchecked] if d.isInstanceOf[Seq[Number]] => d.map(_.doubleValue()) case d => throw new IllegalArgumentException(s"Unknown data type for LocalMinMaxScaler: $d") } val values = vec.toArray val size = values.length var i = 0 while (i < size) { if (!values(i).isNaN) { val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5 values.update(i, raw * scale + min) } i += 1 } values.toList }) localData.withColumn(LocalDataColumn(sparkTransformer.getOutputCol, newData)) case None => localData } } } object LocalMinMaxScalerModel extends SimpleModelLoader[MinMaxScalerModel] with TypedTransformerConverter[MinMaxScalerModel] { override def build(metadata: Metadata, data: LocalData): MinMaxScalerModel = { val originalMinList = data .column("originalMin") .get .data .head .asInstanceOf[Map[String, Any]] val originalMin = DataUtils.constructVector(originalMinList) val originalMaxList = data .column("originalMax") .get .data .head .asInstanceOf[Map[String, Any]] val originalMax = DataUtils.constructVector(originalMaxList) val constructor = classOf[MinMaxScalerModel].getDeclaredConstructor( classOf[String], classOf[Vector], classOf[Vector] ) constructor.setAccessible(true) constructor .newInstance(metadata.uid, originalMin, originalMax) .setInputCol(metadata.paramMap("inputCol").asInstanceOf[String]) .setOutputCol(metadata.paramMap("outputCol").asInstanceOf[String]) .setMin(metadata.paramMap("min").toString.toDouble) .setMax(metadata.paramMap("max").toString.toDouble) } override implicit def toLocal( transformer: MinMaxScalerModel ) = new LocalMinMaxScalerModel(transformer) }
Example 32
Source File: get_features_from_peinfo.scala From gsoc_relationship with Apache License 2.0 | 5 votes |
import com.datastax.spark.connector._ import play.api.libs.json.Json import play.api.libs.json._ import java.io.{ByteArrayOutputStream, ByteArrayInputStream} import java.util.zip.{GZIPOutputStream, GZIPInputStream} import Array.concat import org.apache.spark.sql.types._ import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.ml.linalg._ import org.apache.spark.sql.Row import org.apache.spark.ml.feature.MinMaxScaler import org.apache.spark.ml.linalg.DenseVector import PreProcessingConfig._ case class peinfo_results_by_service_name_class(service_name: String, sha256: String) case class peinfo_results_by_sha256_class(sha256: String, service_name: String, results: Array[Byte]) case class peinfo_join_results_class(sha256: String, service_name: String, results: String) case class peinfo_int_final_array_rdd_class(sha256: String, array_results: Array[Double]) case class peinfo_binaray_final_array_rdd_class(sha256:String, array_results :Array[Double]) case class peinfo_final_array_rdd_class(sha256:String, array_results: Array[Double]) def unzip(x: Array[Byte]) : String = { val inputStream = new GZIPInputStream(new ByteArrayInputStream(x)) val output = scala.io.Source.fromInputStream(inputStream).mkString return output } def findAllIntinpeinfo( peinfo_json_results : JsLookupResult, time: Double): Array[Double]= { val entropy = peinfo_json_results \\ "entropy" ; val virt_address = peinfo_json_results \\ "virt_address"; val virt_size = peinfo_json_results \\ "virt_size"; val size = peinfo_json_results \\ "size"; var i= 0; var List = Array.iterate(0.0,17)(a=>a*0) for (k <- ( peinfo_json_results \\ "section_name")){ k.as[String] match { case ".text\u0000\u0000\u0000" => { List(0)=entropy(i).as[Double]; List(1)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(2)=virt_size(i).as[Double]; List(3)=size(i).as[Double] } case ".data\u0000\u0000\u0000" => { List(4)=entropy(i).as[Double]; List(5)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(6)=virt_size(i).as[Double]; List(7)=size(i).as[Double] } case ".rsrc\u0000\u0000\u0000" => { List(8)=entropy(i).as[Double]; List(9)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(10)=virt_size(i).as[Double]; List(11)=size(i).as[Double] } case ".rdata\u0000\u0000" => { List(12)=entropy(i).as[Double]; List(13)=Integer.parseInt(virt_address(i).as[String].substring(2), 16).toDouble; List(14)=virt_size(i).as[Double]; List(15)=size(i).as[Double] } case other => {} } i = i + 1 } List(16)= time return List.toArray } val peinfo_results_by_service_name_meta = sc.cassandraTable[peinfo_results_by_service_name_class](keyspace,service_name_table).where("service_name=?","peinfo") val peinfo_results_by_service_name_rdd = peinfo_results_by_service_name_meta.keyBy(x=> (x.sha256,x.service_name)) val peinfo_results_by_sha256_meta = sc.cassandraTable[peinfo_results_by_sha256_class](keyspace,sha256_table) val peinfo_results_by_sha256_rdd = peinfo_results_by_sha256_meta.keyBy(x => (x.sha256,x.service_name)) val peinfo_join_results = peinfo_results_by_service_name_rdd.join(peinfo_results_by_sha256_rdd).map(x=> (new peinfo_join_results_class(x._1._1,x._1._2, unzip(x._2._2.results)))).distinct().cache() val peinfo_int_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "pe_sections"),{if ((Json.parse(x.results) \ "timestamp").isInstanceOf[JsUndefined]) 0.0 else (Json.parse(x.results) \ "timestamp" \\ "timestamp")(0).as[Double]})).filter(x=> 
!x._2.isInstanceOf[JsUndefined]).map(x=>new peinfo_int_final_array_rdd_class(x._1,findAllIntinpeinfo(x._2,x._3))) val peinfo_dllfunction_list= peinfo_join_results.map(x=>Json.parse(x.results) \ "imports").filter(x=> !x.isInstanceOf[JsUndefined]).flatMap(x=>x.as[List[Map[String, String]]].map(x=>(x("dll")+"."+x("function")))).toDF("func_name").groupBy("func_name").count.sort(desc("count")).filter("count > 10000").rdd.map(r => r.getString(0)).collect().toList implicit def bool2int(b:Boolean) = if (b) 1 else 0 def findAllBininpeinfo_dllfunction(peinfo_dllfunction : Seq[String]) : Array[Double] ={ val forlist = for (family <- peinfo_dllfunction_list) yield { (peinfo_dllfunction.contains(family):Int).toDouble } return (forlist).toArray } val List502 = Array.iterate(0.0,502)(a=>0.0) val peinfo_binaray_final_array_rdd = peinfo_join_results.map(x=>(x.sha256,(Json.parse(x.results) \ "imports"))).map(x=>new peinfo_binaray_final_array_rdd_class(x._1,{if (x._2.isInstanceOf[JsUndefined]) List502 else findAllBininpeinfo_dllfunction(x._2.as[Seq[Map[String, String]]].map(x=>(x("dll")+"."+x("function"))))})) val peinfo_int_final_array_rdd_before_join = peinfo_int_final_array_rdd.map(x=>(x.sha256,x.array_results)) val peinfo_binaray_final_array_rdd_before_join = peinfo_binaray_final_array_rdd.map(x=>(x.sha256,x.array_results)) val peinfo_array_rdd_by_join = peinfo_int_final_array_rdd_before_join.join(peinfo_binaray_final_array_rdd_before_join).map(x=> (x._1,concat(x._2._1,x._2._2))) val peinfo_final_array_rdd = peinfo_array_rdd_by_join.map(x=>new peinfo_final_array_rdd_class(x._1,x._2)) val peinfo_schema = new StructType().add("sha256", StringType).add("peinfo",VectorType) val peinfo_vector_rdd = peinfo_final_array_rdd.map(x=>(x.sha256,Vectors.dense(x.array_results))) val peinfo_vector_rowrdd = peinfo_vector_rdd.map(p => Row(p._1,p._2)) val peinfo_vector_dataframe = spark.createDataFrame(peinfo_vector_rowrdd, peinfo_schema) val peinfo_scaler = new MinMaxScaler() .setInputCol("peinfo") .setOutputCol("scaled_peinfo") val peinfo_scalerModel = peinfo_scaler.fit(peinfo_vector_dataframe) val peinfo_scaledData_df = peinfo_scalerModel.transform(peinfo_vector_dataframe) val peinfo_scaledData_rdd = peinfo_scaledData_df.select("sha256","scaled_peinfo").rdd.map(row=>(row.getAs[String]("sha256"),row.getAs[DenseVector]("scaled_peinfo"))).map(x=>new peinfo_final_array_rdd_class(x._1,x._2.toArray)) peinfo_scaledData_rdd.toDF().write.format("parquet").save(peinfo_final_array_file)
Example 33
Source File: FeatureCrossOp.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature.cross import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import scala.collection.mutable.ArrayBuffer object FeatureCrossOp { def flatCartesian(vector: Vector): Vector = { val curDim = vector.size vector match { case sv: SparseVector => val indices = new ArrayBuffer[Int]() val values = new ArrayBuffer[Double]() sv.indices.foreach { idx1 => sv.indices.foreach { idx2 => indices += curDim * idx1 + idx2 values += sv(idx1) * sv(idx2) } } val sorted = indices.zip(values).sortBy(_._1) val sortedIndices = sorted.map(_._1) val sortedValues = sorted.map(_._2) new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray) case dv: DenseVector => val values: Array[Double] = new Array(dv.size * dv.size) (0 until dv.size).foreach { idx1 => (0 until dv.size).foreach { idx2 => values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2) } } new DenseVector(values) } } def main(args: Array[String]): Unit = { val v = new DenseVector(Array(1, 2, 3)) val cv = flatCartesian(v) println(cv.toDense.values.mkString(",")) } }
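As a quick illustration of the dense branch above, a standalone sketch (the object name and sample values are made up for illustration):

import org.apache.spark.ml.linalg.DenseVector
import com.tencent.angel.spark.automl.feature.cross.FeatureCrossOp

object FeatureCrossOpExample {
  def main(args: Array[String]): Unit = {
    // For a dense input, flatCartesian writes v(i) * v(j) at position size * i + j,
    // i.e. the outer product of the vector with itself, flattened row by row.
    val crossed = FeatureCrossOp.flatCartesian(new DenseVector(Array(1.0, 2.0, 3.0)))
    println(crossed.toDense.values.mkString(","))
    // 1.0,2.0,3.0,2.0,4.0,6.0,3.0,6.0,9.0
  }
}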
Example 34
Source File: FeatureUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.feature import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.{Dataset, Row} import scala.language.postfixOps object FeatureUtils { def maxDim(dataset: Dataset[Row], col: String = "features"): Int = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val dim = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.last case dv: DenseVector => dv.size } }.max Iterator(dim) }.max + 1 } def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = { dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] => val mergeIndices = rows.map { case Row(v: Vector) => v match { case sv: SparseVector => sv.indices.toList } }.reduce(_ union _ distinct) Iterator(mergeIndices) }.reduce((a, b) => (a union b).distinct).toArray } }
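A rough usage sketch for maxDim, assuming a local SparkSession; the session setup and sample data are illustrative, and note that countNonZero as written only matches SparseVector rows:

import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.sql.SparkSession
import com.tencent.angel.spark.automl.feature.FeatureUtils

object FeatureUtilsExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[1]").appName("FeatureUtilsExample").getOrCreate()
    import spark.implicits._

    // A "features" column mixing a dense and a sparse vector of dimension 3.
    val df = Seq(
      Vectors.dense(1.0, 0.0, 2.0),
      Vectors.sparse(3, Array(1), Array(4.0))
    ).map(Tuple1.apply).toDF("features")

    // Per the definition above: max over rows of (dense size | last sparse index), plus one.
    println(FeatureUtils.maxDim(df)) // 4

    spark.stop()
  }
}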
Example 35
Source File: DataUtils.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl.utils import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.sql.types.StructType import org.apache.spark.sql.{DataFrame, SparkSession} object DataUtils { def parse(ss: SparkSession, schema: StructType, X: Array[Vector], Y: Array[Double]): DataFrame = { require(X.size == Y.size, "The size of configurations should be equal to the size of rewards.") ss.createDataFrame( Y.zip(X)).toDF("label", "features") } def parse(ss: SparkSession, schema: StructType, X: Vector): DataFrame = { parse(ss, schema, Array(X), Array(0)) } def toBreeze(values: Array[Double]): BDV[Double] = { new BDV[Double](values) } def toBreeze(vector: Vector): BDV[Double] = vector match { case sv: SparseVector => new BDV[Double](vector.toDense.values) case dv: DenseVector => new BDV[Double](dv.values) } def toBreeze(X: Array[Vector]): BDM[Double] = { val mat = BDM.zeros[Double](X.size, X(0).size) for (i <- 0 until X.size) { for (j <- 0 until X(0).size) { mat(i, j) = X(i)(j) } } mat } }
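A small sketch of the toBreeze overloads on dense and sparse inputs (names and values are illustrative):

import breeze.linalg.{DenseVector => BDV}
import org.apache.spark.ml.linalg.{DenseVector, Vectors}
import com.tencent.angel.spark.automl.utils.DataUtils

object DataUtilsExample {
  def main(args: Array[String]): Unit = {
    val dense = new DenseVector(Array(1.0, 2.0, 3.0))
    val sparse = Vectors.sparse(3, Array(0, 2), Array(1.0, 3.0))

    val fromDense: BDV[Double] = DataUtils.toBreeze(dense)   // wraps the dense values directly
    val fromSparse: BDV[Double] = DataUtils.toBreeze(sparse) // densified first via toDense

    println(fromDense)  // DenseVector(1.0, 2.0, 3.0)
    println(fromSparse) // DenseVector(1.0, 0.0, 3.0)
  }
}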
Example 36
Source File: Evaluator.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.evaluation import org.apache.spark.ml.linalg.DenseVector import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics import org.apache.spark.sql.DataFrame class Evaluator { def evaluate(predictions:DataFrame):Unit = { import predictions.sparkSession.implicits._ val scoreAndLabels = predictions.select("label", "probability").map { row => (row.apply(1).asInstanceOf[DenseVector](1), row.getAs[Int]("label").toDouble) } val metrics = new BinaryClassificationMetrics(scoreAndLabels.rdd) println("AUC under PR = " + metrics.areaUnderPR()) println("AUC under ROC = " + metrics.areaUnderROC()) } }
Example 37
Source File: OuterProductNNCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class OuterProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate outer product between item embedding and user embedding
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessOuterProductSamples(samplesWithOuterProduct)
    val preparedSamples = prePipelineModel.transform(samplesWithOuterProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithOuterProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithOuterProduct = FeatureEngineering.calculateEmbeddingOuterProduct(samples)
    _pipelineModel.transform(samplesWithOuterProduct)
  }
}
Example 38
Source File: FactorizationMachineCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{FMModel, FMWithSGD, LabeledPoint}
import org.apache.spark.sql.DataFrame

class FactorizationMachineCtrModel extends BaseCtrModel {
  var _model: FMModel = _

  def train(samples: DataFrame): Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    _pipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)

    val formatSamples = preparedSamples.rdd.map(row => {
      new LabeledPoint(row.getAs[Int]("label").toDouble, Vectors.fromML(row.getAs[DenseVector]("scaledFeatures")))
    })

    _model = FMWithSGD.train(formatSamples, task = 1, numIterations = 200, stepSize = 0.15,
      miniBatchFraction = 1, dim = (true, true, 2), regParam = (0, 0, 0), initStd = 0.1)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    val preparedSamples = _pipelineModel.transform(samplesWithInnerProduct)
    _model.predict(preparedSamples)
  }
}
Example 39
Source File: InnerProductNNCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.{LogisticRegression, MultilayerPerceptronClassifier}
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class InnerProductNNCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    //calculate inner product between item embedding and user embedding
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)

    val prePipelineModel = FeatureEngineering.preProcessInnerProductSamples(samplesWithInnerProduct)
    val preparedSamples = prePipelineModel.transform(samplesWithInnerProduct)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samplesWithInnerProduct)
  }

  override def transform(samples: DataFrame): DataFrame = {
    val samplesWithInnerProduct = FeatureEngineering.calculateEmbeddingInnerProduct(samples)
    _pipelineModel.transform(samplesWithInnerProduct)
  }
}
Example 40
Source File: NeuralNetworkCtrModel.scala From CTRmodel with Apache License 2.0 | 5 votes |
package com.ggstar.ctrmodel

import com.ggstar.features.FeatureEngineering
import org.apache.spark.ml.Pipeline
import org.apache.spark.ml.classification.MultilayerPerceptronClassifier
import org.apache.spark.ml.linalg.DenseVector
import org.apache.spark.sql.DataFrame

class NeuralNetworkCtrModel extends BaseCtrModel {

  def train(samples: DataFrame): Unit = {
    val prePipelineModel = FeatureEngineering.preProcessSamples(samples)
    val preparedSamples = prePipelineModel.transform(samples)

    //network architecture, better to keep tuning it until metrics converge
    val layers = Array[Int](preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length,
      preparedSamples.first().getAs[DenseVector]("scaledFeatures").toArray.length / 2, 2)

    val nnModel = new MultilayerPerceptronClassifier()
      .setLayers(layers)
      .setBlockSize(128)
      .setSeed(1234L)
      .setMaxIter(150)      //max iterations, keep increasing it if loss function or metrics don't converge
      .setStepSize(0.005)   //learning step size, larger size will lead to loss vibration
      .setFeaturesCol("scaledFeatures")
      .setLabelCol("label")

    val pipelineStages = prePipelineModel.stages ++ Array(nnModel)

    _pipelineModel = new Pipeline().setStages(pipelineStages).fit(samples)
  }
}
Example 41
Source File: FeaturePropSpec.scala From spark-tda with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{Vector, Vectors, DenseVector} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.{SparkSession, DataFrame} import org.apache.spark.sql.types.{ StructField, IntegerType, DoubleType, BooleanType, StructType, StringType, ArrayType } import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalatest.PropSpec import com.holdenkarau.spark.testing.{ SharedSparkContext, DataframeGenerator, Column } abstract class FeaturePropSpec extends PropSpec with SharedSparkContext with DefaultReadWriteTest { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector] )) lazy val spark = SparkSession.builder().getOrCreate() def schema = StructType( List( StructField("integer", IntegerType), StructField("double", DoubleType), StructField("boolean", BooleanType), StructField("string", StringType) )) def integerGen = new Column("integer", Gen.choose(-100, 100)) def doubleGen = new Column("double", Gen.choose(-100.0, 100.0)) def stringGen = new Column("string", Gen.oneOf("A", "BC", "DEF", "GHIJ", "KLMNO")) def dataframeGen = DataframeGenerator.arbitraryDataFrameWithCustomFields( spark.sqlContext, schema)(integerGen, doubleGen, stringGen) def hasDistinctValues(df: DataFrame, columns: String*): Boolean = { columns.foldLeft(true) { (acc, col) => acc && df.select(col).distinct.count() > 1 } } }
Example 42
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 43
Source File: XgbConverters.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.xgboost.runtime import biz.k11i.xgboost.util.FVec import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import ml.combust.mleap.xgboost.runtime.struct.FVecFactory import ml.dmlc.xgboost4j.LabeledPoint import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} trait XgbConverters { implicit class VectorOps(vector: Vector) { def asXGB: DMatrix = { vector match { case SparseVector(_, indices, values) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat)))) case DenseVector(values) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat)))) } } def asXGBPredictor: FVec = { vector match { case sparseVector: SparseVector => FVecFactory.fromSparseVector(sparseVector) case denseVector: DenseVector => FVecFactory.fromDenseVector(denseVector) } } } implicit class DoubleTensorOps(tensor: Tensor[Double]) { def asXGB: DMatrix = { tensor match { case SparseTensor(indices, values, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat)))) case DenseTensor(_, _) => new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat)))) } } def asXGBPredictor: FVec = { tensor match { case sparseTensor: SparseTensor[Double] => FVecFactory.fromSparseTensor(sparseTensor) case denseTensor: DenseTensor[Double] => FVecFactory.fromDenseTensor(denseTensor) } } } } object XgbConverters extends XgbConverters
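A hedged sketch of the implicit conversions applied to a DenseVector; it assumes the xgboost4j and xgboost-predictor dependencies used by this module are on the classpath, and the printed values follow from the code above:

import org.apache.spark.ml.linalg.DenseVector
import ml.combust.mleap.xgboost.runtime.XgbConverters._

object XgbConvertersExample {
  def main(args: Array[String]): Unit = {
    val features = new DenseVector(Array(0.5, 0.0, 1.5))

    // Single-row DMatrix for the xgboost4j Booster API.
    val dmatrix = features.asXGB
    println(dmatrix.rowNum) // 1

    // FVec for the xgboost-predictor API; values are stored as floats.
    val fvec = features.asXGBPredictor
    println(fvec.fvalue(2)) // 1.5
  }
}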
Example 44
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
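A minimal sketch of slicing a dense input with the model above (indices and values are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.VectorSlicerModel

object VectorSlicerModelExample {
  def main(args: Array[String]): Unit = {
    // Keep positions 0 and 2 of a 3-dimensional input.
    val slicer = VectorSlicerModel(indices = Array(0, 2), inputSize = 3)
    println(slicer(Vectors.dense(10.0, 20.0, 30.0))) // [10.0,30.0]
  }
}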
Example 45
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
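A short sketch of the element-wise scaling on dense and sparse inputs (the scaling vector and inputs are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ElementwiseProductModel

object ElementwiseProductModelExample {
  def main(args: Array[String]): Unit = {
    val model = ElementwiseProductModel(Vectors.dense(2.0, 0.5, 1.0))

    println(model(Vectors.dense(1.0, 4.0, 3.0)))            // [2.0,2.0,3.0]
    println(model(Vectors.sparse(3, Array(1), Array(4.0)))) // (3,[1],[2.0])
  }
}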
Example 46
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
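A minimal sketch of the rescaling on a dense input; the maxAbs vector is illustrative, a zero entry is replaced by 1 before dividing, and results are clamped to [-1, 1]:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.MaxAbsScalerModel

object MaxAbsScalerModelExample {
  def main(args: Array[String]): Unit = {
    val model = MaxAbsScalerModel(maxAbs = Vectors.dense(4.0, 0.0, 2.0))

    // 2.0/4.0 = 0.5, 5.0/1.0 clamped to 1.0, -1.0/2.0 = -0.5
    println(model(Vectors.dense(2.0, 5.0, -1.0))) // [0.5,1.0,-0.5]
  }
}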
Example 47
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable

@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala")
case class ChiSqSelectorModel(filterIndices: Seq[Int],
                              inputSize: Int) extends Model {
  def apply(features: Vector): Vector = {
    features match {
      case SparseVector(size, indices, values) =>
        val newSize = filterIndices.length
        val newValues = mutable.ArrayBuilder.make[Double]
        val newIndices = mutable.ArrayBuilder.make[Int]
        var i = 0
        var j = 0
        var indicesIdx = 0
        var filterIndicesIdx = 0
        while (i < indices.length && j < filterIndices.length) {
          indicesIdx = indices(i)
          filterIndicesIdx = filterIndices(j)
          if (indicesIdx == filterIndicesIdx) {
            newIndices += j
            newValues += values(i)
            j += 1
            i += 1
          } else {
            if (indicesIdx > filterIndicesIdx) {
              j += 1
            } else {
              i += 1
            }
          }
        }
        // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
        Vectors.sparse(newSize, newIndices.result(), newValues.result())
      case DenseVector(_) =>
        val values = features.toArray
        Vectors.dense(filterIndices.map(i => values(i)).toArray)
      case other =>
        throw new UnsupportedOperationException(
          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get
}
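A small sketch of the selector on dense and sparse inputs (indices and values are illustrative):

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.ChiSqSelectorModel

object ChiSqSelectorModelExample {
  def main(args: Array[String]): Unit = {
    // Keep only features 1 and 3 of a 4-dimensional input.
    val selector = ChiSqSelectorModel(filterIndices = Seq(1, 3), inputSize = 4)

    println(selector(Vectors.dense(10.0, 20.0, 30.0, 40.0)))           // [20.0,40.0]
    println(selector(Vectors.sparse(4, Array(1, 2), Array(5.0, 7.0)))) // (2,[0],[5.0])
  }
}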
Example 48
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.VectorUtil._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}

  def apply(vector: Vector): Vector = {
    val scale = maxValue - minValue

    // 0 in sparse vector will probably be rescaled to non-zero
    val values = vector.copy.toArray
    val size = values.length
    var i = 0
    while (i < size) {
      if (!values(i).isNaN) {
        val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
        values(i) = raw * scale + minValue
      }
      i += 1
    }
    Vectors.dense(values)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get
}
Example 49
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
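A rough sketch with a two-word vocabulary and 2-dimensional embeddings (all values are illustrative); note that the default kernel divides the summed word vectors by the full sentence length, so unknown tokens dilute the average:

import ml.combust.mleap.core.feature.{WordToVectorKernel, WordToVectorModel}

object WordToVectorModelExample {
  def main(args: Array[String]): Unit = {
    // Embeddings are laid out contiguously in wordVectors: "spark" -> (1.0, 2.0), "mleap" -> (3.0, 4.0).
    val model = WordToVectorModel(
      wordIndex = Map("spark" -> 0, "mleap" -> 1),
      wordVectors = Array(1.0, 2.0, 3.0, 4.0),
      kernel = WordToVectorKernel.Default)

    // Sum of the two known embeddings (4.0, 6.0), scaled by 1 / sentence length (3).
    println(model(Seq("spark", "mleap", "unknown")))
  }
}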
Example 50
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

  def apply(features: Vector): Vector = {
    val norm = Vectors.norm(features, pNorm)

    if (norm != 0.0) {
      // For dense vector, we've to allocate new memory for new output vector.
      // However, for sparse vector, the `index` array will not be changed,
      // so we can re-use it to save memory.
      features match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) /= norm
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, ids, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) /= norm
            i += 1
          }
          Vectors.sparse(size, ids, values)
        case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass)
      }
    } else {
      // Since the norm is zero, return the input vector object itself.
      // Note that it's safe since we always assume that the data in RDD
      // should be immutable.
      features
    }
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get
}
Example 51
Source File: VectorIndexerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.util.NoSuchElementException import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala") case class VectorIndexerModel(numFeatures: Int, categoryMaps: Map[Int, Map[Double, Int]], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model { val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted val localVectorMap = categoryMaps val localNumFeatures = numFeatures val localHandleInvalid = handleInvalid def apply(features: Vector): Vector = predict(features) def predict(features: Vector): Vector = { assert(features.size == localNumFeatures, "VectorIndexerModel expected vector of length" + s" $numFeatures but found length ${features.size}") features match { case dv: DenseVector => var hasInvalid = false val tmpv = dv.copy localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) => try { tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv(featureIndex)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(featureIndex) = categoryMap.size case HandleInvalid.Skip => hasInvalid = true } } } if (hasInvalid) null else tmpv case sv: SparseVector => // We use the fact that categorical value 0 is always mapped to index 0. var hasInvalid = false val tmpv = sv.copy var catFeatureIdx = 0 // index into sortedCatFeatureIndices var k = 0 // index into non-zero elements of sparse vector while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) { val featureIndex = sortedCatFeatureIndices(catFeatureIdx) if (featureIndex < tmpv.indices(k)) { catFeatureIdx += 1 } else if (featureIndex > tmpv.indices(k)) { k += 1 } else { try { tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv.values(k)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(k) = localVectorMap(featureIndex).size case HandleInvalid.Skip => hasInvalid = true } } catFeatureIdx += 1 k += 1 } } if (hasInvalid) null else tmpv } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(localNumFeatures)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(localNumFeatures)).get }
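A minimal sketch of category re-indexing on dense inputs; it assumes HandleInvalid lives in the same ml.combust.mleap.core.feature package, as the unqualified references above suggest, and the maps and values are illustrative:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.{HandleInvalid, VectorIndexerModel}

object VectorIndexerModelExample {
  def main(args: Array[String]): Unit = {
    // Feature 1 is categorical with 10.0 -> 0 and 20.0 -> 1; feature 0 is passed through unchanged.
    val model = VectorIndexerModel(
      numFeatures = 2,
      categoryMaps = Map(1 -> Map(10.0 -> 0, 20.0 -> 1)),
      handleInvalid = HandleInvalid.Keep)

    println(model(Vectors.dense(3.5, 20.0))) // [3.5,1.0]
    println(model(Vectors.dense(3.5, 99.0))) // [3.5,2.0] -- unseen category mapped to categoryMap.size
  }
}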
Example 52
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

  def apply(vector: Vector): Vector = {
    if (mean.nonEmpty) {
      val shift = mean.get.toArray
      val values = vector match {
        // specially handle DenseVector because its toArray does not clone already
        case d: DenseVector => d.values.clone()
        case v: SparseVector => v.toArray
      }
      val size = values.length
      if (std.nonEmpty) {
        val stdDev = std.get
        var i = 0
        while (i < size) {
          values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0
          i += 1
        }
      } else {
        var i = 0
        while (i < size) {
          values(i) -= shift(i)
          i += 1
        }
      }
      Vectors.dense(values)
    } else if (std.nonEmpty) {
      val stdDev = std.get
      vector match {
        case DenseVector(vs) =>
          val values = vs.clone()
          val size = values.length
          var i = 0
          while (i < size) {
            values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0)
            i += 1
          }
          Vectors.dense(values)
        case SparseVector(size, indices, vs) =>
          val values = vs.clone()
          val nnz = values.length
          var i = 0
          while (i < nnz) {
            values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0)
            i += 1
          }
          Vectors.sparse(size, indices, values)
      }
    } else {
      throw new IllegalStateException("need to scale with mean and/or with stdev")
    }
  }

  override def inputSchema: StructType = {
    StructType("input" -> TensorType.Double(size)).get
  }

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get
}
Example 53
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
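A minimal sketch applying illustrative IDF weights to dense and sparse term-frequency vectors:

import org.apache.spark.ml.linalg.Vectors
import ml.combust.mleap.core.feature.IDFModel

object IDFModelExample {
  def main(args: Array[String]): Unit = {
    val model = IDFModel(idf = Vectors.dense(0.5, 2.0, 1.0))

    println(model(Vectors.dense(2.0, 1.0, 3.0)))                    // [1.0,2.0,3.0]
    println(model(Vectors.sparse(3, Array(0, 2), Array(2.0, 3.0)))) // (3,[0,2],[1.0,3.0])
  }
}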
Example 54
Source File: GaussianMixtureModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.clustering import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{ScalarType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.Utils._ import org.apache.spark.ml.linalg.{DenseVector, Vector, Vectors} import org.apache.spark.ml.stat.distribution.MultivariateGaussian object GaussianMixtureModel { @SparkCode(uri = "https://github.com/apache/spark/blob/branch-2.0/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala") def computeProbabilities(features: DenseVector, dists: Array[MultivariateGaussian], weights: Array[Double]): Array[Double] = { val p = weights.zip(dists).map { case (weight, dist) => EPSILON + weight * dist.pdf(features) } val pSum = p.sum var i = 0 while (i < weights.length) { p(i) /= pSum i += 1 } p } } case class GaussianMixtureModel(gaussians: Array[MultivariateGaussian], weights: Array[Double]) extends Model { val numClusters = gaussians.length val numFeatures: Int = weights.length def apply(features: Vector): Int = predict(features) def predict(features: Vector): Int = { predictionFromProbability(predictProbability(features)) } def predictWithProbability(features: Vector): (Int, Double) = { val probability = predictProbability(features) val index = probability.argmax (index, probability(index)) } def predictionFromProbability(probabilities: Vector): Int = { probabilities.argmax } def predictProbability(features: Vector): Vector = { val probs: Array[Double] = GaussianMixtureModel.computeProbabilities(features.toDense, gaussians, weights) Vectors.dense(probs) } override def inputSchema: StructType = StructType("features" -> TensorType.Double(numFeatures)).get override def outputSchema: StructType = StructType("prediction" -> ScalarType.Int.nonNullable, "probability" -> TensorType.Double(numClusters)).get }
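A rough sketch with two illustrative one-dimensional components; it assumes org.apache.spark.ml.stat.distribution.MultivariateGaussian can be constructed from a mean vector and a covariance matrix, as its use in the case class above suggests:

import org.apache.spark.ml.linalg.{Matrices, Vectors}
import org.apache.spark.ml.stat.distribution.MultivariateGaussian
import ml.combust.mleap.core.clustering.GaussianMixtureModel

object GaussianMixtureModelExample {
  def main(args: Array[String]): Unit = {
    // Two unit-variance Gaussians centred at 0.0 and 5.0, with equal weights.
    val gaussians = Array(
      new MultivariateGaussian(Vectors.dense(0.0), Matrices.dense(1, 1, Array(1.0))),
      new MultivariateGaussian(Vectors.dense(5.0), Matrices.dense(1, 1, Array(1.0))))
    val model = GaussianMixtureModel(gaussians, weights = Array(0.5, 0.5))

    println(model.predict(Vectors.dense(4.5)))                // 1
    println(model.predictWithProbability(Vectors.dense(4.5))) // (1, probability close to 1.0)
  }
}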