org.apache.spark.ml.linalg.SparseVector Scala Examples
The following examples show how to use org.apache.spark.ml.linalg.SparseVector.
Each example is taken from an open-source project; the source file, project name, and license are noted above the code.
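Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of constructing a SparseVector and inspecting it:

import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}

// A 5-dimensional vector with non-zero entries at indices 1 and 3.
val sv: SparseVector = new SparseVector(5, Array(1, 3), Array(2.0, 4.0))

// Vectors.sparse is the usual factory; it returns the general Vector type.
val v: Vector = Vectors.sparse(5, Seq((1, 2.0), (3, 4.0)))

sv.toDense        // DenseVector of [0.0, 2.0, 0.0, 4.0, 0.0]
sv.numNonzeros    // 2
sv(3)             // 4.0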
Example 1
Source File: OneHotEncoderSpec.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.ml

import com.microsoft.ml.spark.core.schema.DatasetExtensions._
import com.microsoft.ml.spark.core.test.base.TestBase
import org.apache.spark._
import org.apache.spark.ml.feature.OneHotEncoderEstimator
import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.DataFrame

class OneHotEncoderSpec extends TestBase {

  test("expand category indices") {
    val df = session.createDataFrame(Seq((0, 0.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 1.0), (5, 0.0)))
      .toDF("id", "categoryIndex")
    val encoded = new OneHotEncoderEstimator()
      .setInputCols(Array("categoryIndex")).setOutputCols(Array("categoryVec"))
      .fit(df).transform(df)
    val oneHotList = encoded.getSVCol("categoryVec")
    val trueList = List(new SparseVector(2, Array(0), Array(1.0)),
                        new SparseVector(2, Array(1), Array(1.0)),
                        new SparseVector(2, Array(0), Array(1.0)),
                        new SparseVector(2, Array(), Array()),
                        new SparseVector(2, Array(1), Array(1.0)),
                        new SparseVector(2, Array(0), Array(1.0)))
    assert(oneHotList === trueList)
  }

  test("support integer indices") {
    val df = session.createDataFrame(Seq((0, 0), (1, 1), (2, 0), (3, 2), (4, 1), (5, 0)))
      .toDF("id", "categoryIndex")
    val encoded = new OneHotEncoderEstimator()
      .setInputCols(Array("categoryIndex")).setOutputCols(Array("categoryVec"))
      .fit(df).transform(df)
    val oneHotList = encoded.getSVCol("categoryVec")
    val trueList = List(new SparseVector(2, Array(0), Array(1.0)),
                        new SparseVector(2, Array(1), Array(1.0)),
                        new SparseVector(2, Array(0), Array(1.0)),
                        new SparseVector(2, Array(), Array()),
                        new SparseVector(2, Array(1), Array(1.0)),
                        new SparseVector(2, Array(0), Array(1.0)))
    assert(oneHotList === trueList)
  }

  test("support not dropping the last feature") {
    val df = session.createDataFrame(Seq((0, 0.0), (1, 1.0), (2, 0.0), (3, 2.0), (4, 1.0), (5, 0.0)))
      .toDF("id", "categoryIndex")
    val encoded = new OneHotEncoderEstimator().setDropLast(false)
      .setInputCols(Array("categoryIndex")).setOutputCols(Array("categoryVec"))
      .fit(df).transform(df)
    val oneHotList = encoded.getSVCol("categoryVec")
    val trueList = List(new SparseVector(3, Array(0), Array(1.0)),
                        new SparseVector(3, Array(1), Array(1.0)),
                        new SparseVector(3, Array(0), Array(1.0)),
                        new SparseVector(3, Array(2), Array(1.0)),
                        new SparseVector(3, Array(1), Array(1.0)),
                        new SparseVector(3, Array(0), Array(1.0)))
    assert(oneHotList === trueList)
  }

  private def testOHE(data: DataFrame) = {
    assertSparkException[SparkException](
      new OneHotEncoderEstimator()
        .setInputCols(Array("categoryIndex")).setOutputCols(Array("encodedOutput")),
      data.toDF("id", "categoryIndex"))
  }

  test("raise an error when applied to a null array") {
    testOHE(session.createDataFrame(Seq((0, Some(0.0)), (1, Some(1.0)), (2, None))))
  }

  test("raise an error when it receives a strange float") {
    testOHE(session.createDataFrame(Seq((0, 0.0), (1, 1.0), (2, 0.4))))
    testOHE(session.createDataFrame(Seq((0, 0.0), (1, 1.0), (2, -1.0))))
  }
}
Example 2
Source File: LinalgUtils.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.linalg

import ml.combust.mleap.core.annotation.SparkCode
import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors}
import org.apache.spark.ml.linalg.mleap.{BLAS, VectorWithNorm}

    val precisionBound1 = 2.0 * EPSILON * sumSquaredNorm / (normDiff * normDiff + EPSILON)
    if (precisionBound1 < precision) {
      sqDist = sumSquaredNorm - 2.0 * BLAS.dot(v1, v2)
    } else if (v1.isInstanceOf[SparseVector] || v2.isInstanceOf[SparseVector]) {
      val dotValue = BLAS.dot(v1, v2)
      sqDist = math.max(sumSquaredNorm - 2.0 * dotValue, 0.0)
      val precisionBound2 = EPSILON * (sumSquaredNorm + 2.0 * math.abs(dotValue)) /
        (sqDist + EPSILON)
      if (precisionBound2 > precision) {
        sqDist = Vectors.sqdist(v1, v2)
      }
    } else {
      sqDist = Vectors.sqdist(v1, v2)
    }
    sqDist
  }

  def log1pExp(x: Double): Double = {
    if (x > 0) {
      x + math.log1p(math.exp(-x))
    } else {
      math.log1p(math.exp(x))
    }
  }
}
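The fragment above is truncated by the example extractor: the enclosing object, the method signature, and the definitions of v1, v2, sqDist, sumSquaredNorm, normDiff, precision, and EPSILON are not shown. As a rough, self-contained sketch of the same idea (assumed names and tolerances, not mleap's actual code), a precision-bounded squared distance can be written against the public Vectors API:

import org.apache.spark.ml.linalg.{Vector, Vectors}

// Sketch only: ||a - b||^2 = ||a||^2 + ||b||^2 - 2 * (a dot b) is cheap when the norms are
// precomputed, but it loses precision when the two norms are nearly equal, so fall back
// to the exact Vectors.sqdist in that case.
def sqDistSketch(v1: Vector, norm1: Double, v2: Vector, norm2: Double,
                 precision: Double = 1e-6): Double = {
  val epsilon = 2.2e-16                                  // assumed machine epsilon
  val sumSquaredNorm = norm1 * norm1 + norm2 * norm2
  val normDiff = norm1 - norm2
  val bound = 2.0 * epsilon * sumSquaredNorm / (normDiff * normDiff + epsilon)
  if (bound < precision) {
    val dot = v1.toArray.zip(v2.toArray).map { case (a, b) => a * b }.sum  // naive dot product
    math.max(sumSquaredNorm - 2.0 * dot, 0.0)
  } else {
    Vectors.sqdist(v1, v2)
  }
}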
Example 3
Source File: VectorUtil.scala From mleap with Apache License 2.0
package org.apache.spark.ml.linalg.mleap

import ml.combust.mleap.core.annotation.SparkCode
import org.apache.spark.ml.linalg
import org.apache.spark.ml.linalg.SparseVector

@SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib-local/src/main/scala/org/apache/spark/ml/linalg/Vector.scala")
object VectorUtil {
  implicit class VectorOps(vector: linalg.Vector) {
    def toBreeze: breeze.linalg.Vector[Double] = vector.asBreeze
  }

  implicit class SparseVectorOps(vector: SparseVector) {
    def slice(indices: Array[Int]): SparseVector = vector.slice(indices)
  }

  def fromBreeze(breezeVector: breeze.linalg.Vector[Double]): linalg.Vector =
    linalg.Vectors.fromBreeze(breezeVector)
}

object VectorWithNorm {
  def apply(vector: linalg.Vector): VectorWithNorm = {
    VectorWithNorm(vector, linalg.Vectors.norm(vector, 2.0))
  }
}

case class VectorWithNorm(vector: linalg.Vector, norm: Double)
Example 4
Source File: KMeansOp.scala From mleap with Apache License 2.0
package org.apache.spark.ml.bundle.ops.clustering import ml.combust.bundle.BundleContext import ml.combust.bundle.dsl._ import ml.combust.bundle.op.{OpModel, OpNode} import ml.combust.mleap.tensor.Tensor import org.apache.spark.ml.bundle.{ParamSpec, SimpleParamSpec, SimpleSparkOp, SparkBundleContext} import org.apache.spark.ml.clustering.KMeansModel import org.apache.spark.ml.linalg.{DenseVector, SparseVector} import org.apache.spark.mllib.clustering import org.apache.spark.mllib.linalg.Vectors class KMeansOp extends SimpleSparkOp[KMeansModel] { override val Model: OpModel[SparkBundleContext, KMeansModel] = new OpModel[SparkBundleContext, KMeansModel] { override val klazz: Class[KMeansModel] = classOf[KMeansModel] override def opName: String = Bundle.BuiltinOps.clustering.k_means override def store(model: Model, obj: KMeansModel) (implicit context: BundleContext[SparkBundleContext]): Model = { model.withValue("cluster_centers", Value.tensorList(obj.clusterCenters.map(cc => Tensor.denseVector(cc.toArray)))). withValue("num_features", Value.long(obj.clusterCenters.head.size)) } override def load(model: Model) (implicit context: BundleContext[SparkBundleContext]): KMeansModel = { val clusterCenters = model.value("cluster_centers"). getTensorList[Double].toArray. map(t => Vectors.dense(t.toArray)) val mllibModel = new clustering.KMeansModel(clusterCenters) new KMeansModel(uid = "", parentModel = mllibModel) } } override def sparkLoad(uid: String, shape: NodeShape, model: KMeansModel): KMeansModel = { val clusterCenters = model.clusterCenters.map { case DenseVector(values) => Vectors.dense(values) case SparseVector(size, indices, values) => Vectors.sparse(size, indices, values) } new KMeansModel(uid = uid, parentModel = new clustering.KMeansModel(clusterCenters)) } override def sparkInputs(obj: KMeansModel): Seq[ParamSpec] = { Seq("features" -> obj.featuresCol) } override def sparkOutputs(obj: KMeansModel): Seq[SimpleParamSpec] = { Seq("prediction" -> obj.predictionCol) } }
Example 5
Source File: ParallelPersonalizedPageRankSuite.scala From graphframes with Apache License 2.0
package org.graphframes.lib import com.github.zafarkhaja.semver.Version import org.apache.spark.ml.linalg.{SQLDataTypes, SparseVector} import org.apache.spark.sql.Row import org.apache.spark.sql.functions.col import org.apache.spark.sql.types.DataTypes import org.graphframes.examples.Graphs import org.graphframes.{GraphFrameTestSparkContext, SparkFunSuite, TestUtils} class ParallelPersonalizedPageRankSuite extends SparkFunSuite with GraphFrameTestSparkContext { val n = 100 test("Illegal function call argument setting") { val g = Graphs.star(n) val vertexIds: Array[Any] = Array(1L, 2L, 3L) // Not providing number of iterations intercept[IllegalArgumentException] { g.parallelPersonalizedPageRank.sourceIds(vertexIds).run() } // Not providing sourceIds intercept[IllegalArgumentException] { g.parallelPersonalizedPageRank.maxIter(15).run() } // Provided empty sourceIds intercept[IllegalArgumentException] { g.parallelPersonalizedPageRank.maxIter(15).sourceIds(Array()).run() } } test("Star example parallel personalized PageRank") { val g = Graphs.star(n) val resetProb = 0.15 val maxIter = 10 val vertexIds: Array[Any] = Array(1L, 2L, 3L) lazy val prc = g.parallelPersonalizedPageRank .maxIter(maxIter) .sourceIds(vertexIds) .resetProbability(resetProb) val pr = prc.run() TestUtils.testSchemaInvariants(g, pr) TestUtils.checkColumnType(pr.vertices.schema, "pageranks", SQLDataTypes.VectorType) TestUtils.checkColumnType(pr.edges.schema, "weight", DataTypes.DoubleType) } // In Spark <2.4, sourceIds must be smaller than Int.MaxValue, // which might not be the case for LONG_ID in graph.indexedVertices. if (Version.valueOf(org.apache.spark.SPARK_VERSION) .greaterThanOrEqualTo(Version.valueOf("2.4.0"))) { test("friends graph with parallel personalized PageRank") { val g = Graphs.friends val resetProb = 0.15 val maxIter = 10 val vertexIds: Array[Any] = Array("a") lazy val prc = g.parallelPersonalizedPageRank .maxIter(maxIter) .sourceIds(vertexIds) .resetProbability(resetProb) val pr = prc.run() val prInvalid = pr.vertices .select("pageranks") .collect() .filter { row: Row => vertexIds.size != row.getAs[SparseVector](0).size } assert(prInvalid.size === 0, s"found ${prInvalid.size} entries with invalid number of returned personalized pagerank vector") val gRank = pr.vertices .filter(col("id") === "g") .select("pageranks") .first().getAs[SparseVector](0) assert(gRank.numNonzeros === 0, s"User g (Gabby) doesn't connect with a. So its pagerank should be 0 but we got ${gRank.numNonzeros}.") } } }
Example 6
Source File: StreamingMLUtils.scala From spark-structured-streaming-ml with Apache License 2.0
package org.apache.spark.mllib

import scala.language.implicitConversions

import org.apache.spark.ml.linalg.{SparseVector, DenseVector, Vector}
import org.apache.spark.mllib.linalg.{Vector => OldVector, Vectors => OldVectors}
import org.apache.spark.mllib.util.MLUtils

object StreamingMLUtils {
  implicit def mlToMllibVector(v: Vector): OldVector = v match {
    case dv: DenseVector => OldVectors.dense(dv.toArray)
    case sv: SparseVector => OldVectors.sparse(sv.size, sv.indices, sv.values)
    case _ => throw new IllegalArgumentException
  }

  def fastSquaredDistance(x: Vector, xNorm: Double, y: Vector, yNorm: Double) = {
    MLUtils.fastSquaredDistance(x, xNorm, y, yNorm)
  }
}
Example 7
Source File: LibSVMRelationSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 8
Source File: VectorFeaturizer.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw.featurizer

import org.apache.spark.sql.Row
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable

  override def featurize(row: Row,
                         indices: mutable.ArrayBuilder[Int],
                         values: mutable.ArrayBuilder[Double]): Unit = {
    row.getAs[Vector](fieldIdx) match {
      case v: DenseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= 0 until v.size
        else
          indices ++= (0 until v.size).map { mask & _ }

        values ++= v.values
      case v: SparseVector =>
        // check if we need to hash
        if (v.size < mask + 1)
          indices ++= v.indices
        else
          indices ++= v.indices.map { mask & _ }

        values ++= v.values
    }
    ()
  }
}
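This snippet is also truncated: fieldIdx (the index of the input column) and mask come from the enclosing featurizer class, which is not shown. The mask acts as a cheap substitute for a modulo when folding arbitrary indices into a fixed-size, power-of-two feature space; for illustration only (values assumed, not mmlspark's code):

val numBits = 18
val mask = (1 << numBits) - 1   // 262143, i.e. 2^18 - 1
val folded = mask & 1000003     // any index is wrapped into the range [0, 2^18)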
Example 9
Source File: DatasetExtensions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.ml.linalg.{DenseVector, SparseVector}
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.types.StructType

import scala.collection.mutable

  def findUnusedColumnName(prefix: String)(columnNames: scala.collection.Set[String]): String = {
    var counter = 2
    var unusedColumnName = prefix
    while (columnNames.contains(unusedColumnName)) {
      unusedColumnName += "_" + counter
      counter += 1
    }
    unusedColumnName
  }

  def findUnusedColumnName(prefix: String, schema: StructType): String = {
    findUnusedColumnName(prefix)(schema.fieldNames.toSet)
  }

  def findUnusedColumnName(prefix: String, df: Dataset[_]): String = {
    findUnusedColumnName(prefix, df.schema)
  }
}
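This snippet is truncated as well: the enclosing object and the implicit class providing getSVCol, which Examples 1 and 11 call via encoded.getSVCol(...), are not shown. A hypothetical minimal version of that helper, written here purely for illustration (not mmlspark's actual code), could look like:

import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.Dataset

object DatasetExtensionsSketch {
  implicit class RichDataset(val ds: Dataset[_]) extends AnyVal {
    // Collect a SparseVector column into a List, mirroring df.getSVCol("categoryVec").
    def getSVCol(col: String): List[SparseVector] =
      ds.select(col).collect().map(_.getAs[SparseVector](0)).toList
  }
}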
Example 10
Source File: VerifyVowpalWabbitInteractions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.vw import com.microsoft.ml.spark.core.test.base.TestBase import com.microsoft.ml.spark.core.test.fuzzing.{TestObject, TransformerFuzzing} import org.apache.spark.ml.linalg.{SparseVector, Vector, Vectors} import org.apache.spark.ml.util.MLReadable class VerifyVowpalWabbitInteractions extends TestBase with TransformerFuzzing[VowpalWabbitInteractions] { case class Data(val v1: Vector, val v2: Vector, val v3: Vector) lazy val df = session.createDataFrame(Seq(Data( Vectors.dense(Array(1.0, 2.0, 3.0)), Vectors.sparse(8, Array(5), Array(4.0)), Vectors.sparse(11, Array(8, 9), Array(7.0, 8.0)) ))) private def featurizeUsing(interactions: VowpalWabbitInteractions) = interactions.transform(df).head().getAs[SparseVector]("features") private def verifyValues(actual: SparseVector, expected: Array[Double]) = { assert(actual.numNonzeros == expected.length) (actual.values.sorted zip expected.sorted).forall{ case (x,y) => x == y } } test("Verify VowpalWabbit Interactions 3-dense x 1-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v1", "v2")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array(4.0, 8, 12.0)) } test("Verify VowpalWabbit Interactions 1-sparse x 2-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v2", "v3")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array(28.0, 32.0)) } test("Verify VowpalWabbit Interactions 3-dense x 1-sparse x 2-sparse") { val interactions = new VowpalWabbitInteractions() .setInputCols(Array("v1", "v2", "v3")) .setOutputCol("features") val v = featurizeUsing(interactions) verifyValues(v, Array( 1.0 * 5 * 7, 1 * 5 * 8.0, 2.0 * 5 * 7, 2 * 5 * 8.0, 3.0 * 5 * 7, 3 * 5 * 8.0 )) } def testObjects(): Seq[TestObject[VowpalWabbitInteractions]] = List(new TestObject( new VowpalWabbitInteractions().setInputCols(Array("v1")).setOutputCol("out"), df)) override def reader: MLReadable[_] = VowpalWabbitInteractions }
Example 11
Source File: HashingTFSpec.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. See LICENSE in project root for information. package com.microsoft.ml.spark.core.ml import com.microsoft.ml.spark.core.schema.DatasetExtensions._ import com.microsoft.ml.spark.core.test.base.TestBase import org.apache.spark.ml.feature.{HashingTF, Tokenizer} import org.apache.spark.ml.linalg.SparseVector class HashingTFSpec extends TestBase { test("operation on tokenized strings") { val wordDataFrame = session.createDataFrame(Seq( (0, Array("Hi", "I", "can", "not", "foo", "foo")), (1, Array("I")), (2, Array("Logistic", "regression")), (3, Array("Log", "f", "reg")) )).toDF("label", "words") val hashDF = new HashingTF().setInputCol("words").setOutputCol("hashedTF").transform(wordDataFrame) val lines = hashDF.getSVCol("hashedTF") val trueLines = List( new SparseVector(262144, Array(36073, 51654, 113890, 139098, 242088), Array(1.0, 2.0, 1.0, 1.0, 1.0)), new SparseVector(262144, Array(113890), Array(1.0)), new SparseVector(262144, Array(13671, 142455), Array(1.0, 1.0)), new SparseVector(262144, Array(24152, 74466, 122984), Array(1.0, 1.0, 1.0)) ) assert(lines === trueLines) } test("support several values for number of features") { val featureSizes = List(1, 5, 100, 100000) val words = Array("Hi", "I", "can", "not", "foo", "bar", "foo", "afk") val wordDataFrame = session.createDataFrame(Seq((0, words))).toDF("label", "words") val fsResults = featureSizes.map { n => new HashingTF() .setNumFeatures(n) .setInputCol("words") .setOutputCol("hashedTF") .transform(wordDataFrame) .getSVCol("hashedTF")(0) } val trueResults = Array( new SparseVector(1, Array(0), Array(8.0)), new SparseVector(5, Array(0, 2, 3), Array(4.0, 2.0, 2.0)), new SparseVector(100, Array(0, 10, 18, 33, 62, 67, 80), Array(1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0)), new SparseVector(100000, Array(5833, 9467, 16680, 29018, 68900, 85762, 97510), Array(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0)) ) assert(fsResults === trueResults) } test("treat empty strings as another word") { val wordDataFrame = session.createDataFrame(Seq( (0, "hey you no way"), (1, ""))) .toDF("label", "sentence") val tokenized = new Tokenizer().setInputCol("sentence").setOutputCol("tokens").transform(wordDataFrame) val hashDF = new HashingTF().setInputCol("tokens").setOutputCol("HashedTF").transform(tokenized) val lines = hashDF.getSVCol("hashedTF") assert(lines(1) === new SparseVector(262144, Array(249180), Array(1.0))) } test("raise an error when applied to a null array") { val tokenDataFrame = session.createDataFrame(Seq( (0, Some(Array("Hi", "I", "can", "not", "foo"))), (1, None)) ).toDF("label", "tokens") assertSparkException[org.apache.spark.SparkException](new HashingTF().setInputCol("tokens"), tokenDataFrame) } test("raise an error when given strange values of n") { List(0, -1, -10).foreach { n => intercept[IllegalArgumentException] { new HashingTF().setNumFeatures(n) } } } }
Example 12
Source File: VectorConverters.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.util import breeze.linalg.{DenseVector => BDV, SparseVector => BSV, Vector => BV} import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor} import org.apache.spark.ml.linalg.{DenseMatrix, DenseVector, Matrices, Matrix, SparseMatrix, SparseVector, Vector, Vectors} import scala.language.implicitConversions trait VectorConverters { implicit def sparkVectorToMleapTensor(vector: Vector): Tensor[Double] = vector match { case vector: DenseVector => DenseTensor(vector.toArray, Seq(vector.size)) case vector: SparseVector => SparseTensor(indices = vector.indices.map(i => Seq(i)), values = vector.values, dimensions = Seq(vector.size)) } implicit def mleapTensorToSparkVector(tensor: Tensor[Double]): Vector = tensor match { case tensor: DenseTensor[_] => Vectors.dense(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => Vectors.sparse(tensor.dimensions.product, tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def sparkMatrixToMleapTensor(matrix: Matrix): Tensor[Double] = matrix match { case matrix: DenseMatrix => DenseTensor(matrix.toArray, Seq(matrix.numRows, matrix.numCols)) case matrix: SparseMatrix => val indices = matrix.rowIndices.zip(matrix.colPtrs).map { case (r, c) => Seq(r, c) }.toSeq SparseTensor(indices = indices, values = matrix.values, dimensions = Seq(matrix.numRows, matrix.numCols)) } implicit def mleapTensorToSparkMatrix(tensor: Tensor[Double]): Matrix = tensor match { case tensor: DenseTensor[_] => Matrices.dense(tensor.dimensions.head, tensor.dimensions(1), tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => val (rows, cols) = tensor.indices.map(v => (v.head, v(1))).unzip Matrices.sparse(tensor.dimensions.head, tensor.dimensions(1), cols.toArray, rows.toArray, tensor.values.asInstanceOf[Array[Double]]) } implicit def breezeVectorToMLeapTensor(vector: BV[Double]): Tensor[Double] = vector match { case vector : BDV[Double] => DenseTensor(vector.toArray, Seq(vector.size)) case vector : BSV[Double] => SparseTensor(vector.index.map(i => Seq(i)), vector.data, Seq(vector.values.size)) } implicit def mleapTensorToBreezeVector(tensor: Tensor[Double]): BV[Double] = tensor match { case tensor: DenseTensor[_] => new BDV(tensor.rawValues.asInstanceOf[Array[Double]]) case tensor: SparseTensor[_] => new BSV(tensor.indices.map(_.head).toArray, tensor.values.asInstanceOf[Array[Double]], tensor.dimensions.product) } } object VectorConverters extends VectorConverters
Example 13
Source File: LibSVMRelationSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
Example 14
Source File: RichVector.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.utils.spark

import breeze.linalg.{DenseVector => BreezeDenseVector, SparseVector => BreezeSparseVector, Vector => BreezeVector}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.collection.mutable.ArrayBuffer

  def combine(vectors: Seq[Vector]): Vector = {
    val indices = ArrayBuffer.empty[Int]
    val values = ArrayBuffer.empty[Double]

    val size = vectors.foldLeft(0)((size, vector) => {
      vector.foreachActive { case (i, v) =>
        if (v != 0.0) {
          indices += size + i
          values += v
        }
      }
      size + vector.size
    })
    Vectors.sparse(size, indices.toArray, values.toArray).compressed
  }

  implicit class RichSparseVector(val v: SparseVector) extends AnyVal {
    def updated(index: Int, indexVal: Int, value: Double): SparseVector = {
      require(v.indices(index) == indexVal,
        s"Invalid index: indices($index)==${v.indices(index)}, expected: $indexVal")
      v.values(index) = value
      v
    }
  }
}
Example 15
Source File: IDFTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.feature import com.salesforce.op._ import com.salesforce.op.features.types._ import com.salesforce.op.test.{TestFeatureBuilder, TestSparkContext} import com.salesforce.op.utils.spark.RichDataset._ import org.apache.spark.ml.feature.IDF import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.{Estimator, Transformer} import org.junit.runner.RunWith import org.scalatest.junit.JUnitRunner import org.scalatest.{Assertions, FlatSpec, Matchers} @RunWith(classOf[JUnitRunner]) class IDFTest extends FlatSpec with TestSparkContext { val data = Seq( Vectors.sparse(4, Array(1, 3), Array(1.0, 2.0)), Vectors.dense(0.0, 1.0, 2.0, 3.0), Vectors.sparse(4, Array(1), Array(1.0)) ) lazy val (ds, f1) = TestFeatureBuilder(data.map(_.toOPVector)) Spec[IDF] should "compute inverted document frequency" in { val idf = f1.idf() val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => math.log((data.length + 1.0) / (x + 1.0)) }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } it should "compute inverted document frequency when minDocFreq is 1" in { val idf = f1.idf(minDocFreq = 1) val model = idf.originStage.asInstanceOf[Estimator[_]].fit(ds) val transformedData = model.asInstanceOf[Transformer].transform(ds) val results = transformedData.select(idf.name).collect(idf) idf.name shouldBe idf.originStage.getOutputFeatureName val expectedIdf = Vectors.dense(Array(0, 3, 1, 2).map { x => if (x > 0) math.log((data.length + 1.0) / (x + 1.0)) else 0 }) val expected = scaleDataWithIDF(data, expectedIdf) for { (res, exp) <- results.zip(expected) (x, y) <- res.value.toArray.zip(exp.toArray) } assert(math.abs(x - y) <= 1e-5) } private def scaleDataWithIDF(dataSet: Seq[Vector], model: Vector): Seq[Vector] = { dataSet.map { case data: DenseVector => val res = data.toArray.zip(model.toArray).map { case (x, y) => x * y } Vectors.dense(res) case data: SparseVector => val res = data.indices.zip(data.values).map { case (id, value) => (id, value * model(id)) } Vectors.sparse(data.size, res) } } }
Example 16
Source File: NormalizerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.util.{DefaultReadWriteTest, MLTest} import org.apache.spark.ml.util.TestingUtils._ import org.apache.spark.sql.{DataFrame, Row} class NormalizerSuite extends MLTest with DefaultReadWriteTest { import testImplicits._ @transient var data: Array[Vector] = _ @transient var l1Normalized: Array[Vector] = _ @transient var l2Normalized: Array[Vector] = _ override def beforeAll(): Unit = { super.beforeAll() data = Array( Vectors.sparse(3, Seq((0, -2.0), (1, 2.3))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.6, -1.1, -3.0), Vectors.sparse(3, Seq((1, 0.91), (2, 3.2))), Vectors.sparse(3, Seq((0, 5.7), (1, 0.72), (2, 2.7))), Vectors.sparse(3, Seq()) ) l1Normalized = Array( Vectors.sparse(3, Seq((0, -0.465116279), (1, 0.53488372))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.12765957, -0.23404255, -0.63829787), Vectors.sparse(3, Seq((1, 0.22141119), (2, 0.7785888))), Vectors.dense(0.625, 0.07894737, 0.29605263), Vectors.sparse(3, Seq()) ) l2Normalized = Array( Vectors.sparse(3, Seq((0, -0.65617871), (1, 0.75460552))), Vectors.dense(0.0, 0.0, 0.0), Vectors.dense(0.184549876, -0.3383414, -0.922749378), Vectors.sparse(3, Seq((1, 0.27352993), (2, 0.96186349))), Vectors.dense(0.897906166, 0.113419726, 0.42532397), Vectors.sparse(3, Seq()) ) } def assertTypeOfVector(lhs: Vector, rhs: Vector): Unit = { assert((lhs, rhs) match { case (v1: DenseVector, v2: DenseVector) => true case (v1: SparseVector, v2: SparseVector) => true case _ => false }, "The vector type should be preserved after normalization.") } def assertValues(lhs: Vector, rhs: Vector): Unit = { assert(lhs ~== rhs absTol 1E-5, "The vector value is not correct after normalization.") } test("Normalization with default parameter") { val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized") val dataFrame: DataFrame = data.zip(l2Normalized).seq.toDF("features", "expected") testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: Vector, normalized: Vector, expected: Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("Normalization with setter") { val dataFrame: DataFrame = data.zip(l1Normalized).seq.toDF("features", "expected") val normalizer = new Normalizer().setInputCol("features").setOutputCol("normalized").setP(1) testTransformer[(Vector, Vector)](dataFrame, normalizer, "features", "normalized", "expected") { case Row(features: Vector, normalized: Vector, expected: Vector) => assertTypeOfVector(normalized, features) assertValues(normalized, expected) } } test("read/write") { val t = new Normalizer() .setInputCol("myInputCol") .setOutputCol("myOutputCol") .setP(3.0) testDefaultReadWrite(t) } }
Example 17
Source File: LibSVMResponseRowDeserializer.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers

import org.apache.spark.ml.linalg.{SparseVector, SQLDataTypes}
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import com.amazonaws.services.sagemaker.sparksdk.transformation.{ContentTypes, ResponseRowDeserializer}

  override val accepts: String = ContentTypes.TEXT_LIBSVM

  private def parseLibSVMRow(record: String): Row = {
    val items = record.split(' ')
    val label = items.head.toDouble
    val (indices, values) = items.tail.filter(_.nonEmpty).map { item =>
      val entry = item.split(':')
      val index = entry(0).toInt - 1
      val value = entry(1).toDouble
      (index, value)
    }.unzip
    Row(label, new SparseVector(dim, indices.toArray, values.toArray))
  }

  override val schema: StructType = StructType(
    Array(
      StructField(labelColumnName, DoubleType, nullable = false),
      StructField(featuresColumnName, SQLDataTypes.VectorType, nullable = false)))
}
Example 18
Source File: LibSVMResponseRowDeserializerTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers import org.scalatest._ import org.scalatest.mock.MockitoSugar import scala.collection.mutable.ListBuffer import org.apache.spark.ml.linalg.SparseVector import org.apache.spark.sql._ class LibSVMResponseRowDeserializerTests extends FlatSpec with Matchers with MockitoSugar { "LibSVMResponseRowDeserializer" should "deserialize a single record with a two features" in { val rrd = new LibSVMResponseRowDeserializer(3) val responseIterator = rrd.deserializeResponse(createLibSVMRecord(1, Array(1, 2), Array(1.0, 2.0)).getBytes) assert(responseIterator.next == Row(1, new SparseVector(3, Array(1, 2), Array(1.0, 2.0)))) } it should "deserialize a single record with no values" in { val rrd = new LibSVMResponseRowDeserializer(0) val responseIterator = rrd.deserializeResponse( createLibSVMRecord(1, Seq[Int]().toArray, Seq[Double]().toArray).getBytes) assert(responseIterator.next == Row(1, new SparseVector(0, Seq[Int]().toArray, Seq[Double]().toArray))) } it should "deserialize multiple records with multiple features" in { val dim = 100 val rrd = new LibSVMResponseRowDeserializer(dim) val sb = new StringBuilder val rows = new ListBuffer[Row] for (i <- Range(0, dim)) { val label = i.asInstanceOf[Double] val indices = Range (0, i) val values = Range(0, i) map( a => (a - 10) * a) map (a => a.asInstanceOf[Double]) sb ++= createLibSVMRecord(label, indices.toArray, values.toArray) rows += Row(label, new SparseVector(dim, indices.toArray, values.toArray)) sb ++= "\n" } assert(List() ++ rrd.deserializeResponse(sb.mkString.getBytes) == rows.toList) } it should "throw on invalid dimension" in { intercept[IllegalArgumentException] { new LibSVMResponseRowDeserializer(-1) } } it should "fail on invalid label" in { val rrd = new LibSVMResponseRowDeserializer(3) intercept[RuntimeException] { val responseIterator = rrd.deserializeResponse("XXX 1:1".getBytes) } } it should "fail on invalid value" in { val rrd = new LibSVMResponseRowDeserializer(3) intercept[RuntimeException] { rrd.deserializeResponse("1.0 1:Elizabeth".getBytes) } } it should "fail on invalid index" in { val rrd = new LibSVMResponseRowDeserializer(3) intercept[RuntimeException] { rrd.deserializeResponse("1.0 BLAH:1.3421".getBytes) } } it should "fail on missing index" in { val rrd = new LibSVMResponseRowDeserializer(3) intercept[RuntimeException] { rrd.deserializeResponse("1.0 :1.3421".getBytes) } } it should "fail on missing value" in { val rrd = new LibSVMResponseRowDeserializer(3) intercept[RuntimeException] { rrd.deserializeResponse("1.0 1:".getBytes) } } it should "fail on index out of bounds" in { val rrd = new LibSVMResponseRowDeserializer(2) intercept[RuntimeException] { rrd.deserializeResponse("1.0 3:2.0".getBytes) } } private def createLibSVMRecord(label : Double, indices : Array[Int], values : Array[Double]) : String = { val sb = new StringBuilder(label.toString) val x = indices zip values for((index, value) <- x) { sb ++= s" ${index + 1}:$value" } sb.mkString } }
Example 19
Source File: ProtobufRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.ml.linalg.SQLDataTypes.VectorType import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.protobuf.ProtobufConverter class ProtobufRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val labelColumnName = "label" val featuresColumnName = "features" val schema = StructType(Array(StructField(labelColumnName, DoubleType), StructField( featuresColumnName, VectorType))) it should "serialize a dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new ProtobufRequestRowSerializer(Some(schema)) val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty) val serialized = rrs.serializeRow(row) val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized) val protobufFromRecordIO = protobufIterator.next assert(!protobufIterator.hasNext) assert(protobuf.equals(protobufFromRecordIO)) } it should "serialize a sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new ProtobufRequestRowSerializer(Some(schema)) val protobuf = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Option.empty) val serialized = rrs.serializeRow(row) val protobufIterator = ProtobufConverter.recordIOByteArrayToProtobufs(serialized) val protobufFromRecordIO = protobufIterator.next assert(!protobufIterator.hasNext) assert(protobuf.equals(protobufFromRecordIO)) } it should "fail to set schema on invalid features name" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) intercept[IllegalArgumentException] { val rrs = new ProtobufRequestRowSerializer(Some(schema), featuresColumnName = "doesNotExist") } } it should "fail on invalid types" in { val schemaWithInvalidFeaturesType = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", StringType, nullable = false))) intercept[RuntimeException] { new ProtobufRequestRowSerializer(Some(schemaWithInvalidFeaturesType)) } } it should "validate correct schema" in { val validSchema = StructType(Array( StructField("features", SQLDataTypes.VectorType, nullable = false))) new ProtobufRequestRowSerializer(Some(validSchema)) } }
Example 20
Source File: UnlabeledLibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{StringType, StructField, StructType} class UnlabeledLibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val schema = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false))) "UnlabeledLibSVMRequestRowSerializer" should "serialize sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema) val rrs = new UnlabeledLibSVMRequestRowSerializer() val serialized = new String(rrs.serializeRow(row)) assert ("0.0 1:-100.0 11:100.1\n" == serialized) } it should "serialize dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema) val rrs = new UnlabeledLibSVMRequestRowSerializer() val serialized = new String(rrs.serializeRow(row)) assert("0.0 1:10.0 2:-100.0 3:2.0\n" == serialized) } it should "fail on invalid features column name" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new UnlabeledLibSVMRequestRowSerializer(featuresColumnName = "mangoes are not features") intercept[RuntimeException] { rrs.serializeRow(row) } } it should "fail on invalid features type" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, "FEATURESSSSSZ!1!").toArray, schema = schema) val rrs = new UnlabeledLibSVMRequestRowSerializer() intercept[RuntimeException] { rrs.serializeRow(row) } } it should "validate correct schema" in { val validSchema = StructType(Array( StructField("features", SQLDataTypes.VectorType, nullable = false))) val rrs = new UnlabeledLibSVMRequestRowSerializer(Some(validSchema)) } it should "fail to validate incorrect schema" in { val invalidSchema = StructType(Array( StructField("features", StringType, nullable = false))) intercept[IllegalArgumentException] { new UnlabeledLibSVMRequestRowSerializer(Some(invalidSchema)) } } }
Example 21
Source File: LibSVMRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0
package com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest._ import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.transformation.deserializers.LibSVMResponseRowDeserializer class LibSVMRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val schema = new LibSVMResponseRowDeserializer(10).schema "LibSVMRequestRowSerializer" should "serialize sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new LibSVMRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) assert ("1.0 1:-100.0 11:100.1\n" == serialized) } it should "serialize dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(1.0, vec).toArray, schema = schema) val rrs = new LibSVMRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized) } it should "ignore other columns" in { val schemaWithExtraColumns = StructType(Array( StructField("name", StringType, nullable = false), StructField("label", DoubleType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false), StructField("favorite activity", StringType, nullable = false))) val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq("Elizabeth", 1.0, vec, "Crying").toArray, schema = schemaWithExtraColumns) val rrs = new LibSVMRequestRowSerializer(Some(schemaWithExtraColumns)) val serialized = new String(rrs.serializeRow(row)) assert("1.0 1:10.0 2:-100.0 3:2.0\n" == serialized) } it should "fail on invalid features column name" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schema), featuresColumnName = "i do not exist dear sir!") } } it should "fail on invalid label column name" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schema), labelColumnName = "Sir! I must protest! I do not exist!") } } it should "fail on invalid types" in { val schemaWithInvalidLabelType = StructType(Array( StructField("label", StringType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false))) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schemaWithInvalidLabelType)) } val schemaWithInvalidFeaturesType = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", StringType, nullable = false))) intercept[RuntimeException] { new LibSVMRequestRowSerializer(Some(schemaWithInvalidFeaturesType)) } } it should "validate correct schema" in { val validSchema = StructType(Array( StructField("label", DoubleType, nullable = false), StructField("features", SQLDataTypes.VectorType, nullable = false))) new LibSVMRequestRowSerializer(Some(validSchema)) } }
Example 22
Source File: UnlabeledCSVRequestRowSerializerTests.scala From sagemaker-spark with Apache License 2.0
package unit.com.amazonaws.services.sagemaker.sparksdk.transformation.serializers import org.scalatest.{FlatSpec, Matchers} import org.scalatest.mock.MockitoSugar import org.apache.spark.ml.linalg.{DenseVector, SparseVector, SQLDataTypes} import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema import org.apache.spark.sql.types.{StructField, StructType} import com.amazonaws.services.sagemaker.sparksdk.transformation.serializers.UnlabeledCSVRequestRowSerializer class UnlabeledCSVRequestRowSerializerTests extends FlatSpec with Matchers with MockitoSugar { val schema: StructType = StructType(Array(StructField("features", SQLDataTypes.VectorType, nullable = false))) it should "serialize sparse vector" in { val vec = new SparseVector(100, Seq[Int](0, 10).toArray, Seq[Double](-100.0, 100.1).toArray) val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema) val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) val sparseString = "-100.0," + "0.0," * 9 + "100.1," + "0.0," * 88 + "0.0\n" assert (sparseString == serialized) } it should "serialize dense vector" in { val vec = new DenseVector(Seq(10.0, -100.0, 2.0).toArray) val row = new GenericRowWithSchema(values = Seq(vec).toArray, schema = schema) val rrs = new UnlabeledCSVRequestRowSerializer(Some(schema)) val serialized = new String(rrs.serializeRow(row)) assert("10.0,-100.0,2.0\n" == serialized) } }
Example 23
Source File: MinMaxScalerModel.scala From mleap with Apache License 2.0
package ml.combust.mleap.core.feature

import ml.combust.mleap.core.Model
import ml.combust.mleap.core.annotation.SparkCode
import ml.combust.mleap.core.types.{StructType, TensorType}
import org.apache.spark.ml.linalg.mleap.VectorUtil._
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

import scala.math.{max, min}

  def apply(vector: Vector): Vector = {
    val scale = maxValue - minValue

    // 0 in sparse vector will probably be rescaled to non-zero
    val values = vector.copy.toArray
    val size = values.length
    var i = 0
    while (i < size) {
      if (!values(i).isNaN) {
        val raw = if (originalRange(i) != 0) (values(i) - minArray(i)) / originalRange(i) else 0.5
        values(i) = raw * scale + minValue
      }
      i += 1
    }
    Vectors.dense(values)
  }

  override def inputSchema: StructType = StructType("input" -> TensorType.Double(originalRange.length)).get

  override def outputSchema: StructType = StructType("output" -> TensorType.Double(originalRange.length)).get
}
Example 24
Source File: FeatureCrossOp.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature.cross

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

import scala.collection.mutable.ArrayBuffer

object FeatureCrossOp {

  def flatCartesian(vector: Vector): Vector = {
    val curDim = vector.size
    vector match {
      case sv: SparseVector =>
        val indices = new ArrayBuffer[Int]()
        val values = new ArrayBuffer[Double]()
        sv.indices.foreach { idx1 =>
          sv.indices.foreach { idx2 =>
            indices += curDim * idx1 + idx2
            values += sv(idx1) * sv(idx2)
          }
        }
        val sorted = indices.zip(values).sortBy(_._1)
        val sortedIndices = sorted.map(_._1)
        val sortedValues = sorted.map(_._2)
        new SparseVector(sv.size * sv.size, sortedIndices.toArray, sortedValues.toArray)
      case dv: DenseVector =>
        val values: Array[Double] = new Array(dv.size * dv.size)
        (0 until dv.size).foreach { idx1 =>
          (0 until dv.size).foreach { idx2 =>
            values(dv.size * idx1 + idx2) = dv(idx1) * dv(idx2)
          }
        }
        new DenseVector(values)
    }
  }

  def main(args: Array[String]): Unit = {
    val v = new DenseVector(Array(1, 2, 3))
    val cv = flatCartesian(v)
    println(cv.toDense.values.mkString(","))
  }
}
Example 25
Source File: FeatureUtils.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.feature

import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.{Dataset, Row}

import scala.language.postfixOps

object FeatureUtils {

  def maxDim(dataset: Dataset[Row], col: String = "features"): Int = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val dim = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.last
          case dv: DenseVector => dv.size
        }
      }.max
      Iterator(dim)
    }.max + 1
  }

  def countNonZero(dataset: Dataset[Row], col: String = "features"): Array[Int] = {
    dataset.select(col).rdd.mapPartitions { rows: Iterator[Row] =>
      val mergeIndices = rows.map { case Row(v: Vector) =>
        v match {
          case sv: SparseVector => sv.indices.toList
        }
      }.reduce(_ union _ distinct)
      Iterator(mergeIndices)
    }.reduce((a, b) => (a union b).distinct).toArray
  }
}
Example 26
Source File: DataUtils.scala From automl with Apache License 2.0
package com.tencent.angel.spark.automl.utils

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{DataFrame, SparkSession}

object DataUtils {

  def parse(ss: SparkSession,
            schema: StructType,
            X: Array[Vector],
            Y: Array[Double]): DataFrame = {
    require(X.size == Y.size,
      "The size of configurations should be equal to the size of rewards.")
    ss.createDataFrame(Y.zip(X)).toDF("label", "features")
  }

  def parse(ss: SparkSession, schema: StructType, X: Vector): DataFrame = {
    parse(ss, schema, Array(X), Array(0))
  }

  def toBreeze(values: Array[Double]): BDV[Double] = {
    new BDV[Double](values)
  }

  def toBreeze(vector: Vector): BDV[Double] = vector match {
    case sv: SparseVector => new BDV[Double](vector.toDense.values)
    case dv: DenseVector => new BDV[Double](dv.values)
  }

  def toBreeze(X: Array[Vector]): BDM[Double] = {
    val mat = BDM.zeros[Double](X.size, X(0).size)
    for (i <- 0 until X.size) {
      for (j <- 0 until X(0).size) {
        mat(i, j) = X(i)(j)
      }
    }
    mat
  }
}
Example 27
Source File: KNNPropSpec.scala From spark-tda with Apache License 2.0
package org.apache.spark.ml.util.knn import scala.reflect.ClassTag import org.scalacheck.{Arbitrary, Gen} import org.scalacheck.Arbitrary.arbitrary import org.scalacheck.Gen.{choose, oneOf} import org.scalatest.PropSpec import org.apache.spark.ml.linalg.{ CosineDistance, EuclideanDistance, ManhattanDistance, JaccardDistance, HammingDistance } import org.apache.spark.ml.linalg.{Vector, SparseVector, DenseVector, Vectors} import com.holdenkarau.spark.testing.SharedSparkContext abstract class KNNPropSpec extends PropSpec with SharedSparkContext { implicit def arbitraryDenseVector: Arbitrary[DenseVector] = Arbitrary { for (arr <- arbitrary[Array[Double]]) yield new DenseVector(arr) } implicit def arbitrarySparseVector: Arbitrary[SparseVector] = Arbitrary { for (vec <- arbitrary[DenseVector]) yield vec.toSparse } implicit def arbitraryVector: Arbitrary[Vector] = Arbitrary( Gen.frequency( 1 -> arbitrary[DenseVector], 1 -> arbitrary[SparseVector] )) private def arraysOfNM[T: ClassTag](numRows: Int, numCols: Int, gen: Gen[T]): Gen[Array[Array[T]]] = Gen.listOfN(numRows * numCols, gen).map { square => square.toArray.grouped(numCols).toArray } private def vectorsOfNM(numRows: Int, numCols: Int, gen: Gen[Double]): Gen[Array[DenseVector]] = for { arrays <- arraysOfNM(numRows, numCols, gen) } yield arrays.map(arr => new DenseVector(arr)) val treeGen = for { measure <- oneOf(CosineDistance, EuclideanDistance, ManhattanDistance, HammingDistance, JaccardDistance) numVectors <- choose(1, 100) vectors <- vectorsOfNM(numVectors, 2, choose(-10.0, 10.0)) } yield vectors .scanLeft(Seq[Vector]())(_ :+ _) .tail .map( vs => VPTree(vs.map(v => VectorEntry(0L, v)).toIndexedSeq, measure, 10, 10, 10)) }
Example 28
Source File: XgbConverters.scala From mleap with Apache License 2.0
package ml.combust.mleap.xgboost.runtime

import biz.k11i.xgboost.util.FVec
import ml.combust.mleap.tensor.{DenseTensor, SparseTensor, Tensor}
import ml.combust.mleap.xgboost.runtime.struct.FVecFactory
import ml.dmlc.xgboost4j.LabeledPoint
import ml.dmlc.xgboost4j.scala.DMatrix
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector}

trait XgbConverters {
  implicit class VectorOps(vector: Vector) {
    def asXGB: DMatrix = {
      vector match {
        case SparseVector(_, indices, values) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, indices, values.map(_.toFloat))))
        case DenseVector(values) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, null, values.map(_.toFloat))))
      }
    }

    def asXGBPredictor: FVec = {
      vector match {
        case sparseVector: SparseVector => FVecFactory.fromSparseVector(sparseVector)
        case denseVector: DenseVector => FVecFactory.fromDenseVector(denseVector)
      }
    }
  }

  implicit class DoubleTensorOps(tensor: Tensor[Double]) {
    def asXGB: DMatrix = {
      tensor match {
        case SparseTensor(indices, values, _) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, indices.map(_.head).toArray, values.map(_.toFloat))))
        case DenseTensor(_, _) =>
          new DMatrix(Iterator(new LabeledPoint(0.0f, null, tensor.toDense.rawValues.map(_.toFloat))))
      }
    }

    def asXGBPredictor: FVec = {
      tensor match {
        case sparseTensor: SparseTensor[Double] => FVecFactory.fromSparseTensor(sparseTensor)
        case denseTensor: DenseTensor[Double] => FVecFactory.fromDenseTensor(denseTensor)
      }
    }
  }
}

object XgbConverters extends XgbConverters
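A hypothetical usage of these implicit conversions (a sketch only, assuming the imports above resolve on the classpath) is turning a single ml SparseVector into a one-row DMatrix for scoring:

import ml.combust.mleap.xgboost.runtime.XgbConverters._
import org.apache.spark.ml.linalg.Vectors

val singleRow = Vectors.sparse(4, Array(1, 3), Array(0.5, 2.0)).asXGB   // one-row DMatrix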
Example 29
Source File: CachedDatasetUtils.scala From mleap with Apache License 2.0
package ml.combust.mleap.xgboost.runtime.testing import ml.combust.mleap.core.types.TensorType import ml.combust.mleap.core.util.VectorConverters import ml.combust.mleap.runtime.frame.{ArrayRow, DefaultLeapFrame, Row} import ml.dmlc.xgboost4j.scala.DMatrix import org.apache.spark.ml.linalg.SparseVector import org.apache.spark.sql.SparkSession import org.apache.spark.sql.mleap.TypeConverters trait CachedDatasetUtils { private final val TrainDataFilePath = "datasources/agaricus.train" private final val TrainDataMultinomialFilePath = "datasources/iris.scale.txt" val binomialDataset: DMatrix = new DMatrix(this.getClass.getClassLoader.getResource(TrainDataFilePath).getFile) val multinomialDataset: DMatrix = new DMatrix(this.getClass.getClassLoader.getResource(TrainDataMultinomialFilePath).getFile) lazy val leapFrameLibSVMtrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataFilePath) lazy val leapFrameIrisTrain: DefaultLeapFrame = leapFrameFromLibSVMFile(TrainDataMultinomialFilePath) def numFeatures(dataset: DefaultLeapFrame): Int = dataset.schema.getField("features").get.dataType.asInstanceOf[TensorType].dimensions.get.head private def leapFrameFromLibSVMFile(filePath: String): DefaultLeapFrame = { // Use Spark utils to load libsvm from disk val spark = SparkSession.builder() .master("local[2]") .appName(s"${this.getClass.getName}") .getOrCreate() // This is the dataset used by dmlc-XGBoost https://github.com/dmlc/xgboost/blob/master/demo/data/agaricus.txt.train val dataFrame = spark.read.format("libsvm") .load(this.getClass.getClassLoader.getResource(filePath).getFile) val mleapSchema = Option(TypeConverters.sparkSchemaToMleapSchema(dataFrame)) val mleapMatrix: Array[ArrayRow] = dataFrame.collect().map { r => ArrayRow( Seq( r.get(0), VectorConverters.sparkVectorToMleapTensor(r.get(1).asInstanceOf[SparseVector]) )) } DefaultLeapFrame(mleapSchema.get, mleapMatrix) } def toDenseFeaturesLeapFrame(sparseLeapFrame: DefaultLeapFrame): DefaultLeapFrame = { val featureColumnIndex = sparseLeapFrame.schema.indexOf("features").get val labelColumnIndex = sparseLeapFrame.schema.indexOf("label").get val denseDataset: Seq[Row] = sparseLeapFrame.dataset.map{ row => { val array = new Array[Any](2) array(labelColumnIndex) = row.getDouble(labelColumnIndex) array(featureColumnIndex) = row.getTensor[Double](featureColumnIndex).toDense ArrayRow(array) } } DefaultLeapFrame(sparseLeapFrame.schema, denseDataset) } }
Example 30
Source File: VectorSlicerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.VectorUtil._ @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/VectorSlicer.scala") case class VectorSlicerModel(indices: Array[Int], namedIndices: Array[(String, Int)] = Array(), inputSize: Int) extends Model { val allIndices: Array[Int] = indices.union(namedIndices.map(_._2)) def apply(features: Vector): Vector = features match { case features: DenseVector => Vectors.dense(allIndices.map(features.apply)) case features: SparseVector => features.slice(allIndices) } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(allIndices.length)).get }
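A minimal sketch of the model applied to a sparse vector (indices and values are illustrative):

import org.apache.spark.ml.linalg.Vectors

val slicer = VectorSlicerModel(indices = Array(1, 3), inputSize = 5)
val sliced = slicer(Vectors.sparse(5, Array(1, 4), Array(2.0, 7.0)))
// sliced keeps only positions 1 and 3 of the input, so it is a size-2 sparse vector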
Example 31
Source File: ElementwiseProductModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructField, StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/ElementwiseProduct.scala") case class ElementwiseProductModel(scalingVec: Vector) extends Model { def apply(vector: Vector): Vector = { vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { vs(i) *= scalingVec(i) i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { vs(i) *= scalingVec(indices(i)) i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType(StructField("input" -> TensorType.Double(scalingVec.size))).get override def outputSchema: StructType = StructType(StructField("output" -> TensorType.Double(scalingVec.size))).get }
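A minimal sketch with toy scaling weights; sparsity is preserved because inactive entries stay zero:

import org.apache.spark.ml.linalg.Vectors

val product = ElementwiseProductModel(Vectors.dense(0.5, 2.0, 1.0))
val scaled = product(Vectors.sparse(3, Array(0, 2), Array(4.0, 3.0)))
// active entries become 4.0 * 0.5 = 2.0 and 3.0 * 1.0 = 3.0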
Example 32
Source File: MaxAbsScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.math.{max, min} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/MaxAbsScaler.scala") case class MaxAbsScalerModel(maxAbs: Vector) extends Model { def apply(vector: Vector): Vector = { val maxAbsUnzero = Vectors.dense(maxAbs.toArray.map(x => if (x == 0) 1 else x)) vector match { case DenseVector(values) => val vs = values.clone() val size = vs.length var i = 0 while (i < size) { if (!values(i).isNaN) { val rescale = max(-1.0, min(1.0, values(i) / maxAbsUnzero(i))) vs(i) = rescale } i += 1 } Vectors.dense(vs) case SparseVector(size, indices, values) => val vs = values.clone() val nnz = vs.length var i = 0 while (i < nnz) { val raw = max(-1.0, min(1.0, values(i) / maxAbsUnzero(indices(i)))) vs(i) = raw i += 1 } Vectors.sparse(size, indices, vs) } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(maxAbs.size)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(maxAbs.size)).get }
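A minimal sketch with illustrative per-feature maximum absolute values:

import org.apache.spark.ml.linalg.Vectors

val scaler = MaxAbsScalerModel(Vectors.dense(4.0, 10.0, 2.0))
val scaled = scaler(Vectors.sparse(3, Array(0, 1), Array(2.0, -5.0)))
// active values are divided by their max-abs (2.0 / 4.0 and -5.0 / 10.0) and clipped to [-1, 1]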
Example 33
Source File: ChiSqSelectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import scala.collection.mutable @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala") case class ChiSqSelectorModel(filterIndices: Seq[Int], inputSize: Int) extends Model { def apply(features: Vector): Vector = { features match { case SparseVector(size, indices, values) => val newSize = filterIndices.length val newValues = mutable.ArrayBuilder.make[Double] val newIndices = mutable.ArrayBuilder.make[Int] var i = 0 var j = 0 var indicesIdx = 0 var filterIndicesIdx = 0 while (i < indices.length && j < filterIndices.length) { indicesIdx = indices(i) filterIndicesIdx = filterIndices(j) if (indicesIdx == filterIndicesIdx) { newIndices += j newValues += values(i) j += 1 i += 1 } else { if (indicesIdx > filterIndicesIdx) { j += 1 } else { i += 1 } } } // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size) Vectors.sparse(newSize, newIndices.result(), newValues.result()) case DenseVector(_) => val values = features.toArray Vectors.dense(filterIndices.map(i => values(i)).toArray) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(filterIndices.length)).get }
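A minimal sketch keeping two selected feature indices (values are illustrative):

import org.apache.spark.ml.linalg.Vectors

val selector = ChiSqSelectorModel(filterIndices = Seq(0, 3), inputSize = 5)
val reduced = selector(Vectors.sparse(5, Array(0, 2, 3), Array(1.0, 9.0, 4.0)))
// reduced is a size-2 sparse vector holding the values found at original indices 0 and 3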
Example 34
Source File: LibSVMRelationSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.source.libsvm import java.io.File import java.nio.charset.StandardCharsets import com.google.common.io.Files import org.apache.spark.{SparkException, SparkFunSuite} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.mllib.util.MLlibTestSparkContext import org.apache.spark.sql.{Row, SaveMode} import org.apache.spark.util.Utils class LibSVMRelationSuite extends SparkFunSuite with MLlibTestSparkContext { // Path for dataset var path: String = _ override def beforeAll(): Unit = { super.beforeAll() val lines = """ |1 1:1.0 3:2.0 5:3.0 |0 |0 2:4.0 4:5.0 6:6.0 """.stripMargin val dir = Utils.createDirectory(tempDir.getCanonicalPath, "data") val file = new File(dir, "part-00000") Files.write(lines, file, StandardCharsets.UTF_8) path = dir.toURI.toString } override def afterAll(): Unit = { try { Utils.deleteRecursively(new File(path)) } finally { super.afterAll() } } test("select as sparse vector") { val df = spark.read.format("libsvm").load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("select as dense vector") { val df = spark.read.format("libsvm").options(Map("vectorType" -> "dense")) .load(path) assert(df.columns(0) == "label") assert(df.columns(1) == "features") assert(df.count() == 3) val row1 = df.first() assert(row1.getDouble(0) == 1.0) val v = row1.getAs[DenseVector](1) assert(v == Vectors.dense(1.0, 0.0, 2.0, 0.0, 3.0, 0.0)) } test("select a vector with specifying the longer dimension") { val df = spark.read.option("numFeatures", "100").format("libsvm") .load(path) val row1 = df.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(100, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data and read it again") { val df = spark.read.format("libsvm").load(path) val tempDir2 = new File(tempDir, "read_write_test") val writepath = tempDir2.toURI.toString // TODO: Remove requirement to coalesce by supporting multiple reads. df.coalesce(1).write.format("libsvm").mode(SaveMode.Overwrite).save(writepath) val df2 = spark.read.format("libsvm").load(writepath) val row1 = df2.first() val v = row1.getAs[SparseVector](1) assert(v == Vectors.sparse(6, Seq((0, 1.0), (2, 2.0), (4, 3.0)))) } test("write libsvm data failed due to invalid schema") { val df = spark.read.format("text").load(path) intercept[SparkException] { df.write.format("libsvm").save(path + "_2") } } test("select features from libsvm relation") { val df = spark.read.format("libsvm").load(path) df.select("features").rdd.map { case Row(d: Vector) => d }.first df.select("features").collect } }
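A minimal standalone sketch of the read pattern the suite exercises; the file path is illustrative:

import org.apache.spark.ml.linalg.SparseVector
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[2]").appName("libsvm-read").getOrCreate()
val df = spark.read.format("libsvm").load("/tmp/sample.libsvm")   // hypothetical path
val firstFeatures = df.first().getAs[SparseVector]("features")    // libsvm loads features as sparse by default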
Example 35
Source File: WordToVectorModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.types.{BasicType, ListType, StructType, TensorType} import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} sealed trait WordToVectorKernel { def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector def name: String } object WordToVectorKernel { private val lookup: Map[String, WordToVectorKernel] = Seq(Default, Sqrt).map { k => (k.name, k) }.toMap def forName(name: String): WordToVectorKernel = lookup(name) case object Default extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } BLAS.scal(1.0 / sentenceSize, sum) sum } override def name: String = "default" } case object Sqrt extends WordToVectorKernel { override def apply(size: Int, sentenceSize: Int, vectors: Iterator[Vector]): Vector = { val sum = Vectors.zeros(size) for (v <- vectors) { BLAS.axpy(1.0, v, sum) } val values = sum match { case sum: DenseVector => sum.values case sum: SparseVector => sum.values } var i = 0 val s = values.length val sqrt = Math.sqrt(BLAS.dot(sum, sum)) while (i < s) { values(i) /= sqrt i += 1 } sum } override def name: String = "sqrt" } } case class WordToVectorModel(wordIndex: Map[String, Int], wordVectors: Array[Double], kernel: WordToVectorKernel = WordToVectorKernel.Default) extends Model { val numWords: Int = wordIndex.size val vectorSize: Int = wordVectors.length / numWords val vectors: Map[String, Vector] = { wordIndex.map { case (word, ind) => (word, wordVectors.slice(vectorSize * ind, vectorSize * ind + vectorSize)) } }.mapValues(Vectors.dense).map(identity) def apply(sentence: Seq[String]): Vector = { if (sentence.isEmpty) { Vectors.sparse(vectorSize, Array.empty[Int], Array.empty[Double]) } else { val vs = sentence.iterator.map(vectors.get). filter(_.isDefined). map(_.get) kernel(vectorSize, sentence.size, vs) } } override def inputSchema: StructType = StructType("input" -> ListType(BasicType.String)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(vectorSize)).get }
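A minimal sketch with a toy two-word vocabulary of 3-dimensional vectors:

val w2v = WordToVectorModel(
  wordIndex = Map("spark" -> 0, "mleap" -> 1),
  wordVectors = Array(1.0, 0.0, 2.0,    // vector for "spark"
                      0.0, 3.0, 1.0))   // vector for "mleap"
val sentence = w2v(Seq("spark", "mleap", "unknown"))   // unknown words are dropped from the sum, which is divided by the sentence length
val empty = w2v(Seq.empty)                             // an empty sentence yields an all-zero sparse vector of size 3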
Example 36
Source File: NormalizerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(features: Vector): Vector = { val norm = Vectors.norm(features, pNorm) if (norm != 0.0) { // For dense vector, we've to allocate new memory for new output vector. // However, for sparse vector, the `index` array will not be changed, // so we can re-use it to save memory. features match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while (i < size) { values(i) /= norm i += 1 } Vectors.dense(values) case SparseVector(size, ids, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) /= norm i += 1 } Vectors.sparse(size, ids, values) case v => throw new IllegalArgumentException("Do not support vector type " + v.getClass) } } else { // Since the norm is zero, return the input vector object itself. // Note that it's safe since we always assume that the data in RDD // should be immutable. features } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(inputSize)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(inputSize)).get }
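A minimal sketch; the case class header is truncated above, so the constructor taking the p-norm and input size is an assumption:

import org.apache.spark.ml.linalg.Vectors

val normalizer = NormalizerModel(pNorm = 2.0, inputSize = 3)   // assumed constructor
val unit = normalizer(Vectors.sparse(3, Array(0, 2), Array(3.0, 4.0)))
// the L2 norm is 5.0, so the active values become 0.6 and 0.8; the index array is reused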
Example 37
Source File: VectorIndexerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import java.util.NoSuchElementException import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.4.5/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala") case class VectorIndexerModel(numFeatures: Int, categoryMaps: Map[Int, Map[Double, Int]], handleInvalid: HandleInvalid = HandleInvalid.Error) extends Model { val sortedCatFeatureIndices = categoryMaps.keys.toArray.sorted val localVectorMap = categoryMaps val localNumFeatures = numFeatures val localHandleInvalid = handleInvalid def apply(features: Vector): Vector = predict(features) def predict(features: Vector): Vector = { assert(features.size == localNumFeatures, "VectorIndexerModel expected vector of length" + s" $numFeatures but found length ${features.size}") features match { case dv: DenseVector => var hasInvalid = false val tmpv = dv.copy localVectorMap.foreach { case (featureIndex: Int, categoryMap: Map[Double, Int]) => try { tmpv.values(featureIndex) = categoryMap(tmpv(featureIndex)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv(featureIndex)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(featureIndex) = categoryMap.size case HandleInvalid.Skip => hasInvalid = true } } } if (hasInvalid) null else tmpv case sv: SparseVector => // We use the fact that categorical value 0 is always mapped to index 0. var hasInvalid = false val tmpv = sv.copy var catFeatureIdx = 0 // index into sortedCatFeatureIndices var k = 0 // index into non-zero elements of sparse vector while (catFeatureIdx < sortedCatFeatureIndices.length && k < tmpv.indices.length) { val featureIndex = sortedCatFeatureIndices(catFeatureIdx) if (featureIndex < tmpv.indices(k)) { catFeatureIdx += 1 } else if (featureIndex > tmpv.indices(k)) { k += 1 } else { try { tmpv.values(k) = localVectorMap(featureIndex)(tmpv.values(k)) } catch { case _: NoSuchElementException => localHandleInvalid match { case HandleInvalid.Error => throw new IllegalArgumentException(s"VectorIndexer encountered invalid value " + s"${tmpv.values(k)} on feature index $featureIndex. To handle " + s"or skip invalid value, try setting VectorIndexer.handleInvalid.") case HandleInvalid.Keep => tmpv.values(k) = localVectorMap(featureIndex).size case HandleInvalid.Skip => hasInvalid = true } } catFeatureIdx += 1 k += 1 } } if (hasInvalid) null else tmpv } } override def inputSchema: StructType = StructType("input" -> TensorType.Double(localNumFeatures)).get override def outputSchema: StructType = StructType("output" -> TensorType.Double(localNumFeatures)).get }
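A minimal sketch with one categorical feature (index 1) and two known category values:

import org.apache.spark.ml.linalg.Vectors

val indexer = VectorIndexerModel(
  numFeatures = 3,
  categoryMaps = Map(1 -> Map(0.0 -> 0, 5.0 -> 1)))
val indexed = indexer(Vectors.sparse(3, Array(1), Array(5.0)))
// the active value 5.0 at feature 1 is replaced by its category index 1.0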
Example 38
Source File: StandardScalerModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def apply(vector: Vector): Vector = { if (mean.nonEmpty) { val shift = mean.get.toArray val values = vector match { // specially handle DenseVector because its toArray does not clone already case d: DenseVector => d.values.clone() case v: SparseVector => v.toArray } val size = values.length if (std.nonEmpty) { val stdDev = std.get var i = 0 while (i < size) { values(i) = if (stdDev(i) != 0.0) (values(i) - shift(i)) * (1.0 / stdDev(i)) else 0.0 i += 1 } } else { var i = 0 while (i < size) { values(i) -= shift(i) i += 1 } } Vectors.dense(values) } else if (std.nonEmpty) { val stdDev = std.get vector match { case DenseVector(vs) => val values = vs.clone() val size = values.length var i = 0 while(i < size) { values(i) *= (if (stdDev(i) != 0.0) 1.0 / stdDev(i) else 0.0) i += 1 } Vectors.dense(values) case SparseVector(size, indices, vs) => val values = vs.clone() val nnz = values.length var i = 0 while (i < nnz) { values(i) *= (if (stdDev(indices(i)) != 0.0) 1.0 / stdDev(indices(i)) else 0.0) i += 1 } Vectors.sparse(size, indices, values) } } else { throw new IllegalStateException("need to scale with mean and/or with stdev") } } override def inputSchema: StructType = { StructType("input" -> TensorType.Double(size)).get } override def outputSchema: StructType = StructType("output" -> TensorType.Double(size)).get }
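A minimal sketch; the case class header is truncated above, so the constructor taking optional std and mean vectors is an assumption. Scaling by std only keeps the vector sparse:

import org.apache.spark.ml.linalg.Vectors

val scaler = StandardScalerModel(std = Some(Vectors.dense(2.0, 4.0)), mean = None)   // assumed constructor
val scaled = scaler(Vectors.sparse(2, Array(1), Array(8.0)))
// the active value 8.0 is multiplied by 1.0 / 4.0; inactive positions stay zero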
Example 39
Source File: IDFModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.feature import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.types.{StructType, TensorType} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} @SparkCode(uri = "https://github.com/apache/spark/blob/v2.0.0/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala") case class IDFModel(idf: Vector) extends Model { def apply(v: Vector): Vector = { val n = v.size v match { case SparseVector(size, indices, values) => val nnz = indices.length val newValues = new Array[Double](nnz) var k = 0 while (k < nnz) { newValues(k) = values(k) * idf(indices(k)) k += 1 } Vectors.sparse(n, indices, newValues) case DenseVector(values) => val newValues = new Array[Double](n) var j = 0 while (j < n) { newValues(j) = values(j) * idf(j) j += 1 } Vectors.dense(newValues) case other => throw new UnsupportedOperationException( s"Only sparse and dense vectors are supported but got ${other.getClass}.") } } override def inputSchema: StructType = StructType("input" -> TensorType.Double()).get override def outputSchema: StructType = StructType("output" -> TensorType.Double()).get }
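A minimal sketch applying toy IDF weights to a sparse term-frequency vector:

import org.apache.spark.ml.linalg.Vectors

val idfModel = IDFModel(Vectors.dense(0.5, 1.0, 2.0))
val weighted = idfModel(Vectors.sparse(3, Array(0, 2), Array(4.0, 3.0)))
// weighted == Vectors.sparse(3, Array(0, 2), Array(2.0, 6.0))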
Example 40
Source File: DecisionTreeClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.{DecisionTree, Node} import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} case class DecisionTreeClassifierModel(override val rootNode: Node, numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with DecisionTree with Serializable { override def predictRaw(features: Vector): Vector = { rootNode.predictImpl(features).impurities } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in DecisionTreeClassifierModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
Example 41
Source File: GBTClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.regression.DecisionTreeRegressionModel import ml.combust.mleap.core.tree.TreeEnsemble import ml.combust.mleap.core.tree.loss.LogLoss import org.apache.spark.ml.linalg.mleap.BLAS import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} def margin(features: Vector): Double = { val treePredictions = Vectors.dense(trees.map(_.predict(features)).toArray) BLAS.dot(treePredictions, treeWeightsVector) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => dv.values(0) = loss.computeProbability(dv.values(0)) dv.values(1) = 1.0 - dv.values(0) dv case sv: SparseVector => throw new RuntimeException("GBTClassificationModel encountered SparseVector") } } override def predictRaw(features: Vector): Vector = { val prediction: Double = margin(features) Vectors.dense(Array(-prediction, prediction)) } }
Example 42
Source File: NaiveBayesModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.Model import ml.combust.mleap.core.annotation.SparkCode import ml.combust.mleap.core.classification.NaiveBayesModel.{Bernoulli, ModelType, Multinomial} import org.apache.spark.ml.linalg.mleap.{BLAS, Matrices} import org.apache.spark.ml.linalg.{DenseVector, Matrix, SparseVector, Vector} @SparkCode(uri = "https://github.com/apache/spark/blob/master/mllib/src/main/scala/org/apache/spark/ml/classification/NaiveBayes.scala") case class NaiveBayesModel(numFeatures: Int, numClasses: Int, pi: Vector, theta: Matrix, modelType: NaiveBayesModel.ModelType, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with Model { private def multinomialCalculation(raw: Vector) = { val prob = theta.multiply(raw) BLAS.axpy(1.0, pi, prob) prob } private def bernoulliCalculation(raw: Vector) = { val negTheta = Matrices.map(theta, value => math.log(1.0 - math.exp(value))) val ones = new DenseVector(Array.fill(theta.numCols) {1.0}) val thetaMinusNegTheta = Matrices.map(theta, value => value - math.log(1.0 - math.exp(value))) val negThetaSum = negTheta.multiply(ones) raw.foreachActive((_, value) => require(value == 0.0 || value == 1.0, s"Bernoulli naive Bayes requires 0 or 1 feature values but found $raw.") ) val prob = thetaMinusNegTheta.multiply(raw) BLAS.axpy(1.0, pi, prob) BLAS.axpy(1.0, negThetaSum, prob) prob } override def predictRaw(raw: Vector): Vector = { modelType match { case Multinomial => multinomialCalculation(raw) case Bernoulli => bernoulliCalculation(raw) } } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => var i = 0 val size = dv.size val maxLog = dv.values.max while (i < size) { dv.values(i) = math.exp(dv.values(i) - maxLog) i += 1 } ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in NaiveBayesModel:" + " raw2probabilityInPlace encountered SparseVector") } } }
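A minimal sketch of a two-class multinomial model over three features; the log-probabilities are toy values:

import org.apache.spark.ml.linalg.{Matrices, Vectors}

val nb = NaiveBayesModel(
  numFeatures = 3,
  numClasses = 2,
  pi = Vectors.dense(math.log(0.6), math.log(0.4)),
  theta = Matrices.dense(2, 3, Array(-1.0, -0.5, -2.0, -2.5, -1.5, -1.2)),   // column-major 2x3 log-likelihoods
  modelType = NaiveBayesModel.Multinomial)
val raw = nb.predictRaw(Vectors.sparse(3, Array(0, 2), Array(2.0, 1.0)))
val probs = nb.rawToProbabilityInPlace(raw)   // normalizes the dense raw scores in place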
Example 43
Source File: SupportVectorMachineModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} import org.apache.spark.ml.linalg.mleap.BLAS case class SupportVectorMachineModel(coefficients: Vector, intercept: Double, override val thresholds: Option[Array[Double]] = Some(SupportVectorMachineModel.defaultThresholds)) extends ProbabilisticClassificationModel with Serializable { private def margin(features: Vector): Double = BLAS.dot(coefficients, features) + intercept override val numClasses: Int = 2 override val numFeatures: Int = coefficients.size override def predictRaw(features: Vector): Vector = { val m = margin(features) Vectors.dense(Array(-m, m)) } override def rawToProbabilityInPlace(raw: Vector): Vector = raw }
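A minimal sketch scoring a sparse feature vector with toy coefficients:

import org.apache.spark.ml.linalg.Vectors

val svm = SupportVectorMachineModel(coefficients = Vectors.dense(0.5, -1.0, 2.0), intercept = 0.1)
val raw = svm.predictRaw(Vectors.sparse(3, Array(0, 2), Array(2.0, 1.5)))
// raw is Vectors.dense(-margin, margin) with margin = 0.5 * 2.0 + 2.0 * 1.5 + 0.1 = 4.1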
Example 44
Source File: RandomForestClassifierModel.scala From mleap with Apache License 2.0 | 5 votes |
package ml.combust.mleap.core.classification import ml.combust.mleap.core.tree.TreeEnsemble import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors} case class RandomForestClassifierModel(override val trees: Seq[DecisionTreeClassifierModel], override val treeWeights: Seq[Double], numFeatures: Int, override val numClasses: Int, override val thresholds: Option[Array[Double]] = None) extends ProbabilisticClassificationModel with TreeEnsemble with Serializable { override def predictRaw(raw: Vector): Vector = { val votes = Array.fill[Double](numClasses)(0.0) trees.view.foreach { tree => val classCounts: Array[Double] = tree.rootNode.predictImpl(raw).impurities.toArray val total = classCounts.sum if (total != 0) { var i = 0 while (i < numClasses) { votes(i) += classCounts(i) / total i += 1 } } } Vectors.dense(votes) } override def rawToProbabilityInPlace(raw: Vector): Vector = { raw match { case dv: DenseVector => ProbabilisticClassificationModel.normalizeToProbabilitiesInPlace(dv) dv case sv: SparseVector => throw new RuntimeException("Unexpected error in RandomForestClassificationModel:" + " raw2probabilityInPlace encountered SparseVector") } } }